/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/inet_hashtables.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
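/*
 * Background note: an ICMP error quotes the offending IP header plus at
 * least the first 8 bytes of its payload (RFC 792). For TCP those 8 bytes
 * hold the source port, destination port and sequence number, which is
 * all that tcp_v4_err() below needs for its checks.
 */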
/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000.
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
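/*
 * For example, using the usual procfs spelling of this sysctl
 * (illustrative values):
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */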
static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
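/*
 * In short: a socket already bound to this port conflicts with sk unless
 * the two are bound to different devices, or both carry SO_REUSEADDR and
 * the existing one is not listening, or their local addresses are
 * distinct and neither is a wildcard.
 */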
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct inet_hashinfo *hashinfo,
		      struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&hashinfo->portalloc_lock);
		if (hashinfo->port_rover < low)
			rover = low;
		else
			rover = hashinfo->port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		hashinfo->port_rover = rover;
		spin_unlock(&hashinfo->portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (unlikely(remaining <= 0))
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80 Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only the timestamp cache is held not per host,
			   but per port pair, and the TW bucket is used
			   as state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tcptw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tcptw->tw_ts_recent_stamp > 1))) {
				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
				if (tp->write_seq == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
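				/*
				 * Note: the new write_seq starts 65535 + 2
				 * bytes past the old incarnation's
				 * tw_snd_nxt, i.e. beyond any unscaled window
				 * the peer may still hold open, so segments
				 * of the two incarnations cannot be confused;
				 * a zero result is avoided because
				 * tcp_v4_connect() treats write_seq == 0 as
				 * "pick a fresh initial sequence number".
				 */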
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}
/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;
ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
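/*
 * A minimal user-space sketch (illustrative, not kernel code) of the
 * candidate walk performed by the port-selection loop above: starting
 * from a per-connection offset, i = 1..range visits every port in
 * [low, low + range) exactly once.
 */
#if 0	/* example only, never compiled */
static unsigned int candidate_port(unsigned int low, unsigned int range,
				   unsigned int offset, unsigned int i)
{
	/* (i + offset) cycles through all residues mod range */
	return low + (i + offset) % range;
}
#endif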
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it when
		 * trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;
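	/* The 536 above is the classical default MSS: the 576-byte
	 * minimum reassembly buffer of RFC 1122 minus 20 bytes of IP
	 * header and 20 bytes of TCP header.
	 */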
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;
failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
static inline int inet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
				 const u32 rnd, const u16 synq_hsize)
{
	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
}
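/*
 * The syn table size is a power of two, so the mask with
 * (synq_hsize - 1) above is a cheap modulo that keeps the jhash
 * result inside the table.
 */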
struct request_sock *inet_csk_search_req(const struct sock *sk,
					 struct request_sock ***prevp,
					 const __u16 rport, const __u32 raddr,
					 const __u32 laddr)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
						    lopt->nr_table_entries)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
				     lopt->hash_rnd, lopt->nr_table_entries);

	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
	inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
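/*
 * Worked example (illustrative numbers): if the cached path MTU was 1500
 * and an ICMP_FRAG_NEEDED arrives reporting mtu = 1400, update_pmtu()
 * shrinks the cached value, tcp_sync_mss() re-clamps the MSS to fit a
 * 1400-byte datagram, and tcp_simple_retransmit() resends the queued
 * segments at the smaller size instead of waiting for the retransmit
 * timer.
 */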
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put((struct inet_timewait_sock *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}
	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}
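	/* I.e. the RST acknowledges exactly the sequence space the
	 * offending segment consumed: its payload length (skb->len minus
	 * the header, th->doff << 2) plus one each for SYN and FIN.
	 */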
	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
struct dst_entry *inet_csk_route_req(struct sock *sk,
				     const struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options *opt = inet_rsk(req)->opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  ireq->rmt_addr),
					.saddr = ireq->loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = sk->sk_protocol,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = ireq->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
struct request_sock_ops tcp_request_sock_ops = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitation; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}
	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers,
		 * which contain information interesting only to Windows
		 * users) do not send their timestamp in SYN. It is an easy
		 * case: we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);

	TCP_ECN_create_request(req, skb->h.th);
	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered at
			 * the moment of the synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id	      = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
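/*
 * Summary of the three cases above: with CHECKSUM_HW the device has
 * already summed the packet, so only the pseudo-header needs checking
 * here; packets of at most 76 bytes are cheap enough to verify fully in
 * software right away; anything larger stores the pseudo-header sum in
 * skb->csum so the checksum can be completed during the later copy.
 */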
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
					   skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							ntohs(th->dest),
							inet_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct inet_timewait_sock *)sk);
			inet_twsk_put((struct inet_timewait_sock *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family	     = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port	     = inet->dport;
}
/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections before they
 * enter synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	inet_sk_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
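/*
 * tcp_v4_init_sock() below points tp->af_specific at this table, so the
 * protocol-independent TCP code reaches all IPv4-specific operations
 * through it.
 */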
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean the prequeue; it really must be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
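/*
 * The dump format below is the familiar /proc/net/tcp layout; a
 * listening-socket line looks like (illustrative values only):
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  0  0 12345 1 c0de0000 ...
 *
 * Addresses and ports are printed in hex ("0100007F:0016" is
 * 127.0.0.1:22 on a little-endian machine); "0A" is TCP_LISTEN.
 */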
static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		icsk->icsk_probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);