/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov		which allows both IPv4 and IPv6
 *						sockets to bind a single port at
 *						the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock     = RW_LOCK_UNLOCKED,
        .lhash_users    = ATOMIC_INIT(0),
        .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
        .portalloc_lock = SPIN_LOCK_UNLOCKED,
        .port_rover     = 1024 - 1,
};
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
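/* Illustrative usage: this array backs the net.ipv4.ip_local_port_range
 * sysctl, so a high-usage system would typically widen the range with e.g.
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */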
static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
        const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
        struct sock *sk2;
        struct hlist_node *node;
        int reuse = sk->sk_reuse;

        sk_for_each_bound(sk2, node, &tb->owners) {
                if (sk != sk2 &&
                    !tcp_v6_ipv6only(sk2) &&
                    (!sk->sk_bound_dev_if ||
                     !sk2->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
                        if (!reuse || !sk2->sk_reuse ||
                            sk2->sk_state == TCP_LISTEN) {
                                const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
                                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
                                    sk2_rcv_saddr == sk_rcv_saddr)
                                        break;
                        }
                }
        }
        return node != NULL;
}
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        struct inet_bind_hashbucket *head;
        struct hlist_node *node;
        struct inet_bind_bucket *tb;
        int ret;

        local_bh_disable();
        if (!snum) {
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int remaining = (high - low) + 1;
                int rover;

                spin_lock(&tcp_hashinfo.portalloc_lock);
                if (tcp_hashinfo.port_rover < low)
                        rover = low;
                else
                        rover = tcp_hashinfo.port_rover;
                do {
                        rover++;
                        if (rover > high)
                                rover = low;
                        head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
                        spin_lock(&head->lock);
                        inet_bind_bucket_for_each(tb, node, &head->chain)
                                if (tb->port == rover)
                                        goto next;
                        break;
                next:
                        spin_unlock(&head->lock);
                } while (--remaining > 0);
                tcp_hashinfo.port_rover = rover;
                spin_unlock(&tcp_hashinfo.portalloc_lock);

                /* Exhausted local port range during search? It is not
                 * possible for us to be holding one of the bind hash
                 * locks if this test triggers, because if 'remaining'
                 * drops to zero, we broke out of the do/while loop at
                 * the top level, not from the 'break;' statement.
                 */
                ret = 1;
                if (unlikely(remaining <= 0))
                        goto fail;

                /* OK, here is the one we will use. HEAD is
                 * non-NULL and we hold its mutex.
                 */
                snum = rover;
        } else {
                head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
                spin_lock(&head->lock);
                inet_bind_bucket_for_each(tb, node, &head->chain)
                        if (tb->port == snum)
                                goto tb_found;
        }
        tb = NULL;
        goto tb_not_found;
tb_found:
        if (!hlist_empty(&tb->owners)) {
                if (sk->sk_reuse > 1)
                        goto success;
                if (tb->fastreuse > 0 &&
                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
                        goto success;
                } else {
                        ret = 1;
                        if (tcp_bind_conflict(sk, tb))
                                goto fail_unlock;
                }
        }
tb_not_found:
        ret = 1;
        if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
                goto fail_unlock;
        if (hlist_empty(&tb->owners)) {
                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
                        tb->fastreuse = 1;
                else
                        tb->fastreuse = 0;
        } else if (tb->fastreuse &&
                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
                tb->fastreuse = 0;
success:
        if (!inet_sk(sk)->bind_hash)
                inet_bind_hash(sk, tb, snum);
        BUG_TRAP(inet_sk(sk)->bind_hash == tb);
        ret = 0;

fail_unlock:
        spin_unlock(&head->lock);
fail:
        local_bh_enable();
        return ret;
}
static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
                                                       const u16 sport,
                                                       const u32 daddr,
                                                       const u16 hnum,
                                                       const int dif)
{
        struct inet_ehash_bucket *head;
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
        struct sock *sk;
        const struct hlist_node *node;
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
        head = &tcp_hashinfo.ehash[hash];
        read_lock(&head->lock);
        sk_for_each(sk, node, &head->chain) {
                if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
                        goto hit; /* You sunk my battleship! */
        }

        /* Must check for a TIME_WAIT'er before going to listener hash. */
        sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
                if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
                        goto hit;
        }
        sk = NULL;
out:
        read_unlock(&head->lock);
        return sk;
hit:
        sock_hold(sk);
        goto out;
}
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
                                           u32 daddr, u16 hnum, int dif)
{
        struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
                                                      daddr, hnum, dif);
        return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif);
}
inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
                                  u16 dport, int dif)
{
        struct sock *sk;

        local_bh_disable();
        sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
        local_bh_enable();

        return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);
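/* Note the demux order used above: fully-specified connections (including
 * TIME-WAIT buckets, kept in the upper half of the ehash table) are checked
 * before listeners, so a wildcard listener can never shadow an established
 * connection.
 */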
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
                                      struct inet_timewait_sock **twp)
{
        struct inet_sock *inet = inet_sk(sk);
        u32 daddr = inet->rcv_saddr;
        u32 saddr = inet->daddr;
        int dif = sk->sk_bound_dev_if;
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
        const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
        struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
        struct sock *sk2;
        const struct hlist_node *node;
        struct inet_timewait_sock *tw;

        write_lock(&head->lock);

        /* Check TIME-WAIT sockets first. */
        sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
                tw = inet_twsk(sk2);

                if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
                        const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
                        struct tcp_sock *tp = tcp_sk(sk);

                        /* With PAWS, it is safe from the viewpoint
                           of data integrity. Even without PAWS it
                           is safe provided sequence spaces do not
                           overlap, i.e. at data rates <= 80Mbit/sec.

                           Actually, the idea is close to VJ's one,
                           only timestamp cache is held not per host,
                           but per port pair and TW bucket is used
                           as state holder.

                           If TW bucket has been already destroyed we
                           fall back to VJ's scheme and use initial
                           timestamp retrieved from peer table.
                         */
                        if (tcptw->tw_ts_recent_stamp &&
                            (!twp || (sysctl_tcp_tw_reuse &&
                                      xtime.tv_sec -
                                      tcptw->tw_ts_recent_stamp > 1))) {
                                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                                if (tp->write_seq == 0)
                                        tp->write_seq = 1;
                                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                                sock_hold(sk2);
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        tw = NULL;

        /* And established part... */
        sk_for_each(sk2, node, &head->chain) {
                if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        /* Must record num and sport now. Otherwise we will see
         * in the hash table a socket with a funny identity. */
        inet->num = lport;
        inet->sport = htons(lport);
        sk->sk_hashent = hash;
        BUG_TRAP(sk_unhashed(sk));
        __sk_add_node(sk, &head->chain);
        sock_prot_inc_use(sk->sk_prot);
        write_unlock(&head->lock);

        if (twp) {
                *twp = tw;
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                tcp_tw_deschedule(tw);
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

                inet_twsk_put(tw);
        }

        return 0;

not_unique:
        write_unlock(&head->lock);
        return -EADDRNOTAVAIL;
}
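/* Worked example (illustrative numbers): if the dying TIME-WAIT connection
 * finished with tw_snd_nxt == 1000000, the recycled connection above starts
 * at write_seq == 1000000 + 65535 + 2 == 1065537; that is beyond the largest
 * unscaled window (65535) the peer could still hold open for the old
 * incarnation, with the extra 2 leaving room for the sequence slots that the
 * SYN and FIN occupy.
 */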
static inline u32 connect_port_offset(const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);

        return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
                                         inet->dport);
}
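/* The offset is a keyed hash of (rcv_saddr, daddr, dport), so the ephemeral
 * port search in tcp_v4_hash_connect() starts at a per-destination,
 * unpredictable position instead of marching a single global rover.
 */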
/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
        const unsigned short snum = inet_sk(sk)->num;
        struct inet_bind_hashbucket *head;
        struct inet_bind_bucket *tb;
        int ret;

        if (!snum) {
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int range = high - low;
                int i;
                int port;
                static u32 hint;
                u32 offset = hint + connect_port_offset(sk);
                struct hlist_node *node;
                struct inet_timewait_sock *tw = NULL;

                local_bh_disable();
                for (i = 1; i <= range; i++) {
                        port = low + (i + offset) % range;
                        head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
                        spin_lock(&head->lock);

                        /* Does not bother with rcv_saddr checks,
                         * because the established check is already
                         * unique enough.
                         */
                        inet_bind_bucket_for_each(tb, node, &head->chain) {
                                if (tb->port == port) {
                                        BUG_TRAP(!hlist_empty(&tb->owners));
                                        if (tb->fastreuse >= 0)
                                                goto next_port;
                                        if (!__tcp_v4_check_established(sk,
                                                                        port,
                                                                        &tw))
                                                goto ok;
                                        goto next_port;
                                }
                        }

                        tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
                        if (!tb) {
                                spin_unlock(&head->lock);
                                break;
                        }
                        tb->fastreuse = -1;
                        goto ok;

                next_port:
                        spin_unlock(&head->lock);
                }
                local_bh_enable();

                return -EADDRNOTAVAIL;

ok:
                hint += i;

                /* Head lock still held and bh's disabled */
                inet_bind_hash(sk, tb, port);
                if (sk_unhashed(sk)) {
                        inet_sk(sk)->sport = htons(port);
                        __inet_hash(&tcp_hashinfo, sk, 0);
                }
                spin_unlock(&head->lock);

                if (tw) {
                        tcp_tw_deschedule(tw);
                        inet_twsk_put(tw);
                }

                ret = 0;
                goto out;
        }

        head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
        tb = inet_sk(sk)->bind_hash;
        spin_lock_bh(&head->lock);
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
                __inet_hash(&tcp_hashinfo, sk, 0);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                spin_unlock(&head->lock);
                /* No definite answer... Walk to established hash table */
                ret = __tcp_v4_check_established(sk, snum, NULL);
out:
                local_bh_enable();
                return ret;
        }
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (sysctl_tcp_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state TIME-WAIT
                 * and initialize rx_opt.ts_recent from it, when trying new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        tp->ext_header_len = 0;
        if (inet->opt)
                tp->ext_header_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables
         * and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = tcp_v4_hash_connect(sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket. */
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /* This unhashes the socket and releases the local port, if necessary. */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
        return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
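/* The '& (TCP_SYNQ_HSIZE - 1)' reduction assumes TCP_SYNQ_HSIZE is a power
 * of two; e.g. with TCP_SYNQ_HSIZE == 512 the jhash result is simply masked
 * with 0x1ff to pick a bucket.
 */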
static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
                                              struct request_sock ***prevp,
                                              __u16 rport,
                                              __u32 raddr, __u32 laddr)
{
        struct listen_sock *lopt = tp->accept_queue.listen_opt;
        struct request_sock *req, **prev;

        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
             (req = *prev) != NULL;
             prev = &req->dl_next) {
                const struct inet_request_sock *ireq = inet_rsk(req);

                if (ireq->rmt_port == rport &&
                    ireq->rmt_addr == raddr &&
                    ireq->loc_addr == laddr &&
                    TCP_INET_FAMILY(req->rsk_ops->family)) {
                        BUG_TRAP(!req->sk);
                        *prevp = prev;
                        break;
                }
        }

        return req;
}
static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct listen_sock *lopt = tp->accept_queue.listen_opt;
        u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);

        reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
        tcp_synq_added(sk);
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
                                     u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go through
         * unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            tp->pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
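/* Illustrative encoding: a "destination unreachable, port unreachable" ICMP
 * arrives as type 3, code 3, so the positive err value is
 * (ICMP_DEST_UNREACH << 8) | ICMP_PORT_UNREACH == 0x303 before being mapped
 * to an errno through icmp_err_convert[] below.
 */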
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
                           th->source, tcp_v4_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put((struct inet_timewait_sock *)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = tcp_v4_search_req(tp, &prev, th->dest,
                                        iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                 * an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                tcp_synq_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can, f.e., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
         * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors as ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);

        if (skb->ip_summed == CHECKSUM_HW) {
                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
                skb->csum = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
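/* Per RFC 793 reset generation (as the code below does): if the offending
 * segment carried an ACK, the RST's sequence number is taken from that ACK
 * field; otherwise the RST itself ACKs exactly the offending segment
 * (seq + SYN + FIN + payload length).
 */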
static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest   = th->source;
        rth.source = th->dest;
        rth.doff   = sizeof(struct tcphdr) / 4;
        rth.rst    = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                    skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len  = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                     (TCPOPT_TIMESTAMP << 8) |
                                     TCPOLEN_TIMESTAMP);
                rep.tsopt[1] = htonl(tcp_time_stamp);
                rep.tsopt[2] = htonl(ts);
                arg.iov[0].iov_len = sizeof(rep);
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}
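/* snt_isn + 1 and rcv_isn + 1 account for the SYNs themselves: each SYN
 * consumes one sequence number, so the ACK for an embryonic connection must
 * point one past both initial sequence numbers.
 */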
static struct dst_entry* tcp_v4_route_req(struct sock *sk,
                                          struct request_sock *req)
{
        struct rtable *rt;
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct ip_options *opt = inet_rsk(req)->opt;
        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                            .nl_u = { .ip4_u =
                                      { .daddr = ((opt && opt->srr) ?
                                                  opt->faddr :
                                                  ireq->rmt_addr),
                                        .saddr = ireq->loc_addr,
                                        .tos = RT_CONN_FLAGS(sk) } },
                            .proto = IPPROTO_TCP,
                            .uli_u = { .ports =
                                       { .sport = inet_sk(sk)->sport,
                                         .dport = ireq->rmt_port } } };

        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
                ip_rt_put(rt);
                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        return &rt->u.dst;
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff * skb;

        /* First, grab a route. */
        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                if (err == NET_XMIT_CN)
                        err = 0;
        }

out:
        dst_release(dst);
        return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        if (inet_rsk(req)->opt)
                kfree(inet_rsk(req)->opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}
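/* The time_after() test rate-limits the warning to one line per HZ * 60
 * jiffies, i.e. at most one printk per minute no matter how heavy the
 * flood is.
 */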
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
                                                     struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}
struct request_sock_ops tcp_request_sock_ops = {
        .family      = PF_INET,
        .obj_size    = sizeof(struct tcp_request_sock),
        .rtx_syn_ack = tcp_v4_send_synack,
        .send_ack    = tcp_v4_reqsk_send_ack,
        .destructor  = tcp_v4_reqsk_destructor,
        .send_reset  = tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __u32 saddr = skb->nh.iph->saddr;
        __u32 daddr = skb->nh.iph->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently a real one.
         */
        if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on a web server
                 * which contains information interesting only for Windows
                 * users) do not send their stamp in SYN. It is an easy case.
                 * We simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    sysctl_tcp_tw_recycle &&
                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - tcp_synq_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is filled with destinations proven
                         * to be alive.
                         * It means that we continue to communicate
                         * with destinations already remembered at the
                         * moment of synflood.
                         */
                        LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
                                              "request from %u.%u."
                                              "%u.%u/%u\n",
                                              NIPQUAD(saddr),
                                              ntohs(skb->h.th->source)));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(sk, skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                tcp_v4_synq_add(sk, req);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
                goto exit;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit;

        sk_setup_caps(newsk, dst);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        newinet->daddr        = ireq->rmt_addr;
        newinet->rcv_saddr    = ireq->loc_addr;
        newinet->saddr        = ireq->loc_addr;
        newinet->opt          = ireq->opt;
        ireq->opt             = NULL;
        newinet->mc_index     = tcp_v4_iif(skb);
        newinet->mc_ttl       = skb->nh.iph->ttl;
        newtp->ext_header_len = 0;
        if (newinet->opt)
                newtp->ext_header_len = newinet->opt->optlen;
        newinet->id = newtp->write_seq ^ jiffies;

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
        tcp_initialize_rcv_mss(newsk);

        __inet_hash(&tcp_hashinfo, newsk, 0);
        __inet_inherit_port(&tcp_hashinfo, sk, newsk);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct iphdr *iph = skb->nh.iph;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sock *nsk;
        struct request_sock **prev;
        /* Find possible connection requests. */
        struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
                                                     iph->saddr, iph->daddr);
        if (req)
                return tcp_check_req(sk, skb, req, prev);

        nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
                                          th->source,
                                          skb->nh.iph->daddr,
                                          ntohs(th->dest),
                                          tcp_v4_iif(skb));

        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
                        bh_lock_sock(nsk);
                        return nsk;
                }
                inet_twsk_put((struct inet_timewait_sock *)nsk);
                return NULL;
        }

#ifdef CONFIG_SYN_COOKIES
        if (!th->rst && !th->syn && th->ack)
                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
        return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_HW) {
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
                                  skb->nh.iph->daddr, skb->csum))
                        return 0;

                LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
                skb->ip_summed = CHECKSUM_NONE;
        }
        if (skb->len <= 76) {
                if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
                                 skb->nh.iph->daddr,
                                 skb_checksum(skb, 0, skb->len, 0)))
                        return -1;
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        } else {
                skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
                                          skb->nh.iph->saddr,
                                          skb->nh.iph->daddr, 0);
        }
        return 0;
}
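/* Design note: packets of at most 76 bytes are verified immediately, which
 * is cheap for such short data; for anything larger only the pseudo-header
 * sum is folded in here and full verification is deferred to
 * tcp_checksum_complete() in the callers, once the data is traversed anyway.
 */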
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                TCP_CHECK_TIMER(sk);
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
                TCP_CHECK_TIMER(sk);
                return 0;
        }

        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb))
                                goto reset;
                        return 0;
                }
        }

        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
        TCP_CHECK_TIMER(sk);
        return 0;

reset:
        tcp_v4_send_reset(skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(TCP_MIB_INERRS);
        goto discard;
}
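/* Three paths reach tcp_v4_do_rcv() from tcp_v4_rcv(): directly, when the
 * socket is unowned and the prequeue does not take the skb; from the
 * prequeue, when a reader is blocked in tcp_recvmsg(); or via backlog
 * replay, when the socket was owned by the user at receive time.
 */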
int tcp_v4_rcv(struct sk_buff *skb)
{
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = skb->h.th;

        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
             tcp_v4_checksum_init(skb) < 0))
                goto bad_packet;

        th = skb->h.th;
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
                             skb->nh.iph->daddr, ntohs(th->dest),
                             tcp_v4_iif(skb));

        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }
        switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
                                           skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
                                                        skb->nh.iph->daddr,
                                                        ntohs(th->dest),
                                                        tcp_v4_iif(skb));
                if (sk2) {
                        tcp_tw_deschedule((struct inet_timewait_sock *)sk);
                        inet_twsk_put((struct inet_timewait_sock *)sk);
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
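/* A SYN that hits a TIME-WAIT bucket and passes tcp_timewait_state_process()
 * (TCP_TW_SYN above) is redirected to a matching listener, so a new
 * incarnation of the old connection can be accepted without waiting for the
 * bucket to expire.
 */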
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
        struct inet_sock *inet = inet_sk(sk);

        sin->sin_family      = AF_INET;
        sin->sin_addr.s_addr = inet->daddr;
        sin->sin_port        = inet->dport;
}
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
        struct inet_peer *peer = NULL;
        int release_it = 0;

        if (!rt || rt->rt_dst != inet->daddr) {
                peer = inet_getpeer(inet->daddr, 1);
                release_it = 1;
        } else {
                if (!rt->peer)
                        rt_bind_peer(rt, 1);
                peer = rt->peer;
        }

        if (peer) {
                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
                        peer->tcp_ts = tp->rx_opt.ts_recent;
                }
                if (release_it)
                        inet_putpeer(peer);
                return 1;
        }

        return 0;
}
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

        if (peer) {
                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
                        peer->tcp_ts       = tcptw->tw_ts_recent;
                }
                inet_putpeer(peer);
                return 1;
        }

        return 0;
}
struct tcp_func ipv4_specific = {
        .queue_xmit     = ip_queue_xmit,
        .send_check     = tcp_v4_send_check,
        .rebuild_header = inet_sk_rebuild_header,
        .conn_request   = tcp_v4_conn_request,
        .syn_recv_sock  = tcp_v4_syn_recv_sock,
        .remember_stamp = tcp_v4_remember_stamp,
        .net_header_len = sizeof(struct iphdr),
        .setsockopt     = ip_setsockopt,
        .getsockopt     = ip_getsockopt,
        .addr2sockaddr  = v4_addr2sockaddr,
        .sockaddr_len   = sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);

        tp->rto  = TCP_TIMEOUT_INIT;
        tp->mdev = TCP_TIMEOUT_INIT;

        /* So many TCP implementations out there (incorrectly) count the
         * initial SYN frame in their delayed-ACK and congestion control
         * algorithms that we must have the following bandaid to talk
         * efficiently to them.  -DaveM
         */
        tp->snd_cwnd = 2;

        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
         */
        tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = 536;

        tp->reordering = sysctl_tcp_reordering;
        tp->ca_ops = &tcp_init_congestion_ops;

        sk->sk_state = TCP_CLOSE;

        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        tp->af_specific = &ipv4_specific;

        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];

        atomic_inc(&tcp_sockets_allocated);

        return 0;
}
int tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(tp);

        /* Clean up the write buffer. */
        sk_stream_writequeue_purge(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        __skb_queue_purge(&tp->out_of_order_queue);

        /* Clean prequeue, it must be empty really */
        __skb_queue_purge(&tp->ucopy.prequeue);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_sk(sk)->bind_hash)
                inet_put_port(&tcp_hashinfo, sk);

        /*
         * If sendmsg cached page exists, toss it.
         */
        if (sk->sk_sndmsg_page) {
                __free_page(sk->sk_sndmsg_page);
                sk->sk_sndmsg_page = NULL;
        }

        atomic_dec(&tcp_sockets_allocated);

        return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
        return hlist_empty(head) ? NULL :
               list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
        return tw->tw_node.next ?
                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct tcp_sock *tp;
        struct hlist_node *node;
        struct sock *sk = cur;
        struct tcp_iter_state* st = seq->private;

        if (!sk) {
                st->bucket = 0;
                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
                goto get_sk;
        }

        ++st->num;

        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                struct request_sock *req = cur;

                tp = tcp_sk(st->syn_wait_sk);
                req = req->dl_next;
                while (1) {
                        while (req) {
                                if (req->rsk_ops->family == st->family) {
                                        cur = req;
                                        goto out;
                                }
                                req = req->dl_next;
                        }
                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
                                break;
get_req:
                        req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
                }
                sk        = sk_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
        } else {
                tp = tcp_sk(sk);
                read_lock_bh(&tp->accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&tp->accept_queue))
                        goto start_req;
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
                sk = sk_next(sk);
        }
get_sk:
        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
                }
                tp = tcp_sk(sk);
                read_lock_bh(&tp->accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
                        st->sbucket     = 0;
                        goto get_req;
                }
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
        }
        if (++st->bucket < INET_LHTABLE_SIZE) {
                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
                goto get_sk;
        }
        cur = NULL;
out:
        return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
        void *rc = listening_get_next(seq, NULL);

        while (rc && *pos) {
                rc = listening_get_next(seq, rc);
                --*pos;
        }
        return rc;
}
static void *established_get_first(struct seq_file *seq)
{
        struct tcp_iter_state* st = seq->private;
        void *rc = NULL;

        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
                struct sock *sk;
                struct hlist_node *node;
                struct inet_timewait_sock *tw;

                /* We can reschedule _before_ having picked the target: */
                cond_resched_softirq();

                read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                        if (sk->sk_family != st->family) {
                                continue;
                        }
                        rc = sk;
                        goto out;
                }
                st->state = TCP_SEQ_STATE_TIME_WAIT;
                inet_twsk_for_each(tw, node,
                                   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
                        if (tw->tw_family != st->family) {
                                continue;
                        }
                        rc = tw;
                        goto out;
                }
                read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;
        }
out:
        return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
        struct hlist_node *node;
        struct tcp_iter_state* st = seq->private;

        ++st->num;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
get_tw:
                while (tw && tw->tw_family != st->family) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                /* We can reschedule between buckets: */
                cond_resched_softirq();

                if (++st->bucket < tcp_hashinfo.ehash_size) {
                        read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
                } else {
                        cur = NULL;
                        goto out;
                }
        } else
                sk = sk_next(sk);

        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family)
                        goto found;
        }

        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
        void *rc = established_get_first(seq);

        while (rc && pos) {
                rc = established_get_next(seq, rc);
                --pos;
        }
        return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
        void *rc;
        struct tcp_iter_state* st = seq->private;

        inet_listen_lock(&tcp_hashinfo);
        st->state = TCP_SEQ_STATE_LISTENING;
        rc = listening_get_idx(seq, &pos);

        if (!rc) {
                inet_listen_unlock(&tcp_hashinfo);
                local_bh_disable();
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                rc = established_get_idx(seq, pos);
        }

        return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state* st = seq->private;
        st->state = TCP_SEQ_STATE_LISTENING;
        st->num = 0;
        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        void *rc = NULL;
        struct tcp_iter_state* st;

        if (v == SEQ_START_TOKEN) {
                rc = tcp_get_idx(seq, 0);
                goto out;
        }
        st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
        case TCP_SEQ_STATE_LISTENING:
                rc = listening_get_next(seq, v);
                if (!rc) {
                        inet_listen_unlock(&tcp_hashinfo);
                        local_bh_disable();
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        rc = established_get_first(seq);
                }
                break;
        case TCP_SEQ_STATE_ESTABLISHED:
        case TCP_SEQ_STATE_TIME_WAIT:
                rc = established_get_next(seq, v);
                break;
        }
out:
        ++*pos;
        return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct tcp_iter_state* st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
                if (v) {
                        struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
                        read_unlock_bh(&tp->accept_queue.syn_wait_lock);
                }
        case TCP_SEQ_STATE_LISTENING:
                if (v != SEQ_START_TOKEN)
                        inet_listen_unlock(&tcp_hashinfo);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
        case TCP_SEQ_STATE_ESTABLISHED:
                if (v)
                        read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                local_bh_enable();
                break;
        }
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
        struct seq_file *seq;
        struct tcp_iter_state *s;
        int rc;

        if (unlikely(afinfo == NULL))
                return -EINVAL;

        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;
        memset(s, 0, sizeof(*s));
        s->family        = afinfo->family;
        s->seq_ops.start = tcp_seq_start;
        s->seq_ops.next  = tcp_seq_next;
        s->seq_ops.show  = afinfo->seq_show;
        s->seq_ops.stop  = tcp_seq_stop;

        rc = seq_open(file, &s->seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
        int rc = 0;
        struct proc_dir_entry *p;

        if (!afinfo)
                return -EINVAL;
        afinfo->seq_fops->owner   = afinfo->owner;
        afinfo->seq_fops->open    = tcp_seq_open;
        afinfo->seq_fops->read    = seq_read;
        afinfo->seq_fops->llseek  = seq_lseek;
        afinfo->seq_fops->release = seq_release_private;

        p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
        if (p)
                p->data = afinfo;
        else
                rc = -ENOMEM;
        return rc;
}
void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
        if (!afinfo)
                return;
        proc_net_remove(afinfo->name);
        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
                         char *tmpbuf, int i, int uid)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int ttd = req->expires - jiffies;

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
                i,
                ireq->loc_addr,
                ntohs(inet_sk(sk)->sport),
                ireq->rmt_addr,
                ntohs(ireq->rmt_port),
                TCP_SYN_RECV,
                0, 0, /* could print option size, but that is af dependent. */
                1,    /* timers active (only the expire timer) */
                jiffies_to_clock_t(ttd),
                req->retrans,
                uid,
                0,  /* non standard timer */
                0,  /* open_requests have no inode */
                atomic_read(&sk->sk_refcnt),
                req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
        int timer_active;
        unsigned long timer_expires;
        struct tcp_sock *tp = tcp_sk(sp);
        struct inet_sock *inet = inet_sk(sp);
        unsigned int dest = inet->daddr;
        unsigned int src = inet->rcv_saddr;
        __u16 destp = ntohs(inet->dport);
        __u16 srcp = ntohs(inet->sport);

        if (tp->pending == TCP_TIME_RETRANS) {
                timer_active  = 1;
                timer_expires = tp->timeout;
        } else if (tp->pending == TCP_TIME_PROBE0) {
                timer_active  = 4;
                timer_expires = tp->timeout;
        } else if (timer_pending(&sp->sk_timer)) {
                timer_active  = 2;
                timer_expires = sp->sk_timer.expires;
        } else {
                timer_active  = 0;
                timer_expires = jiffies;
        }

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
                i, src, srcp, dest, destp, sp->sk_state,
                tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
                timer_active,
                jiffies_to_clock_t(timer_expires - jiffies),
                tp->retransmits,
                sock_i_uid(sp),
                tp->probes_out,
                sock_i_ino(sp),
                atomic_read(&sp->sk_refcnt), sp,
                tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
                tp->snd_cwnd,
                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
        unsigned int dest, src;
        __u16 destp, srcp;
        int ttd = tw->tw_ttd - jiffies;

        if (ttd < 0)
                ttd = 0;

        dest  = tw->tw_daddr;
        src   = tw->tw_rcv_saddr;
        destp = ntohs(tw->tw_dport);
        srcp  = ntohs(tw->tw_sport);

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
                atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
        struct tcp_iter_state* st;
        char tmpbuf[TMPSZ + 1];

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "%-*s\n", TMPSZ - 1,
                           "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode");
                goto out;
        }
        st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
        case TCP_SEQ_STATE_ESTABLISHED:
                get_tcp4_sock(v, tmpbuf, st->num);
                break;
        case TCP_SEQ_STATE_OPENREQ:
                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
                get_timewait4_sock(v, tmpbuf, st->num);
                break;
        }
        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
        return 0;
}
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .owner    = THIS_MODULE,
        .name     = "tcp",
        .family   = AF_INET,
        .seq_show = tcp4_seq_show,
        .seq_fops = &tcp4_seq_fops,
};
int __init tcp4_proc_init(void)
{
        return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
        tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
        .name                  = "TCP",
        .owner                 = THIS_MODULE,
        .close                 = tcp_close,
        .connect               = tcp_v4_connect,
        .disconnect            = tcp_disconnect,
        .accept                = tcp_accept,
        .ioctl                 = tcp_ioctl,
        .init                  = tcp_v4_init_sock,
        .destroy               = tcp_v4_destroy_sock,
        .shutdown              = tcp_shutdown,
        .setsockopt            = tcp_setsockopt,
        .getsockopt            = tcp_getsockopt,
        .sendmsg               = tcp_sendmsg,
        .recvmsg               = tcp_recvmsg,
        .backlog_rcv           = tcp_v4_do_rcv,
        .hash                  = tcp_v4_hash,
        .unhash                = tcp_unhash,
        .get_port              = tcp_v4_get_port,
        .enter_memory_pressure = tcp_enter_memory_pressure,
        .sockets_allocated     = &tcp_sockets_allocated,
        .memory_allocated      = &tcp_memory_allocated,
        .memory_pressure       = &tcp_memory_pressure,
        .sysctl_mem            = sysctl_tcp_mem,
        .sysctl_wmem           = sysctl_tcp_wmem,
        .sysctl_rmem           = sysctl_tcp_rmem,
        .max_header            = MAX_TCP_HEADER,
        .obj_size              = sizeof(struct tcp_sock),
        .twsk_obj_size         = sizeof(struct tcp_timewait_sock),
        .rsk_prot              = &tcp_request_sock_ops,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
        int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
        if (err < 0)
                panic("Failed to create the TCP control socket.\n");
        tcp_socket->sk->sk_allocation = GFP_ATOMIC;
        inet_sk(tcp_socket->sk)->uc_ttl = -1;

        /* Unhash it so that IP input processing does not even
         * see it, we do not wish this socket to see incoming
         * packets.
         */
        tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);