1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97         .port_rover     = 1024 - 1,
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109         const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
110         struct sock *sk2;
111         struct hlist_node *node;
112         int reuse = sk->sk_reuse;
113
114         sk_for_each_bound(sk2, node, &tb->owners) {
115                 if (sk != sk2 &&
116                     !tcp_v6_ipv6only(sk2) &&
117                     (!sk->sk_bound_dev_if ||
118                      !sk2->sk_bound_dev_if ||
119                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120                         if (!reuse || !sk2->sk_reuse ||
121                             sk2->sk_state == TCP_LISTEN) {
122                                 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124                                     sk2_rcv_saddr == sk_rcv_saddr)
125                                         break;
126                         }
127                 }
128         }
129         return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock;
133  * if snum is zero, select any available local port.
134  */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136 {
137         struct inet_bind_hashbucket *head;
138         struct hlist_node *node;
139         struct inet_bind_bucket *tb;
140         int ret;
141
142         local_bh_disable();
143         if (!snum) {
144                 int low = sysctl_local_port_range[0];
145                 int high = sysctl_local_port_range[1];
146                 int remaining = (high - low) + 1;
147                 int rover;
148
149                 spin_lock(&tcp_hashinfo.portalloc_lock);
150                 if (tcp_hashinfo.port_rover < low)
151                         rover = low;
152                 else
153                         rover = tcp_hashinfo.port_rover;
154                 do {
155                         rover++;
156                         if (rover > high)
157                                 rover = low;
158                         head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159                         spin_lock(&head->lock);
160                         inet_bind_bucket_for_each(tb, node, &head->chain)
161                                 if (tb->port == rover)
162                                         goto next;
163                         break;
164                 next:
165                         spin_unlock(&head->lock);
166                 } while (--remaining > 0);
167                 tcp_hashinfo.port_rover = rover;
168                 spin_unlock(&tcp_hashinfo.portalloc_lock);
169
170                 /* Exhausted local port range during search?  It is not
171                  * possible for us to be holding one of the bind hash
172                  * locks if this test triggers, because if 'remaining'
173                  * drops to zero, we broke out of the do/while loop at
174                  * the top level, not from the 'break;' statement.
175                  */
176                 ret = 1;
177                 if (unlikely(remaining <= 0))
178                         goto fail;
179
180                 /* OK, here is the one we will use.  HEAD is
181                  * non-NULL and we hold its lock.
182                  */
183                 snum = rover;
184         } else {
185                 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186                 spin_lock(&head->lock);
187                 inet_bind_bucket_for_each(tb, node, &head->chain)
188                         if (tb->port == snum)
189                                 goto tb_found;
190         }
191         tb = NULL;
192         goto tb_not_found;
193 tb_found:
194         if (!hlist_empty(&tb->owners)) {
195                 if (sk->sk_reuse > 1)
196                         goto success;
197                 if (tb->fastreuse > 0 &&
198                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199                         goto success;
200                 } else {
201                         ret = 1;
202                         if (tcp_bind_conflict(sk, tb))
203                                 goto fail_unlock;
204                 }
205         }
206 tb_not_found:
207         ret = 1;
208         if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209                 goto fail_unlock;
210         if (hlist_empty(&tb->owners)) {
211                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212                         tb->fastreuse = 1;
213                 else
214                         tb->fastreuse = 0;
215         } else if (tb->fastreuse &&
216                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217                 tb->fastreuse = 0;
218 success:
219         if (!inet_sk(sk)->bind_hash)
220                 inet_bind_hash(sk, tb, snum);
221         BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222         ret = 0;
223
224 fail_unlock:
225         spin_unlock(&head->lock);
226 fail:
227         local_bh_enable();
228         return ret;
229 }
230
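/* Thin wrappers that insert/remove a TCP socket into/from the shared
 * tcp_hashinfo tables via the generic inet hashing helpers.
 */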
231 static void tcp_v4_hash(struct sock *sk)
232 {
233         inet_hash(&tcp_hashinfo, sk);
234 }
235
236 void tcp_unhash(struct sock *sk)
237 {
238         inet_unhash(&tcp_hashinfo, sk);
239 }
240
241 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
242  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
243  *
244  * Local BH must be disabled here.
245  */
246
247 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
248                                                        const u16 sport,
249                                                        const u32 daddr,
250                                                        const u16 hnum,
251                                                        const int dif)
252 {
253         struct inet_ehash_bucket *head;
254         INET_ADDR_COOKIE(acookie, saddr, daddr)
255         const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
256         struct sock *sk;
257         const struct hlist_node *node;
258         /* Optimize here for a direct hit; only listening sockets can
259          * have wildcards anyway.
260          */
261         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
262         head = &tcp_hashinfo.ehash[hash];
263         read_lock(&head->lock);
264         sk_for_each(sk, node, &head->chain) {
265                 if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
266                         goto hit; /* You sunk my battleship! */
267         }
268
269         /* Must check for a TIME_WAIT'er before going to listener hash. */
270         sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
271                 if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
272                         goto hit;
273         }
274         sk = NULL;
275 out:
276         read_unlock(&head->lock);
277         return sk;
278 hit:
279         sock_hold(sk);
280         goto out;
281 }
282
283 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
284                                            u32 daddr, u16 hnum, int dif)
285 {
286         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
287                                                       daddr, hnum, dif);
288
289         return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif);
290 }
291
292 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
293                                   u16 dport, int dif)
294 {
295         struct sock *sk;
296
297         local_bh_disable();
298         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
299         local_bh_enable();
300
301         return sk;
302 }
303
304 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
305
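/* Pick the initial sequence number for an incoming connection from the
 * addresses and ports of the received segment (the arguments are swapped
 * so the ISN is generated for the reply direction).
 */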
306 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
307 {
308         return secure_tcp_sequence_number(skb->nh.iph->daddr,
309                                           skb->nh.iph->saddr,
310                                           skb->h.th->dest,
311                                           skb->h.th->source);
312 }
313
314 /* called with local bh disabled */
315 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
316                                       struct inet_timewait_sock **twp)
317 {
318         struct inet_sock *inet = inet_sk(sk);
319         u32 daddr = inet->rcv_saddr;
320         u32 saddr = inet->daddr;
321         int dif = sk->sk_bound_dev_if;
322         INET_ADDR_COOKIE(acookie, saddr, daddr)
323         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
324         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
325         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
326         struct sock *sk2;
327         const struct hlist_node *node;
328         struct inet_timewait_sock *tw;
329
330         write_lock(&head->lock);
331
332         /* Check TIME-WAIT sockets first. */
333         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
334                 tw = inet_twsk(sk2);
335
336                 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
337                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
338                         struct tcp_sock *tp = tcp_sk(sk);
339
340                         /* With PAWS, it is safe from the viewpoint
341                            of data integrity. Even without PAWS it
342                            is safe provided the sequence spaces do
343                            not overlap, i.e. at data rates <= 80 Mbit/sec.
344
345                            The idea is close to VJ's: the timestamp
346                            cache is held not per host but per port
347                            pair, and the TW bucket is used as the
348                            state holder.
349
350                            If the TW bucket has already been destroyed,
351                            we fall back to VJ's scheme and use the
352                            initial timestamp retrieved from the peer table.
353                          */
354                         if (tcptw->tw_ts_recent_stamp &&
355                             (!twp || (sysctl_tcp_tw_reuse &&
356                                       xtime.tv_sec -
357                                       tcptw->tw_ts_recent_stamp > 1))) {
358                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
359                                 if (tp->write_seq == 0)
360                                         tp->write_seq = 1;
361                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
362                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
363                                 sock_hold(sk2);
364                                 goto unique;
365                         } else
366                                 goto not_unique;
367                 }
368         }
369         tw = NULL;
370
371         /* And established part... */
372         sk_for_each(sk2, node, &head->chain) {
373                 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
374                         goto not_unique;
375         }
376
377 unique:
378         /* Must record num and sport now. Otherwise we will see
379          * a socket with a funny identity in the hash table. */
380         inet->num = lport;
381         inet->sport = htons(lport);
382         sk->sk_hashent = hash;
383         BUG_TRAP(sk_unhashed(sk));
384         __sk_add_node(sk, &head->chain);
385         sock_prot_inc_use(sk->sk_prot);
386         write_unlock(&head->lock);
387
388         if (twp) {
389                 *twp = tw;
390                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
391         } else if (tw) {
392                 /* Silly. Should hash-dance instead... */
393                 tcp_tw_deschedule(tw);
394                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
395
396                 inet_twsk_put(tw);
397         }
398
399         return 0;
400
401 not_unique:
402         write_unlock(&head->lock);
403         return -EADDRNOTAVAIL;
404 }
405
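/* Connection-specific offset (derived from the local address and the
 * remote address/port) used to randomise where the ephemeral port search
 * in tcp_v4_hash_connect() starts.
 */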
406 static inline u32 connect_port_offset(const struct sock *sk)
407 {
408         const struct inet_sock *inet = inet_sk(sk);
409
410         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
411                                          inet->dport);
412 }
413
414 /*
415  * Bind a port for a connect operation and hash it.
416  */
417 static inline int tcp_v4_hash_connect(struct sock *sk)
418 {
419         const unsigned short snum = inet_sk(sk)->num;
420         struct inet_bind_hashbucket *head;
421         struct inet_bind_bucket *tb;
422         int ret;
423
424         if (!snum) {
425                 int low = sysctl_local_port_range[0];
426                 int high = sysctl_local_port_range[1];
427                 int range = high - low;
428                 int i;
429                 int port;
430                 static u32 hint;
431                 u32 offset = hint + connect_port_offset(sk);
432                 struct hlist_node *node;
433                 struct inet_timewait_sock *tw = NULL;
434
435                 local_bh_disable();
436                 for (i = 1; i <= range; i++) {
437                         port = low + (i + offset) % range;
438                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
439                         spin_lock(&head->lock);
440
441                         /* Does not bother with rcv_saddr checks,
442                          * because the established check is already
443                          * unique enough.
444                          */
445                         inet_bind_bucket_for_each(tb, node, &head->chain) {
446                                 if (tb->port == port) {
447                                         BUG_TRAP(!hlist_empty(&tb->owners));
448                                         if (tb->fastreuse >= 0)
449                                                 goto next_port;
450                                         if (!__tcp_v4_check_established(sk,
451                                                                         port,
452                                                                         &tw))
453                                                 goto ok;
454                                         goto next_port;
455                                 }
456                         }
457
458                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
459                         if (!tb) {
460                                 spin_unlock(&head->lock);
461                                 break;
462                         }
463                         tb->fastreuse = -1;
464                         goto ok;
465
466                 next_port:
467                         spin_unlock(&head->lock);
468                 }
469                 local_bh_enable();
470
471                 return -EADDRNOTAVAIL;
472
473 ok:
474                 hint += i;
475
476                 /* Head lock still held and bh's disabled */
477                 inet_bind_hash(sk, tb, port);
478                 if (sk_unhashed(sk)) {
479                         inet_sk(sk)->sport = htons(port);
480                         __inet_hash(&tcp_hashinfo, sk, 0);
481                 }
482                 spin_unlock(&head->lock);
483
484                 if (tw) {
485                         tcp_tw_deschedule(tw);
486                         inet_twsk_put(tw);
487                 }
488
489                 ret = 0;
490                 goto out;
491         }
492
493         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
494         tb  = inet_sk(sk)->bind_hash;
495         spin_lock_bh(&head->lock);
496         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
497                 __inet_hash(&tcp_hashinfo, sk, 0);
498                 spin_unlock_bh(&head->lock);
499                 return 0;
500         } else {
501                 spin_unlock(&head->lock);
502                 /* No definite answer... Walk the established hash table */
503                 ret = __tcp_v4_check_established(sk, snum, NULL);
504 out:
505                 local_bh_enable();
506                 return ret;
507         }
508 }
509
510 /* This will initiate an outgoing connection. */
511 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
512 {
513         struct inet_sock *inet = inet_sk(sk);
514         struct tcp_sock *tp = tcp_sk(sk);
515         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
516         struct rtable *rt;
517         u32 daddr, nexthop;
518         int tmp;
519         int err;
520
521         if (addr_len < sizeof(struct sockaddr_in))
522                 return -EINVAL;
523
524         if (usin->sin_family != AF_INET)
525                 return -EAFNOSUPPORT;
526
527         nexthop = daddr = usin->sin_addr.s_addr;
528         if (inet->opt && inet->opt->srr) {
529                 if (!daddr)
530                         return -EINVAL;
531                 nexthop = inet->opt->faddr;
532         }
533
534         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
535                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
536                                IPPROTO_TCP,
537                                inet->sport, usin->sin_port, sk);
538         if (tmp < 0)
539                 return tmp;
540
541         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
542                 ip_rt_put(rt);
543                 return -ENETUNREACH;
544         }
545
546         if (!inet->opt || !inet->opt->srr)
547                 daddr = rt->rt_dst;
548
549         if (!inet->saddr)
550                 inet->saddr = rt->rt_src;
551         inet->rcv_saddr = inet->saddr;
552
553         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
554                 /* Reset inherited state */
555                 tp->rx_opt.ts_recent       = 0;
556                 tp->rx_opt.ts_recent_stamp = 0;
557                 tp->write_seq              = 0;
558         }
559
560         if (sysctl_tcp_tw_recycle &&
561             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
562                 struct inet_peer *peer = rt_get_peer(rt);
563
564                 /* VJ's idea. We save the last timestamp seen from
565                  * the destination in the peer table when entering TIME-WAIT,
566                  * and initialize rx_opt.ts_recent from it when trying a new connection.
567                  */
568
569                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
570                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
571                         tp->rx_opt.ts_recent = peer->tcp_ts;
572                 }
573         }
574
575         inet->dport = usin->sin_port;
576         inet->daddr = daddr;
577
578         tp->ext_header_len = 0;
579         if (inet->opt)
580                 tp->ext_header_len = inet->opt->optlen;
581
582         tp->rx_opt.mss_clamp = 536;
583
584         /* Socket identity is still unknown (sport may be zero).
585          * However, we set the state to SYN-SENT and, without releasing the
586          * socket lock, select a source port, enter ourselves into the hash
587          * tables and complete initialization after this.
588          */
589         tcp_set_state(sk, TCP_SYN_SENT);
590         err = tcp_v4_hash_connect(sk);
591         if (err)
592                 goto failure;
593
594         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
595         if (err)
596                 goto failure;
597
598         /* OK, now commit destination to socket.  */
599         sk_setup_caps(sk, &rt->u.dst);
600
601         if (!tp->write_seq)
602                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
603                                                            inet->daddr,
604                                                            inet->sport,
605                                                            usin->sin_port);
606
607         inet->id = tp->write_seq ^ jiffies;
608
609         err = tcp_connect(sk);
610         rt = NULL;
611         if (err)
612                 goto failure;
613
614         return 0;
615
616 failure:
617         /* This unhashes the socket and releases the local port, if necessary. */
618         tcp_set_state(sk, TCP_CLOSE);
619         ip_rt_put(rt);
620         sk->sk_route_caps = 0;
621         inet->dport = 0;
622         return err;
623 }
624
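/* Interface index the segment arrived on, taken from the route attached
 * to the skb.
 */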
625 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
626 {
627         return ((struct rtable *)skb->dst)->rt_iif;
628 }
629
630 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
631 {
632         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
633 }
634
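/* Walk the listener's SYN queue looking for a pending request_sock that
 * matches the remote port/address and local address of a segment.  On a
 * match, *prevp is set to the link pointer so the caller can unlink it.
 */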
635 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
636                                               struct request_sock ***prevp,
637                                               __u16 rport,
638                                               __u32 raddr, __u32 laddr)
639 {
640         struct listen_sock *lopt = tp->accept_queue.listen_opt;
641         struct request_sock *req, **prev;
642
643         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
644              (req = *prev) != NULL;
645              prev = &req->dl_next) {
646                 const struct inet_request_sock *ireq = inet_rsk(req);
647
648                 if (ireq->rmt_port == rport &&
649                     ireq->rmt_addr == raddr &&
650                     ireq->loc_addr == laddr &&
651                     TCP_INET_FAMILY(req->rsk_ops->family)) {
652                         BUG_TRAP(!req->sk);
653                         *prevp = prev;
654                         break;
655                 }
656         }
657
658         return req;
659 }
660
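/* Queue a new open request in the listener's SYN queue with the initial
 * SYN-ACK retransmission timeout and account for it.
 */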
661 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
662 {
663         struct tcp_sock *tp = tcp_sk(sk);
664         struct listen_sock *lopt = tp->accept_queue.listen_opt;
665         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
666
667         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
668         tcp_synq_added(sk);
669 }
670
671
672 /*
673  * This routine does path mtu discovery as defined in RFC1191.
674  */
675 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
676                                      u32 mtu)
677 {
678         struct dst_entry *dst;
679         struct inet_sock *inet = inet_sk(sk);
680         struct tcp_sock *tp = tcp_sk(sk);
681
682         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
683          * sent out by Linux are always < 576 bytes, so they should go through
684          * unfragmented).
685          */
686         if (sk->sk_state == TCP_LISTEN)
687                 return;
688
689         /* We don't check in the dst entry whether PMTU discovery is forbidden
690          * on this route. We just assume that no packet-too-big packets
691          * are sent back when PMTU discovery is not active.
692          * There is a small race when the user changes this flag in the
693          * route, but I think that's acceptable.
694          */
695         if ((dst = __sk_dst_check(sk, 0)) == NULL)
696                 return;
697
698         dst->ops->update_pmtu(dst, mtu);
699
700         /* Something is about to go wrong... Remember the soft error
701          * in case this connection is not able to recover.
702          */
703         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
704                 sk->sk_err_soft = EMSGSIZE;
705
706         mtu = dst_mtu(dst);
707
708         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
709             tp->pmtu_cookie > mtu) {
710                 tcp_sync_mss(sk, mtu);
711
712                 /* Resend the TCP packet because it's
713                  * clear that the old packet has been
714                  * dropped. This is the new "fast" path mtu
715                  * discovery.
716                  */
717                 tcp_simple_retransmit(sk);
718         } /* else let the usual retransmit timer handle it */
719 }
720
721 /*
722  * This routine is called by the ICMP module when it gets some
723  * sort of error condition.  If err < 0 then the socket should
724  * be closed and the error returned to the user.  If err > 0
725  * it's just the icmp type << 8 | icmp code.  After adjustment
726  * header points to the first 8 bytes of the tcp header.  We need
727  * to find the appropriate port.
728  *
729  * The locking strategy used here is very "optimistic". When
730  * someone else accesses the socket, the ICMP is just dropped,
731  * and for some paths there is no check at all.
732  * A more general error queue to queue errors for later handling
733  * is probably better.
734  *
735  */
736
737 void tcp_v4_err(struct sk_buff *skb, u32 info)
738 {
739         struct iphdr *iph = (struct iphdr *)skb->data;
740         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
741         struct tcp_sock *tp;
742         struct inet_sock *inet;
743         int type = skb->h.icmph->type;
744         int code = skb->h.icmph->code;
745         struct sock *sk;
746         __u32 seq;
747         int err;
748
749         if (skb->len < (iph->ihl << 2) + 8) {
750                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
751                 return;
752         }
753
754         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
755                            th->source, tcp_v4_iif(skb));
756         if (!sk) {
757                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
758                 return;
759         }
760         if (sk->sk_state == TCP_TIME_WAIT) {
761                 inet_twsk_put((struct inet_timewait_sock *)sk);
762                 return;
763         }
764
765         bh_lock_sock(sk);
766         /* If too many ICMPs get dropped on busy
767          * servers this needs to be solved differently.
768          */
769         if (sock_owned_by_user(sk))
770                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
771
772         if (sk->sk_state == TCP_CLOSE)
773                 goto out;
774
775         tp = tcp_sk(sk);
776         seq = ntohl(th->seq);
777         if (sk->sk_state != TCP_LISTEN &&
778             !between(seq, tp->snd_una, tp->snd_nxt)) {
779                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
780                 goto out;
781         }
782
783         switch (type) {
784         case ICMP_SOURCE_QUENCH:
785                 /* Just silently ignore these. */
786                 goto out;
787         case ICMP_PARAMETERPROB:
788                 err = EPROTO;
789                 break;
790         case ICMP_DEST_UNREACH:
791                 if (code > NR_ICMP_UNREACH)
792                         goto out;
793
794                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
795                         if (!sock_owned_by_user(sk))
796                                 do_pmtu_discovery(sk, iph, info);
797                         goto out;
798                 }
799
800                 err = icmp_err_convert[code].errno;
801                 break;
802         case ICMP_TIME_EXCEEDED:
803                 err = EHOSTUNREACH;
804                 break;
805         default:
806                 goto out;
807         }
808
809         switch (sk->sk_state) {
810                 struct request_sock *req, **prev;
811         case TCP_LISTEN:
812                 if (sock_owned_by_user(sk))
813                         goto out;
814
815                 req = tcp_v4_search_req(tp, &prev, th->dest,
816                                         iph->daddr, iph->saddr);
817                 if (!req)
818                         goto out;
819
820                 /* ICMPs are not backlogged, hence we cannot get
821                    an established socket here.
822                  */
823                 BUG_TRAP(!req->sk);
824
825                 if (seq != tcp_rsk(req)->snt_isn) {
826                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
827                         goto out;
828                 }
829
830                 /*
831                  * Still in SYN_RECV, just remove it silently.
832                  * There is no good way to pass the error to the newly
833                  * created socket, and POSIX does not want network
834                  * errors returned from accept().
835                  */
836                 tcp_synq_drop(sk, req, prev);
837                 goto out;
838
839         case TCP_SYN_SENT:
840         case TCP_SYN_RECV:  /* Cannot happen normally.
841                                It can, for example, if SYNs crossed.
842                              */
843                 if (!sock_owned_by_user(sk)) {
844                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
845                         sk->sk_err = err;
846
847                         sk->sk_error_report(sk);
848
849                         tcp_done(sk);
850                 } else {
851                         sk->sk_err_soft = err;
852                 }
853                 goto out;
854         }
855
856         /* If we've already connected we will keep trying
857          * until we time out, or the user gives up.
858          *
859          * RFC 1122 4.2.3.9 allows considering as hard errors
860          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
861          * but it is obsoleted by PMTU discovery).
862          *
863          * Note that in the modern internet, where routing is unreliable
864          * and broken firewalls sit in every dark corner sending random
865          * errors as ordered by their masters, even these two messages have
866          * lost their original sense (even Linux sends invalid PORT_UNREACHs).
867          *
868          * Now we are in compliance with RFCs.
869          *                                                      --ANK (980905)
870          */
871
872         inet = inet_sk(sk);
873         if (!sock_owned_by_user(sk) && inet->recverr) {
874                 sk->sk_err = err;
875                 sk->sk_error_report(sk);
876         } else  { /* Only an error on timeout */
877                 sk->sk_err_soft = err;
878         }
879
880 out:
881         bh_unlock_sock(sk);
882         sock_put(sk);
883 }
884
885 /* This routine computes an IPv4 TCP checksum. */
886 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
887                        struct sk_buff *skb)
888 {
889         struct inet_sock *inet = inet_sk(sk);
890
891         if (skb->ip_summed == CHECKSUM_HW) {
892                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
893                 skb->csum = offsetof(struct tcphdr, check);
894         } else {
895                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
896                                          csum_partial((char *)th,
897                                                       th->doff << 2,
898                                                       skb->csum));
899         }
900 }
901
902 /*
903  *      This routine will send an RST to the other tcp.
904  *
905  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
906  *                    for the reset?
907  *      Answer: if a packet caused an RST, it is not for a socket
908  *              existing in our system; if it is matched to a socket,
909  *              it is just a duplicate segment or a bug in the other side's TCP.
910  *              So we build the reply based only on parameters
911  *              arriving with the segment.
912  *      Exception: precedence violation. We do not implement it in any case.
913  */
914
915 static void tcp_v4_send_reset(struct sk_buff *skb)
916 {
917         struct tcphdr *th = skb->h.th;
918         struct tcphdr rth;
919         struct ip_reply_arg arg;
920
921         /* Never send a reset in response to a reset. */
922         if (th->rst)
923                 return;
924
925         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
926                 return;
927
928         /* Swap the send and the receive. */
929         memset(&rth, 0, sizeof(struct tcphdr));
930         rth.dest   = th->source;
931         rth.source = th->dest;
932         rth.doff   = sizeof(struct tcphdr) / 4;
933         rth.rst    = 1;
934
935         if (th->ack) {
936                 rth.seq = th->ack_seq;
937         } else {
938                 rth.ack = 1;
939                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
940                                     skb->len - (th->doff << 2));
941         }
942
943         memset(&arg, 0, sizeof arg);
944         arg.iov[0].iov_base = (unsigned char *)&rth;
945         arg.iov[0].iov_len  = sizeof rth;
946         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
947                                       skb->nh.iph->saddr, /*XXX*/
948                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
949         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
950
951         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
952
953         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
954         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
955 }
956
957 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
958    outside of socket context, is certainly ugly. What can I do?
959  */
960
961 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
962                             u32 win, u32 ts)
963 {
964         struct tcphdr *th = skb->h.th;
965         struct {
966                 struct tcphdr th;
967                 u32 tsopt[3];
968         } rep;
969         struct ip_reply_arg arg;
970
971         memset(&rep.th, 0, sizeof(struct tcphdr));
972         memset(&arg, 0, sizeof arg);
973
974         arg.iov[0].iov_base = (unsigned char *)&rep;
975         arg.iov[0].iov_len  = sizeof(rep.th);
976         if (ts) {
977                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
978                                      (TCPOPT_TIMESTAMP << 8) |
979                                      TCPOLEN_TIMESTAMP);
980                 rep.tsopt[1] = htonl(tcp_time_stamp);
981                 rep.tsopt[2] = htonl(ts);
982                 arg.iov[0].iov_len = sizeof(rep);
983         }
984
985         /* Swap the send and the receive. */
986         rep.th.dest    = th->source;
987         rep.th.source  = th->dest;
988         rep.th.doff    = arg.iov[0].iov_len / 4;
989         rep.th.seq     = htonl(seq);
990         rep.th.ack_seq = htonl(ack);
991         rep.th.ack     = 1;
992         rep.th.window  = htons(win);
993
994         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
995                                       skb->nh.iph->saddr, /*XXX*/
996                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
997         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
998
999         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1000
1001         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1002 }
1003
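/* Reply to a segment that matched a TIME-WAIT socket: send an ACK built
 * from the state cached in the timewait bucket (next sequence numbers,
 * scaled window, timestamp), then drop the reference taken by the lookup.
 */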
1004 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1005 {
1006         struct inet_timewait_sock *tw = inet_twsk(sk);
1007         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1008
1009         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1010                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
1011
1012         inet_twsk_put(tw);
1013 }
1014
1015 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1016 {
1017         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1018                         req->ts_recent);
1019 }
1020
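/* Look up an output route for replying to a connection request, honouring
 * any source-route option saved from the SYN.
 */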
1021 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1022                                           struct request_sock *req)
1023 {
1024         struct rtable *rt;
1025         const struct inet_request_sock *ireq = inet_rsk(req);
1026         struct ip_options *opt = inet_rsk(req)->opt;
1027         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1028                             .nl_u = { .ip4_u =
1029                                       { .daddr = ((opt && opt->srr) ?
1030                                                   opt->faddr :
1031                                                   ireq->rmt_addr),
1032                                         .saddr = ireq->loc_addr,
1033                                         .tos = RT_CONN_FLAGS(sk) } },
1034                             .proto = IPPROTO_TCP,
1035                             .uli_u = { .ports =
1036                                        { .sport = inet_sk(sk)->sport,
1037                                          .dport = ireq->rmt_port } } };
1038
1039         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1040                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1041                 return NULL;
1042         }
1043         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1044                 ip_rt_put(rt);
1045                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1046                 return NULL;
1047         }
1048         return &rt->u.dst;
1049 }
1050
1051 /*
1052  *      Send a SYN-ACK after having received a SYN.
1053  *      This still operates on a request_sock only, not on a big
1054  *      socket.
1055  */
1056 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1057                               struct dst_entry *dst)
1058 {
1059         const struct inet_request_sock *ireq = inet_rsk(req);
1060         int err = -1;
1061         struct sk_buff * skb;
1062
1063         /* First, grab a route. */
1064         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1065                 goto out;
1066
1067         skb = tcp_make_synack(sk, dst, req);
1068
1069         if (skb) {
1070                 struct tcphdr *th = skb->h.th;
1071
1072                 th->check = tcp_v4_check(th, skb->len,
1073                                          ireq->loc_addr,
1074                                          ireq->rmt_addr,
1075                                          csum_partial((char *)th, skb->len,
1076                                                       skb->csum));
1077
1078                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1079                                             ireq->rmt_addr,
1080                                             ireq->opt);
1081                 if (err == NET_XMIT_CN)
1082                         err = 0;
1083         }
1084
1085 out:
1086         dst_release(dst);
1087         return err;
1088 }
1089
1090 /*
1091  *      IPv4 request_sock destructor.
1092  */
1093 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1094 {
1095         if (inet_rsk(req)->opt)
1096                 kfree(inet_rsk(req)->opt);
1097 }
1098
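/* Rate-limited (at most once a minute) warning that SYN cookies are being
 * sent for a possibly flooded port.
 */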
1099 static inline void syn_flood_warning(struct sk_buff *skb)
1100 {
1101         static unsigned long warntime;
1102
1103         if (time_after(jiffies, (warntime + HZ * 60))) {
1104                 warntime = jiffies;
1105                 printk(KERN_INFO
1106                        "possible SYN flooding on port %d. Sending cookies.\n",
1107                        ntohs(skb->h.th->dest));
1108         }
1109 }
1110
1111 /*
1112  * Save and compile IPv4 options into the request_sock if needed.
1113  */
1114 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1115                                                      struct sk_buff *skb)
1116 {
1117         struct ip_options *opt = &(IPCB(skb)->opt);
1118         struct ip_options *dopt = NULL;
1119
1120         if (opt && opt->optlen) {
1121                 int opt_size = optlength(opt);
1122                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1123                 if (dopt) {
1124                         if (ip_options_echo(dopt, skb)) {
1125                                 kfree(dopt);
1126                                 dopt = NULL;
1127                         }
1128                 }
1129         }
1130         return dopt;
1131 }
1132
1133 struct request_sock_ops tcp_request_sock_ops = {
1134         .family         =       PF_INET,
1135         .obj_size       =       sizeof(struct tcp_request_sock),
1136         .rtx_syn_ack    =       tcp_v4_send_synack,
1137         .send_ack       =       tcp_v4_reqsk_send_ack,
1138         .destructor     =       tcp_v4_reqsk_destructor,
1139         .send_reset     =       tcp_v4_send_reset,
1140 };
1141
1142 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1143 {
1144         struct inet_request_sock *ireq;
1145         struct tcp_options_received tmp_opt;
1146         struct request_sock *req;
1147         __u32 saddr = skb->nh.iph->saddr;
1148         __u32 daddr = skb->nh.iph->daddr;
1149         __u32 isn = TCP_SKB_CB(skb)->when;
1150         struct dst_entry *dst = NULL;
1151 #ifdef CONFIG_SYN_COOKIES
1152         int want_cookie = 0;
1153 #else
1154 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1155 #endif
1156
1157         /* Never answer SYNs sent to broadcast or multicast addresses */
1158         if (((struct rtable *)skb->dst)->rt_flags &
1159             (RTCF_BROADCAST | RTCF_MULTICAST))
1160                 goto drop;
1161
1162         /* TW buckets are converted to open requests without
1163          * limitation; they conserve resources and the peer is
1164          * evidently a real one.
1165          */
1166         if (tcp_synq_is_full(sk) && !isn) {
1167 #ifdef CONFIG_SYN_COOKIES
1168                 if (sysctl_tcp_syncookies) {
1169                         want_cookie = 1;
1170                 } else
1171 #endif
1172                 goto drop;
1173         }
1174
1175         /* The accept backlog is full. If we have already queued enough
1176          * warm entries in the SYN queue, drop the request. This is better
1177          * than clogging the SYN queue with openreqs whose timeouts increase
1178          * exponentially.
1179          */
1180         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1181                 goto drop;
1182
1183         req = reqsk_alloc(&tcp_request_sock_ops);
1184         if (!req)
1185                 goto drop;
1186
1187         tcp_clear_options(&tmp_opt);
1188         tmp_opt.mss_clamp = 536;
1189         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1190
1191         tcp_parse_options(skb, &tmp_opt, 0);
1192
1193         if (want_cookie) {
1194                 tcp_clear_options(&tmp_opt);
1195                 tmp_opt.saw_tstamp = 0;
1196         }
1197
1198         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1199                 /* Some OSes (unknown ones, but I see them on a web server
1200                  * that contains information interesting only to Windows
1201                  * users) do not send their timestamp in the SYN. It is an
1202                  * easy case: we simply do not advertise TS support.
1203                  */
1204                 tmp_opt.saw_tstamp = 0;
1205                 tmp_opt.tstamp_ok  = 0;
1206         }
1207         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1208
1209         tcp_openreq_init(req, &tmp_opt, skb);
1210
1211         ireq = inet_rsk(req);
1212         ireq->loc_addr = daddr;
1213         ireq->rmt_addr = saddr;
1214         ireq->opt = tcp_v4_save_options(sk, skb);
1215         if (!want_cookie)
1216                 TCP_ECN_create_request(req, skb->h.th);
1217
1218         if (want_cookie) {
1219 #ifdef CONFIG_SYN_COOKIES
1220                 syn_flood_warning(skb);
1221 #endif
1222                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1223         } else if (!isn) {
1224                 struct inet_peer *peer = NULL;
1225
1226                 /* VJ's idea. We save the last timestamp seen
1227                  * from the destination in the peer table when entering
1228                  * TIME-WAIT, and check against it before
1229                  * accepting a new connection request.
1230                  *
1231                  * If "isn" is not zero, this request hit an alive
1232                  * timewait bucket, so all the necessary checks
1233                  * have already been made while processing the timewait state.
1234                  */
1235                 if (tmp_opt.saw_tstamp &&
1236                     sysctl_tcp_tw_recycle &&
1237                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1238                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1239                     peer->v4daddr == saddr) {
1240                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1241                             (s32)(peer->tcp_ts - req->ts_recent) >
1242                                                         TCP_PAWS_WINDOW) {
1243                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1244                                 dst_release(dst);
1245                                 goto drop_and_free;
1246                         }
1247                 }
1248                 /* Kill the following clause, if you dislike this way. */
1249                 else if (!sysctl_tcp_syncookies &&
1250                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1251                           (sysctl_max_syn_backlog >> 2)) &&
1252                          (!peer || !peer->tcp_ts_stamp) &&
1253                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1254                         /* Without syncookies, the last quarter of the
1255                          * backlog is filled only with destinations
1256                          * proven to be alive.
1257                          * It means that we continue to communicate
1258                          * with destinations already remembered at
1259                          * the moment of the synflood.
1260                          */
1261                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1262                                               "request from %u.%u."
1263                                               "%u.%u/%u\n",
1264                                               NIPQUAD(saddr),
1265                                               ntohs(skb->h.th->source)));
1266                         dst_release(dst);
1267                         goto drop_and_free;
1268                 }
1269
1270                 isn = tcp_v4_init_sequence(sk, skb);
1271         }
1272         tcp_rsk(req)->snt_isn = isn;
1273
1274         if (tcp_v4_send_synack(sk, req, dst))
1275                 goto drop_and_free;
1276
1277         if (want_cookie) {
1278                 reqsk_free(req);
1279         } else {
1280                 tcp_v4_synq_add(sk, req);
1281         }
1282         return 0;
1283
1284 drop_and_free:
1285         reqsk_free(req);
1286 drop:
1287         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1288         return 0;
1289 }
1290
1291
1292 /*
1293  * The three way handshake has completed - we got a valid ACK -
1294  * now create the new socket.
1295  */
1296 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1297                                   struct request_sock *req,
1298                                   struct dst_entry *dst)
1299 {
1300         struct inet_request_sock *ireq;
1301         struct inet_sock *newinet;
1302         struct tcp_sock *newtp;
1303         struct sock *newsk;
1304
1305         if (sk_acceptq_is_full(sk))
1306                 goto exit_overflow;
1307
1308         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1309                 goto exit;
1310
1311         newsk = tcp_create_openreq_child(sk, req, skb);
1312         if (!newsk)
1313                 goto exit;
1314
1315         sk_setup_caps(newsk, dst);
1316
1317         newtp                 = tcp_sk(newsk);
1318         newinet               = inet_sk(newsk);
1319         ireq                  = inet_rsk(req);
1320         newinet->daddr        = ireq->rmt_addr;
1321         newinet->rcv_saddr    = ireq->loc_addr;
1322         newinet->saddr        = ireq->loc_addr;
1323         newinet->opt          = ireq->opt;
1324         ireq->opt             = NULL;
1325         newinet->mc_index     = tcp_v4_iif(skb);
1326         newinet->mc_ttl       = skb->nh.iph->ttl;
1327         newtp->ext_header_len = 0;
1328         if (newinet->opt)
1329                 newtp->ext_header_len = newinet->opt->optlen;
1330         newinet->id = newtp->write_seq ^ jiffies;
1331
1332         tcp_sync_mss(newsk, dst_mtu(dst));
1333         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1334         tcp_initialize_rcv_mss(newsk);
1335
1336         __inet_hash(&tcp_hashinfo, newsk, 0);
1337         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1338
1339         return newsk;
1340
1341 exit_overflow:
1342         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1343 exit:
1344         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1345         dst_release(dst);
1346         return NULL;
1347 }
1348
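/* Handle a segment received on a listening socket: first look for a
 * matching open request in the SYN queue, then for an already established
 * (or TIME-WAIT) socket in the ehash, and finally fall back to SYN-cookie
 * validation of a bare ACK.
 */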
1349 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1350 {
1351         struct tcphdr *th = skb->h.th;
1352         struct iphdr *iph = skb->nh.iph;
1353         struct tcp_sock *tp = tcp_sk(sk);
1354         struct sock *nsk;
1355         struct request_sock **prev;
1356         /* Find possible connection requests. */
1357         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1358                                                      iph->saddr, iph->daddr);
1359         if (req)
1360                 return tcp_check_req(sk, skb, req, prev);
1361
1362         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1363                                           th->source,
1364                                           skb->nh.iph->daddr,
1365                                           ntohs(th->dest),
1366                                           tcp_v4_iif(skb));
1367
1368         if (nsk) {
1369                 if (nsk->sk_state != TCP_TIME_WAIT) {
1370                         bh_lock_sock(nsk);
1371                         return nsk;
1372                 }
1373                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1374                 return NULL;
1375         }
1376
1377 #ifdef CONFIG_SYN_COOKIES
1378         if (!th->rst && !th->syn && th->ack)
1379                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1380 #endif
1381         return sk;
1382 }
1383
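/* Prepare checksum state for a received segment: accept a verified
 * hardware checksum, verify short packets in full right away, and for
 * longer packets store the pseudo-header sum so the full check can be
 * completed later (e.g. during the copy to user space).
 */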
1384 static int tcp_v4_checksum_init(struct sk_buff *skb)
1385 {
1386         if (skb->ip_summed == CHECKSUM_HW) {
1387                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1388                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1389                                   skb->nh.iph->daddr, skb->csum))
1390                         return 0;
1391
1392                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1393                 skb->ip_summed = CHECKSUM_NONE;
1394         }
1395         if (skb->len <= 76) {
1396                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1397                                  skb->nh.iph->daddr,
1398                                  skb_checksum(skb, 0, skb->len, 0)))
1399                         return -1;
1400                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1401         } else {
1402                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1403                                           skb->nh.iph->saddr,
1404                                           skb->nh.iph->daddr, 0);
1405         }
1406         return 0;
1407 }
1408
1409
1410 /* The socket must have its spinlock held when we get
1411  * here.
1412  *
1413  * We have a potential double-lock case here, so even when
1414  * doing backlog processing we use the BH locking scheme.
1415  * This is because we cannot sleep with the original spinlock
1416  * held.
1417  */
1418 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1419 {
1420         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1421                 TCP_CHECK_TIMER(sk);
1422                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1423                         goto reset;
1424                 TCP_CHECK_TIMER(sk);
1425                 return 0;
1426         }
1427
1428         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1429                 goto csum_err;
1430
1431         if (sk->sk_state == TCP_LISTEN) {
1432                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1433                 if (!nsk)
1434                         goto discard;
1435
1436                 if (nsk != sk) {
1437                         if (tcp_child_process(sk, nsk, skb))
1438                                 goto reset;
1439                         return 0;
1440                 }
1441         }
1442
1443         TCP_CHECK_TIMER(sk);
1444         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1445                 goto reset;
1446         TCP_CHECK_TIMER(sk);
1447         return 0;
1448
1449 reset:
1450         tcp_v4_send_reset(skb);
1451 discard:
1452         kfree_skb(skb);
1453         /* Be careful here. If this function gets more complicated and
1454          * gcc suffers from register pressure on the x86, sk (in %ebx)
1455          * might be destroyed here. This current version compiles correctly,
1456          * but you have been warned.
1457          */
1458         return 0;
1459
1460 csum_err:
1461         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1462         goto discard;
1463 }
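/*
 * The process-context half of the scheme described above, sketched for
 * illustration only (not part of this file): user-context TCP code brackets
 * its work with lock_sock()/release_sock().  While the socket is owned by
 * user context, tcp_v4_rcv() below parks incoming segments with
 * sk_add_backlog(); release_sock() then replays them through the protocol's
 * backlog_rcv hook, which for TCP/IPv4 is tcp_v4_do_rcv (see tcp_prot).
 */
#if 0	/* illustrative sketch, never compiled */
static void tcp_user_context_sketch(struct sock *sk)
{
	lock_sock(sk);		/* softirq receive now diverts to the backlog */
	/* ... copy data, update socket state, etc. ... */
	release_sock(sk);	/* replays backlogged skbs via tcp_v4_do_rcv */
}
#endif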
1464
1465 /*
1466  *      From tcp_input.c
1467  */
1468
1469 int tcp_v4_rcv(struct sk_buff *skb)
1470 {
1471         struct tcphdr *th;
1472         struct sock *sk;
1473         int ret;
1474
1475         if (skb->pkt_type != PACKET_HOST)
1476                 goto discard_it;
1477
1478         /* Count it even if it's bad */
1479         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1480
1481         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1482                 goto discard_it;
1483
1484         th = skb->h.th;
1485
1486         if (th->doff < sizeof(struct tcphdr) / 4)
1487                 goto bad_packet;
1488         if (!pskb_may_pull(skb, th->doff * 4))
1489                 goto discard_it;
1490
1491         /* An explanation is required here, I think.
1492          * Packet length and doff are validated by header prediction,
1493          * provided the case of th->doff == 0 is eliminated.
1494          * So, we defer the checks. */
1495         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1496              tcp_v4_checksum_init(skb) < 0))
1497                 goto bad_packet;
1498
1499         th = skb->h.th;
1500         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1501         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1502                                     skb->len - th->doff * 4);
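        /*
         * Readers' note: SYN and FIN each occupy one unit of sequence space,
         * so a bare SYN with no payload gets end_seq == seq + 1, while a pure
         * ACK carrying no data gets end_seq == seq.
         */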
1503         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1504         TCP_SKB_CB(skb)->when    = 0;
1505         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1506         TCP_SKB_CB(skb)->sacked  = 0;
1507
1508         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1509                              skb->nh.iph->daddr, ntohs(th->dest),
1510                              tcp_v4_iif(skb));
1511
1512         if (!sk)
1513                 goto no_tcp_socket;
1514
1515 process:
1516         if (sk->sk_state == TCP_TIME_WAIT)
1517                 goto do_time_wait;
1518
1519         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1520                 goto discard_and_relse;
1521
1522         if (sk_filter(sk, skb, 0))
1523                 goto discard_and_relse;
1524
1525         skb->dev = NULL;
1526
1527         bh_lock_sock(sk);
1528         ret = 0;
1529         if (!sock_owned_by_user(sk)) {
1530                 if (!tcp_prequeue(sk, skb))
1531                         ret = tcp_v4_do_rcv(sk, skb);
1532         } else
1533                 sk_add_backlog(sk, skb);
1534         bh_unlock_sock(sk);
1535
1536         sock_put(sk);
1537
1538         return ret;
1539
1540 no_tcp_socket:
1541         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1542                 goto discard_it;
1543
1544         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1545 bad_packet:
1546                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1547         } else {
1548                 tcp_v4_send_reset(skb);
1549         }
1550
1551 discard_it:
1552         /* Discard frame. */
1553         kfree_skb(skb);
1554         return 0;
1555
1556 discard_and_relse:
1557         sock_put(sk);
1558         goto discard_it;
1559
1560 do_time_wait:
1561         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1562                 inet_twsk_put((struct inet_timewait_sock *) sk);
1563                 goto discard_it;
1564         }
1565
1566         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1567                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1568                 inet_twsk_put((struct inet_timewait_sock *) sk);
1569                 goto discard_it;
1570         }
1571         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1572                                            skb, th)) {
1573         case TCP_TW_SYN: {
1574                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1575                                                         skb->nh.iph->daddr,
1576                                                         ntohs(th->dest),
1577                                                         tcp_v4_iif(skb));
1578                 if (sk2) {
1579                         tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1580                         inet_twsk_put((struct inet_timewait_sock *)sk);
1581                         sk = sk2;
1582                         goto process;
1583                 }
1584                 /* Fall through to ACK */
1585         }
1586         case TCP_TW_ACK:
1587                 tcp_v4_timewait_ack(sk, skb);
1588                 break;
1589         case TCP_TW_RST:
1590                 goto no_tcp_socket;
1591         case TCP_TW_SUCCESS:;
1592         }
1593         goto discard_it;
1594 }
1595
1596 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1597 {
1598         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1599         struct inet_sock *inet = inet_sk(sk);
1600
1601         sin->sin_family         = AF_INET;
1602         sin->sin_addr.s_addr    = inet->daddr;
1603         sin->sin_port           = inet->dport;
1604 }
1605
1606 /* VJ's idea. Save the last timestamp seen from this destination
1607  * and hold it at least for the normal timewait interval, to use for duplicate
1608  * segment detection in subsequent connections before they enter the synchronized
1609  * state.
1610  */
1611
1612 int tcp_v4_remember_stamp(struct sock *sk)
1613 {
1614         struct inet_sock *inet = inet_sk(sk);
1615         struct tcp_sock *tp = tcp_sk(sk);
1616         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1617         struct inet_peer *peer = NULL;
1618         int release_it = 0;
1619
1620         if (!rt || rt->rt_dst != inet->daddr) {
1621                 peer = inet_getpeer(inet->daddr, 1);
1622                 release_it = 1;
1623         } else {
1624                 if (!rt->peer)
1625                         rt_bind_peer(rt, 1);
1626                 peer = rt->peer;
1627         }
1628
1629         if (peer) {
1630                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1631                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1632                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1633                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1634                         peer->tcp_ts = tp->rx_opt.ts_recent;
1635                 }
1636                 if (release_it)
1637                         inet_putpeer(peer);
1638                 return 1;
1639         }
1640
1641         return 0;
1642 }
1643
1644 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1645 {
1646         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1647
1648         if (peer) {
1649                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1650
1651                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1652                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1653                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1654                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1655                         peer->tcp_ts       = tcptw->tw_ts_recent;
1656                 }
1657                 inet_putpeer(peer);
1658                 return 1;
1659         }
1660
1661         return 0;
1662 }
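/*
 * Both helpers above compare cached and current timestamps with the usual
 * serial-number arithmetic: casting the 32-bit difference to s32 yields a
 * signed before/after answer that stays correct across wraparound.  A
 * stand-alone sketch (illustrative only, not a kernel symbol):
 */
#if 0	/* illustrative sketch, never compiled */
#include <stdint.h>

/* Non-zero if timestamp a is not newer than timestamp b, modulo 2^32. */
static int ts_before_eq(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) <= 0;
}

/* e.g. ts_before_eq(0xfffffff0u, 0x10u) is true: 0x10 lies "after" the wrap. */
#endif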
1663
1664 struct tcp_func ipv4_specific = {
1665         .queue_xmit     =       ip_queue_xmit,
1666         .send_check     =       tcp_v4_send_check,
1667         .rebuild_header =       inet_sk_rebuild_header,
1668         .conn_request   =       tcp_v4_conn_request,
1669         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1670         .remember_stamp =       tcp_v4_remember_stamp,
1671         .net_header_len =       sizeof(struct iphdr),
1672         .setsockopt     =       ip_setsockopt,
1673         .getsockopt     =       ip_getsockopt,
1674         .addr2sockaddr  =       v4_addr2sockaddr,
1675         .sockaddr_len   =       sizeof(struct sockaddr_in),
1676 };
1677
1678 /* NOTE: A lot of things are set to zero explicitly by the call to
1679  *       sk_alloc(), so they need not be done here.
1680  */
1681 static int tcp_v4_init_sock(struct sock *sk)
1682 {
1683         struct tcp_sock *tp = tcp_sk(sk);
1684
1685         skb_queue_head_init(&tp->out_of_order_queue);
1686         tcp_init_xmit_timers(sk);
1687         tcp_prequeue_init(tp);
1688
1689         tp->rto  = TCP_TIMEOUT_INIT;
1690         tp->mdev = TCP_TIMEOUT_INIT;
1691
1692         /* So many TCP implementations out there (incorrectly) count the
1693          * initial SYN frame in their delayed-ACK and congestion control
1694          * algorithms that we must have the following bandaid to talk
1695          * efficiently to them.  -DaveM
1696          */
1697         tp->snd_cwnd = 2;
1698
1699         /* See draft-stevens-tcpca-spec-01 for discussion of the
1700          * initialization of these values.
1701          */
1702         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1703         tp->snd_cwnd_clamp = ~0;
1704         tp->mss_cache = 536;
1705
1706         tp->reordering = sysctl_tcp_reordering;
1707         tp->ca_ops = &tcp_init_congestion_ops;
1708
1709         sk->sk_state = TCP_CLOSE;
1710
1711         sk->sk_write_space = sk_stream_write_space;
1712         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1713
1714         tp->af_specific = &ipv4_specific;
1715
1716         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1717         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1718
1719         atomic_inc(&tcp_sockets_allocated);
1720
1721         return 0;
1722 }
1723
1724 int tcp_v4_destroy_sock(struct sock *sk)
1725 {
1726         struct tcp_sock *tp = tcp_sk(sk);
1727
1728         tcp_clear_xmit_timers(sk);
1729
1730         tcp_cleanup_congestion_control(tp);
1731
1732         /* Clean up the write buffer. */
1733         sk_stream_writequeue_purge(sk);
1734
1735         /* Cleans up our, hopefully empty, out_of_order_queue. */
1736         __skb_queue_purge(&tp->out_of_order_queue);
1737
1738         /* Clean up the prequeue; it should already be empty. */
1739         __skb_queue_purge(&tp->ucopy.prequeue);
1740
1741         /* Clean up a referenced TCP bind bucket. */
1742         if (inet_sk(sk)->bind_hash)
1743                 inet_put_port(&tcp_hashinfo, sk);
1744
1745         /*
1746          * If sendmsg cached page exists, toss it.
1747          */
1748         if (sk->sk_sndmsg_page) {
1749                 __free_page(sk->sk_sndmsg_page);
1750                 sk->sk_sndmsg_page = NULL;
1751         }
1752
1753         atomic_dec(&tcp_sockets_allocated);
1754
1755         return 0;
1756 }
1757
1758 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1759
1760 #ifdef CONFIG_PROC_FS
1761 /* Proc filesystem TCP sock list dumping. */
1762
1763 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1764 {
1765         return hlist_empty(head) ? NULL :
1766                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1767 }
1768
1769 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1770 {
1771         return tw->tw_node.next ?
1772                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1773 }
1774
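/*
 * listening_get_next() walks the listening hash in bucket order.  After a
 * listening socket has been reported, the request_socks parked in that
 * socket's SYN table are reported next (under syn_wait_lock), each filtered
 * against st->family, before the walk advances along the chain and on to
 * the next bucket.
 */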
1775 static void *listening_get_next(struct seq_file *seq, void *cur)
1776 {
1777         struct tcp_sock *tp;
1778         struct hlist_node *node;
1779         struct sock *sk = cur;
1780         struct tcp_iter_state* st = seq->private;
1781
1782         if (!sk) {
1783                 st->bucket = 0;
1784                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1785                 goto get_sk;
1786         }
1787
1788         ++st->num;
1789
1790         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1791                 struct request_sock *req = cur;
1792
1793                 tp = tcp_sk(st->syn_wait_sk);
1794                 req = req->dl_next;
1795                 while (1) {
1796                         while (req) {
1797                                 if (req->rsk_ops->family == st->family) {
1798                                         cur = req;
1799                                         goto out;
1800                                 }
1801                                 req = req->dl_next;
1802                         }
1803                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1804                                 break;
1805 get_req:
1806                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1807                 }
1808                 sk        = sk_next(st->syn_wait_sk);
1809                 st->state = TCP_SEQ_STATE_LISTENING;
1810                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1811         } else {
1812                 tp = tcp_sk(sk);
1813                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1814                 if (reqsk_queue_len(&tp->accept_queue))
1815                         goto start_req;
1816                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1817                 sk = sk_next(sk);
1818         }
1819 get_sk:
1820         sk_for_each_from(sk, node) {
1821                 if (sk->sk_family == st->family) {
1822                         cur = sk;
1823                         goto out;
1824                 }
1825                 tp = tcp_sk(sk);
1826                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1827                 if (reqsk_queue_len(&tp->accept_queue)) {
1828 start_req:
1829                         st->uid         = sock_i_uid(sk);
1830                         st->syn_wait_sk = sk;
1831                         st->state       = TCP_SEQ_STATE_OPENREQ;
1832                         st->sbucket     = 0;
1833                         goto get_req;
1834                 }
1835                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1836         }
1837         if (++st->bucket < INET_LHTABLE_SIZE) {
1838                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1839                 goto get_sk;
1840         }
1841         cur = NULL;
1842 out:
1843         return cur;
1844 }
1845
1846 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1847 {
1848         void *rc = listening_get_next(seq, NULL);
1849
1850         while (rc && *pos) {
1851                 rc = listening_get_next(seq, rc);
1852                 --*pos;
1853         }
1854         return rc;
1855 }
1856
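/*
 * The established walk also has to visit TIME_WAIT sockets: ehash bucket N
 * holds established sockets, while the corresponding TIME_WAIT chain lives
 * at bucket N + ehash_size.  That is why the iterators below switch to
 * ehash[st->bucket + tcp_hashinfo.ehash_size] once st->state becomes
 * TCP_SEQ_STATE_TIME_WAIT.
 */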
1857 static void *established_get_first(struct seq_file *seq)
1858 {
1859         struct tcp_iter_state* st = seq->private;
1860         void *rc = NULL;
1861
1862         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1863                 struct sock *sk;
1864                 struct hlist_node *node;
1865                 struct inet_timewait_sock *tw;
1866
1867                 /* We can reschedule _before_ having picked the target: */
1868                 cond_resched_softirq();
1869
1870                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1871                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1872                         if (sk->sk_family != st->family) {
1873                                 continue;
1874                         }
1875                         rc = sk;
1876                         goto out;
1877                 }
1878                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1879                 inet_twsk_for_each(tw, node,
1880                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1881                         if (tw->tw_family != st->family) {
1882                                 continue;
1883                         }
1884                         rc = tw;
1885                         goto out;
1886                 }
1887                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1888                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1889         }
1890 out:
1891         return rc;
1892 }
1893
1894 static void *established_get_next(struct seq_file *seq, void *cur)
1895 {
1896         struct sock *sk = cur;
1897         struct inet_timewait_sock *tw;
1898         struct hlist_node *node;
1899         struct tcp_iter_state* st = seq->private;
1900
1901         ++st->num;
1902
1903         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1904                 tw = cur;
1905                 tw = tw_next(tw);
1906 get_tw:
1907                 while (tw && tw->tw_family != st->family) {
1908                         tw = tw_next(tw);
1909                 }
1910                 if (tw) {
1911                         cur = tw;
1912                         goto out;
1913                 }
1914                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1915                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1916
1917                 /* We can reschedule between buckets: */
1918                 cond_resched_softirq();
1919
1920                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1921                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1922                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1923                 } else {
1924                         cur = NULL;
1925                         goto out;
1926                 }
1927         } else
1928                 sk = sk_next(sk);
1929
1930         sk_for_each_from(sk, node) {
1931                 if (sk->sk_family == st->family)
1932                         goto found;
1933         }
1934
1935         st->state = TCP_SEQ_STATE_TIME_WAIT;
1936         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1937         goto get_tw;
1938 found:
1939         cur = sk;
1940 out:
1941         return cur;
1942 }
1943
1944 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1945 {
1946         void *rc = established_get_first(seq);
1947
1948         while (rc && pos) {
1949                 rc = established_get_next(seq, rc);
1950                 --pos;
1951         }               
1952         return rc;
1953 }
1954
1955 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1956 {
1957         void *rc;
1958         struct tcp_iter_state* st = seq->private;
1959
1960         inet_listen_lock(&tcp_hashinfo);
1961         st->state = TCP_SEQ_STATE_LISTENING;
1962         rc        = listening_get_idx(seq, &pos);
1963
1964         if (!rc) {
1965                 inet_listen_unlock(&tcp_hashinfo);
1966                 local_bh_disable();
1967                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1968                 rc        = established_get_idx(seq, pos);
1969         }
1970
1971         return rc;
1972 }
1973
1974 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1975 {
1976         struct tcp_iter_state* st = seq->private;
1977         st->state = TCP_SEQ_STATE_LISTENING;
1978         st->num = 0;
1979         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1980 }
1981
1982 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1983 {
1984         void *rc = NULL;
1985         struct tcp_iter_state* st;
1986
1987         if (v == SEQ_START_TOKEN) {
1988                 rc = tcp_get_idx(seq, 0);
1989                 goto out;
1990         }
1991         st = seq->private;
1992
1993         switch (st->state) {
1994         case TCP_SEQ_STATE_OPENREQ:
1995         case TCP_SEQ_STATE_LISTENING:
1996                 rc = listening_get_next(seq, v);
1997                 if (!rc) {
1998                         inet_listen_unlock(&tcp_hashinfo);
1999                         local_bh_disable();
2000                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2001                         rc        = established_get_first(seq);
2002                 }
2003                 break;
2004         case TCP_SEQ_STATE_ESTABLISHED:
2005         case TCP_SEQ_STATE_TIME_WAIT:
2006                 rc = established_get_next(seq, v);
2007                 break;
2008         }
2009 out:
2010         ++*pos;
2011         return rc;
2012 }
2013
2014 static void tcp_seq_stop(struct seq_file *seq, void *v)
2015 {
2016         struct tcp_iter_state* st = seq->private;
2017
2018         switch (st->state) {
2019         case TCP_SEQ_STATE_OPENREQ:
2020                 if (v) {
2021                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2022                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2023                 }
2024         case TCP_SEQ_STATE_LISTENING:
2025                 if (v != SEQ_START_TOKEN)
2026                         inet_listen_unlock(&tcp_hashinfo);
2027                 break;
2028         case TCP_SEQ_STATE_TIME_WAIT:
2029         case TCP_SEQ_STATE_ESTABLISHED:
2030                 if (v)
2031                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2032                 local_bh_enable();
2033                 break;
2034         }
2035 }
2036
2037 static int tcp_seq_open(struct inode *inode, struct file *file)
2038 {
2039         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2040         struct seq_file *seq;
2041         struct tcp_iter_state *s;
2042         int rc;
2043
2044         if (unlikely(afinfo == NULL))
2045                 return -EINVAL;
2046
2047         s = kmalloc(sizeof(*s), GFP_KERNEL);
2048         if (!s)
2049                 return -ENOMEM;
2050         memset(s, 0, sizeof(*s));
2051         s->family               = afinfo->family;
2052         s->seq_ops.start        = tcp_seq_start;
2053         s->seq_ops.next         = tcp_seq_next;
2054         s->seq_ops.show         = afinfo->seq_show;
2055         s->seq_ops.stop         = tcp_seq_stop;
2056
2057         rc = seq_open(file, &s->seq_ops);
2058         if (rc)
2059                 goto out_kfree;
2060         seq          = file->private_data;
2061         seq->private = s;
2062 out:
2063         return rc;
2064 out_kfree:
2065         kfree(s);
2066         goto out;
2067 }
2068
2069 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2070 {
2071         int rc = 0;
2072         struct proc_dir_entry *p;
2073
2074         if (!afinfo)
2075                 return -EINVAL;
2076         afinfo->seq_fops->owner         = afinfo->owner;
2077         afinfo->seq_fops->open          = tcp_seq_open;
2078         afinfo->seq_fops->read          = seq_read;
2079         afinfo->seq_fops->llseek        = seq_lseek;
2080         afinfo->seq_fops->release       = seq_release_private;
2081         
2082         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2083         if (p)
2084                 p->data = afinfo;
2085         else
2086                 rc = -ENOMEM;
2087         return rc;
2088 }
2089
2090 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2091 {
2092         if (!afinfo)
2093                 return;
2094         proc_net_remove(afinfo->name);
2095         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2096 }
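/*
 * Typical usage of the two registration helpers above, sketched for a
 * hypothetical address family "tcpX" (the real IPv4 instance,
 * tcp4_seq_afinfo, appears further down; tcp_ipv6.c registers its own
 * afinfo in the same way).  All tcpX_* names are illustrative only.
 */
#if 0	/* illustrative sketch, never compiled */
static int tcpX_seq_show(struct seq_file *seq, void *v);	/* hypothetical */
static struct file_operations tcpX_seq_fops;

static struct tcp_seq_afinfo tcpX_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcpX",		/* shows up as /proc/net/tcpX */
	.family		= AF_INET,
	.seq_show	= tcpX_seq_show,
	.seq_fops	= &tcpX_seq_fops,
};

static int __init tcpX_proc_init(void)
{
	return tcp_proc_register(&tcpX_seq_afinfo);
}

static void tcpX_proc_exit(void)
{
	tcp_proc_unregister(&tcpX_seq_afinfo);
}
#endif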
2097
2098 static void get_openreq4(struct sock *sk, struct request_sock *req,
2099                          char *tmpbuf, int i, int uid)
2100 {
2101         const struct inet_request_sock *ireq = inet_rsk(req);
2102         int ttd = req->expires - jiffies;
2103
2104         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2105                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2106                 i,
2107                 ireq->loc_addr,
2108                 ntohs(inet_sk(sk)->sport),
2109                 ireq->rmt_addr,
2110                 ntohs(ireq->rmt_port),
2111                 TCP_SYN_RECV,
2112                 0, 0, /* could print option size, but that is af dependent. */
2113                 1,    /* timers active (only the expire timer) */
2114                 jiffies_to_clock_t(ttd),
2115                 req->retrans,
2116                 uid,
2117                 0,  /* non standard timer */
2118                 0, /* open_requests have no inode */
2119                 atomic_read(&sk->sk_refcnt),
2120                 req);
2121 }
2122
2123 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2124 {
2125         int timer_active;
2126         unsigned long timer_expires;
2127         struct tcp_sock *tp = tcp_sk(sp);
2128         struct inet_sock *inet = inet_sk(sp);
2129         unsigned int dest = inet->daddr;
2130         unsigned int src = inet->rcv_saddr;
2131         __u16 destp = ntohs(inet->dport);
2132         __u16 srcp = ntohs(inet->sport);
2133
2134         if (tp->pending == TCP_TIME_RETRANS) {
2135                 timer_active    = 1;
2136                 timer_expires   = tp->timeout;
2137         } else if (tp->pending == TCP_TIME_PROBE0) {
2138                 timer_active    = 4;
2139                 timer_expires   = tp->timeout;
2140         } else if (timer_pending(&sp->sk_timer)) {
2141                 timer_active    = 2;
2142                 timer_expires   = sp->sk_timer.expires;
2143         } else {
2144                 timer_active    = 0;
2145                 timer_expires = jiffies;
2146         }
2147
2148         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2149                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2150                 i, src, srcp, dest, destp, sp->sk_state,
2151                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2152                 timer_active,
2153                 jiffies_to_clock_t(timer_expires - jiffies),
2154                 tp->retransmits,
2155                 sock_i_uid(sp),
2156                 tp->probes_out,
2157                 sock_i_ino(sp),
2158                 atomic_read(&sp->sk_refcnt), sp,
2159                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2160                 tp->snd_cwnd,
2161                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2162 }
2163
2164 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2165 {
2166         unsigned int dest, src;
2167         __u16 destp, srcp;
2168         int ttd = tw->tw_ttd - jiffies;
2169
2170         if (ttd < 0)
2171                 ttd = 0;
2172
2173         dest  = tw->tw_daddr;
2174         src   = tw->tw_rcv_saddr;
2175         destp = ntohs(tw->tw_dport);
2176         srcp  = ntohs(tw->tw_sport);
2177
2178         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2179                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2180                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2181                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2182                 atomic_read(&tw->tw_refcnt), tw);
2183 }
2184
2185 #define TMPSZ 150
2186
2187 static int tcp4_seq_show(struct seq_file *seq, void *v)
2188 {
2189         struct tcp_iter_state* st;
2190         char tmpbuf[TMPSZ + 1];
2191
2192         if (v == SEQ_START_TOKEN) {
2193                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2194                            "  sl  local_address rem_address   st tx_queue "
2195                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2196                            "inode");
2197                 goto out;
2198         }
2199         st = seq->private;
2200
2201         switch (st->state) {
2202         case TCP_SEQ_STATE_LISTENING:
2203         case TCP_SEQ_STATE_ESTABLISHED:
2204                 get_tcp4_sock(v, tmpbuf, st->num);
2205                 break;
2206         case TCP_SEQ_STATE_OPENREQ:
2207                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2208                 break;
2209         case TCP_SEQ_STATE_TIME_WAIT:
2210                 get_timewait4_sock(v, tmpbuf, st->num);
2211                 break;
2212         }
2213         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2214 out:
2215         return 0;
2216 }
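/*
 * For orientation, one /proc/net/tcp record produced by the code above
 * might look like the following (wrapped here for width; every value is
 * purely illustrative).  Addresses are the raw __be32 printed as hex, so
 * on little-endian hardware 127.0.0.1 appears as 0100007F; ports are in
 * hex, so 0016 is port 22; the state column 0A is TCP_LISTEN:
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000
 *       00000000     0        0 12345 1 c1d2e3f4 300 0 0 2 -1
 */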
2217
2218 static struct file_operations tcp4_seq_fops;
2219 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2220         .owner          = THIS_MODULE,
2221         .name           = "tcp",
2222         .family         = AF_INET,
2223         .seq_show       = tcp4_seq_show,
2224         .seq_fops       = &tcp4_seq_fops,
2225 };
2226
2227 int __init tcp4_proc_init(void)
2228 {
2229         return tcp_proc_register(&tcp4_seq_afinfo);
2230 }
2231
2232 void tcp4_proc_exit(void)
2233 {
2234         tcp_proc_unregister(&tcp4_seq_afinfo);
2235 }
2236 #endif /* CONFIG_PROC_FS */
2237
2238 struct proto tcp_prot = {
2239         .name                   = "TCP",
2240         .owner                  = THIS_MODULE,
2241         .close                  = tcp_close,
2242         .connect                = tcp_v4_connect,
2243         .disconnect             = tcp_disconnect,
2244         .accept                 = tcp_accept,
2245         .ioctl                  = tcp_ioctl,
2246         .init                   = tcp_v4_init_sock,
2247         .destroy                = tcp_v4_destroy_sock,
2248         .shutdown               = tcp_shutdown,
2249         .setsockopt             = tcp_setsockopt,
2250         .getsockopt             = tcp_getsockopt,
2251         .sendmsg                = tcp_sendmsg,
2252         .recvmsg                = tcp_recvmsg,
2253         .backlog_rcv            = tcp_v4_do_rcv,
2254         .hash                   = tcp_v4_hash,
2255         .unhash                 = tcp_unhash,
2256         .get_port               = tcp_v4_get_port,
2257         .enter_memory_pressure  = tcp_enter_memory_pressure,
2258         .sockets_allocated      = &tcp_sockets_allocated,
2259         .memory_allocated       = &tcp_memory_allocated,
2260         .memory_pressure        = &tcp_memory_pressure,
2261         .sysctl_mem             = sysctl_tcp_mem,
2262         .sysctl_wmem            = sysctl_tcp_wmem,
2263         .sysctl_rmem            = sysctl_tcp_rmem,
2264         .max_header             = MAX_TCP_HEADER,
2265         .obj_size               = sizeof(struct tcp_sock),
2266         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
2267         .rsk_prot               = &tcp_request_sock_ops,
2268 };
2269
2270
2271
2272 void __init tcp_v4_init(struct net_proto_family *ops)
2273 {
2274         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2275         if (err < 0)
2276                 panic("Failed to create the TCP control socket.\n");
2277         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2278         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2279
2280         /* Unhash it so that IP input processing does not even
2281          * see it; we do not wish this socket to see incoming
2282          * packets.
2283          */
2284         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2285 }
2286
2287 EXPORT_SYMBOL(ipv4_specific);
2288 EXPORT_SYMBOL(inet_bind_bucket_create);
2289 EXPORT_SYMBOL(tcp_hashinfo);
2290 EXPORT_SYMBOL(tcp_prot);
2291 EXPORT_SYMBOL(tcp_unhash);
2292 EXPORT_SYMBOL(tcp_v4_conn_request);
2293 EXPORT_SYMBOL(tcp_v4_connect);
2294 EXPORT_SYMBOL(tcp_v4_do_rcv);
2295 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2296 EXPORT_SYMBOL(tcp_v4_send_check);
2297 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2298
2299 #ifdef CONFIG_PROC_FS
2300 EXPORT_SYMBOL(tcp_proc_register);
2301 EXPORT_SYMBOL(tcp_proc_unregister);
2302 #endif
2303 EXPORT_SYMBOL(sysctl_local_port_range);
2304 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2305 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2306