net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87
88
89 #ifdef CONFIG_TCP_MD5SIG
90 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
91                                                    __be32 addr);
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, struct tcphdr *th);
94 #else
95 static inline
96 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
97 {
98         return NULL;
99 }
100 #endif
101
102 struct inet_hashinfo tcp_hashinfo;
103
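/* Pick the initial sequence number for a connection from its 4-tuple via
 * secure_tcp_sequence_number(), which mixes in a boot-time secret and a clock
 * component so that ISNs are hard to predict (RFC 1948-style selection).
 */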
104 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
105 {
106         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
107                                           ip_hdr(skb)->saddr,
108                                           tcp_hdr(skb)->dest,
109                                           tcp_hdr(skb)->source);
110 }
111
112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 {
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116
117         /* With PAWS, it is safe from the viewpoint
118            of data integrity. Even without PAWS it is safe provided sequence
119            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
120
121            Actually, the idea is close to VJ's, only the timestamp cache is
122            held not per host but per port pair, and the TW bucket is used as
123            the state holder.
124
125            If the TW bucket has already been destroyed, we fall back to VJ's
126            scheme and use the initial timestamp retrieved from the peer table.
127          */
128         if (tcptw->tw_ts_recent_stamp &&
129             (twp == NULL || (sysctl_tcp_tw_reuse &&
130                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
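                /* Start the new connection's sequence space well beyond
                 * anything the old incarnation could have sent: tw_snd_nxt
                 * plus one maximum window (65535) plus a little slack.
                 */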
131                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
132                 if (tp->write_seq == 0)
133                         tp->write_seq = 1;
134                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
135                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
136                 sock_hold(sktw);
137                 return 1;
138         }
139
140         return 0;
141 }
142
143 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
144
145 /* This will initiate an outgoing connection. */
146 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
147 {
148         struct inet_sock *inet = inet_sk(sk);
149         struct tcp_sock *tp = tcp_sk(sk);
150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
151         struct rtable *rt;
152         __be32 daddr, nexthop;
153         int tmp;
154         int err;
155
156         if (addr_len < sizeof(struct sockaddr_in))
157                 return -EINVAL;
158
159         if (usin->sin_family != AF_INET)
160                 return -EAFNOSUPPORT;
161
162         nexthop = daddr = usin->sin_addr.s_addr;
163         if (inet->opt && inet->opt->srr) {
164                 if (!daddr)
165                         return -EINVAL;
166                 nexthop = inet->opt->faddr;
167         }
168
169         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
170                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171                                IPPROTO_TCP,
172                                inet->inet_sport, usin->sin_port, sk, 1);
173         if (tmp < 0) {
174                 if (tmp == -ENETUNREACH)
175                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
176                 return tmp;
177         }
178
179         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
180                 ip_rt_put(rt);
181                 return -ENETUNREACH;
182         }
183
184         if (!inet->opt || !inet->opt->srr)
185                 daddr = rt->rt_dst;
186
187         if (!inet->inet_saddr)
188                 inet->inet_saddr = rt->rt_src;
189         inet->inet_rcv_saddr = inet->inet_saddr;
190
191         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
192                 /* Reset inherited state */
193                 tp->rx_opt.ts_recent       = 0;
194                 tp->rx_opt.ts_recent_stamp = 0;
195                 tp->write_seq              = 0;
196         }
197
198         if (tcp_death_row.sysctl_tw_recycle &&
199             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
200                 struct inet_peer *peer = rt_get_peer(rt);
201                 /*
202                  * VJ's idea. We save last timestamp seen from
203                  * the destination in peer table, when entering state
204                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
205                  * when trying new connection.
206                  */
207                 if (peer) {
208                         inet_peer_refcheck(peer);
209                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
210                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
211                                 tp->rx_opt.ts_recent = peer->tcp_ts;
212                         }
213                 }
214         }
215
216         inet->inet_dport = usin->sin_port;
217         inet->inet_daddr = daddr;
218
219         inet_csk(sk)->icsk_ext_hdr_len = 0;
220         if (inet->opt)
221                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
223         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
224
225         /* Socket identity is still unknown (sport may be zero).
226          * However we set the state to SYN-SENT and, without releasing the
227          * socket lock, select a source port, enter ourselves into the hash
228          * tables and complete initialization after this.
229          */
230         tcp_set_state(sk, TCP_SYN_SENT);
231         err = inet_hash_connect(&tcp_death_row, sk);
232         if (err)
233                 goto failure;
234
235         err = ip_route_newports(&rt, IPPROTO_TCP,
236                                 inet->inet_sport, inet->inet_dport, sk);
237         if (err)
238                 goto failure;
239
240         /* OK, now commit destination to socket.  */
241         sk->sk_gso_type = SKB_GSO_TCPV4;
242         sk_setup_caps(sk, &rt->dst);
243
244         if (!tp->write_seq)
245                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
246                                                            inet->inet_daddr,
247                                                            inet->inet_sport,
248                                                            usin->sin_port);
249
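        /* Seed the IP ID counter so that identification values differ from
         * connection to connection.
         */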
250         inet->inet_id = tp->write_seq ^ jiffies;
251
252         err = tcp_connect(sk);
253         rt = NULL;
254         if (err)
255                 goto failure;
256
257         return 0;
258
259 failure:
260         /*
261          * This unhashes the socket and releases the local port,
262          * if necessary.
263          */
264         tcp_set_state(sk, TCP_CLOSE);
265         ip_rt_put(rt);
266         sk->sk_route_caps = 0;
267         inet->inet_dport = 0;
268         return err;
269 }
270
271 /*
272  * This routine does path mtu discovery as defined in RFC1191.
273  */
274 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275 {
276         struct dst_entry *dst;
277         struct inet_sock *inet = inet_sk(sk);
278
279         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280          * sent out by Linux are always < 576 bytes, so they should go through
281          * unfragmented).
282          */
283         if (sk->sk_state == TCP_LISTEN)
284                 return;
285
286         /* We don't check in the dst entry whether PMTU discovery is forbidden
287          * on this route. We just assume that no packet-too-big packets
288          * are sent back when PMTU discovery is not active.
289          * There is a small race when the user changes this flag in the
290          * route, but I think that's acceptable.
291          */
292         if ((dst = __sk_dst_check(sk, 0)) == NULL)
293                 return;
294
295         dst->ops->update_pmtu(dst, mtu);
296
297         /* Something is about to go wrong... Remember the soft error
298          * in case this connection is not able to recover.
299          */
300         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301                 sk->sk_err_soft = EMSGSIZE;
302
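        /* Re-read the MTU actually installed on the route; update_pmtu() may
         * have clamped the value we passed in.
         */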
303         mtu = dst_mtu(dst);
304
305         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307                 tcp_sync_mss(sk, mtu);
308
309                 /* Resend the TCP packet because it's
310                  * clear that the old packet has been
311                  * dropped. This is the new "fast" path mtu
312                  * discovery.
313                  */
314                 tcp_simple_retransmit(sk);
315         } /* else let the usual retransmit timer handle it */
316 }
317
318 /*
319  * This routine is called by the ICMP module when it gets some
320  * sort of error condition.  If err < 0 then the socket should
321  * be closed and the error returned to the user.  If err > 0
322  * it's just the icmp type << 8 | icmp code.  After adjustment the
323  * header points to the first 8 bytes of the tcp header.  We need
324  * to find the appropriate port.
325  *
326  * The locking strategy used here is very "optimistic". When
327  * someone else accesses the socket, the ICMP is just dropped
328  * and for some paths there is no check at all.
329  * A more general error queue to queue errors for later handling
330  * is probably better.
331  *
332  */
333
334 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
335 {
336         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
337         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
338         struct inet_connection_sock *icsk;
339         struct tcp_sock *tp;
340         struct inet_sock *inet;
341         const int type = icmp_hdr(icmp_skb)->type;
342         const int code = icmp_hdr(icmp_skb)->code;
343         struct sock *sk;
344         struct sk_buff *skb;
345         __u32 seq;
346         __u32 remaining;
347         int err;
348         struct net *net = dev_net(icmp_skb->dev);
349
350         if (icmp_skb->len < (iph->ihl << 2) + 8) {
351                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352                 return;
353         }
354
355         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
356                         iph->saddr, th->source, inet_iif(icmp_skb));
357         if (!sk) {
358                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
359                 return;
360         }
361         if (sk->sk_state == TCP_TIME_WAIT) {
362                 inet_twsk_put(inet_twsk(sk));
363                 return;
364         }
365
366         bh_lock_sock(sk);
367         /* If too many ICMPs get dropped on busy
368          * servers this needs to be solved differently.
369          */
370         if (sock_owned_by_user(sk))
371                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
372
373         if (sk->sk_state == TCP_CLOSE)
374                 goto out;
375
376         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
377                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
378                 goto out;
379         }
380
381         icsk = inet_csk(sk);
382         tp = tcp_sk(sk);
383         seq = ntohl(th->seq);
384         if (sk->sk_state != TCP_LISTEN &&
385             !between(seq, tp->snd_una, tp->snd_nxt)) {
386                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
387                 goto out;
388         }
389
390         switch (type) {
391         case ICMP_SOURCE_QUENCH:
392                 /* Just silently ignore these. */
393                 goto out;
394         case ICMP_PARAMETERPROB:
395                 err = EPROTO;
396                 break;
397         case ICMP_DEST_UNREACH:
398                 if (code > NR_ICMP_UNREACH)
399                         goto out;
400
401                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
402                         if (!sock_owned_by_user(sk))
403                                 do_pmtu_discovery(sk, iph, info);
404                         goto out;
405                 }
406
407                 err = icmp_err_convert[code].errno;
408                 /* check if icmp_skb allows revert of backoff
409                  * (see draft-zimmermann-tcp-lcd) */
410                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
411                         break;
412                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
413                     !icsk->icsk_backoff)
414                         break;
415
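                /* The ICMP is for the earliest outstanding segment and we are
                 * in exponential backoff: undo one backoff step and re-arm the
                 * retransmit timer with whatever time remains (TCP-LCD).
                 */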
416                 icsk->icsk_backoff--;
417                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
418                                          icsk->icsk_backoff;
419                 tcp_bound_rto(sk);
420
421                 skb = tcp_write_queue_head(sk);
422                 BUG_ON(!skb);
423
424                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
425                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
426
427                 if (remaining) {
428                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
429                                                   remaining, TCP_RTO_MAX);
430                 } else if (sock_owned_by_user(sk)) {
431                         /* RTO revert clocked out retransmission,
432                          * but socket is locked. Will defer. */
433                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
434                                                   HZ/20, TCP_RTO_MAX);
435                 } else {
436                         /* RTO revert clocked out retransmission.
437                          * Will retransmit now */
438                         tcp_retransmit_timer(sk);
439                 }
440
441                 break;
442         case ICMP_TIME_EXCEEDED:
443                 err = EHOSTUNREACH;
444                 break;
445         default:
446                 goto out;
447         }
448
449         switch (sk->sk_state) {
450                 struct request_sock *req, **prev;
451         case TCP_LISTEN:
452                 if (sock_owned_by_user(sk))
453                         goto out;
454
455                 req = inet_csk_search_req(sk, &prev, th->dest,
456                                           iph->daddr, iph->saddr);
457                 if (!req)
458                         goto out;
459
460                 /* ICMPs are not backlogged, hence we cannot get
461                    an established socket here.
462                  */
463                 WARN_ON(req->sk);
464
465                 if (seq != tcp_rsk(req)->snt_isn) {
466                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
467                         goto out;
468                 }
469
470                 /*
471                  * Still in SYN_RECV, just remove it silently.
472                  * There is no good way to pass the error to the newly
473                  * created socket, and POSIX does not want network
474                  * errors returned from accept().
475                  */
476                 inet_csk_reqsk_queue_drop(sk, req, prev);
477                 goto out;
478
479         case TCP_SYN_SENT:
480         case TCP_SYN_RECV:  /* Normally cannot happen.
481                                It can, e.g., if SYNs crossed.
482                              */
483                 if (!sock_owned_by_user(sk)) {
484                         sk->sk_err = err;
485
486                         sk->sk_error_report(sk);
487
488                         tcp_done(sk);
489                 } else {
490                         sk->sk_err_soft = err;
491                 }
492                 goto out;
493         }
494
495         /* If we've already connected we will keep trying
496          * until we time out, or the user gives up.
497          *
498          * rfc1122 4.2.3.9 allows us to consider as hard errors
499          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
500          * but it is obsoleted by pmtu discovery).
501          *
502          * Note that in the modern internet, where routing is unreliable
503          * and broken firewalls sit in every dark corner sending random
504          * errors ordered by their masters, even these two messages finally
505          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
506          *
507          * Now we are in compliance with RFCs.
508          *                                                      --ANK (980905)
509          */
510
511         inet = inet_sk(sk);
512         if (!sock_owned_by_user(sk) && inet->recverr) {
513                 sk->sk_err = err;
514                 sk->sk_error_report(sk);
515         } else  { /* Only an error on timeout */
516                 sk->sk_err_soft = err;
517         }
518
519 out:
520         bh_unlock_sock(sk);
521         sock_put(sk);
522 }
523
524 static void __tcp_v4_send_check(struct sk_buff *skb,
525                                 __be32 saddr, __be32 daddr)
526 {
527         struct tcphdr *th = tcp_hdr(skb);
528
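        /* With checksum offload (CHECKSUM_PARTIAL) only the pseudo-header sum
         * is filled in and the device completes the checksum; otherwise the
         * full checksum is computed here in software.
         */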
529         if (skb->ip_summed == CHECKSUM_PARTIAL) {
530                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
531                 skb->csum_start = skb_transport_header(skb) - skb->head;
532                 skb->csum_offset = offsetof(struct tcphdr, check);
533         } else {
534                 th->check = tcp_v4_check(skb->len, saddr, daddr,
535                                          csum_partial(th,
536                                                       th->doff << 2,
537                                                       skb->csum));
538         }
539 }
540
541 /* This routine computes an IPv4 TCP checksum. */
542 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
543 {
544         struct inet_sock *inet = inet_sk(sk);
545
546         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
547 }
548
549 int tcp_v4_gso_send_check(struct sk_buff *skb)
550 {
551         const struct iphdr *iph;
552         struct tcphdr *th;
553
554         if (!pskb_may_pull(skb, sizeof(*th)))
555                 return -EINVAL;
556
557         iph = ip_hdr(skb);
558         th = tcp_hdr(skb);
559
560         th->check = 0;
561         skb->ip_summed = CHECKSUM_PARTIAL;
562         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
563         return 0;
564 }
565
566 /*
567  *      This routine will send an RST to the other tcp.
568  *
569  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
570  *                    for the reset?
571  *      Answer: if a packet caused the RST, it is not for a socket
572  *              existing in our system; if it is matched to a socket,
573  *              it is just a duplicate segment or a bug in the other side's TCP.
574  *              So we build the reply based only on the parameters that
575  *              arrived with the segment.
576  *      Exception: precedence violation. We do not implement it in any case.
577  */
578
579 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
580 {
581         struct tcphdr *th = tcp_hdr(skb);
582         struct {
583                 struct tcphdr th;
584 #ifdef CONFIG_TCP_MD5SIG
585                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
586 #endif
587         } rep;
588         struct ip_reply_arg arg;
589 #ifdef CONFIG_TCP_MD5SIG
590         struct tcp_md5sig_key *key;
591 #endif
592         struct net *net;
593
594         /* Never send a reset in response to a reset. */
595         if (th->rst)
596                 return;
597
598         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
599                 return;
600
601         /* Swap the send and the receive. */
602         memset(&rep, 0, sizeof(rep));
603         rep.th.dest   = th->source;
604         rep.th.source = th->dest;
605         rep.th.doff   = sizeof(struct tcphdr) / 4;
606         rep.th.rst    = 1;
607
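        /* RFC 793 reset generation: if the offending segment carried an ACK,
         * the RST takes its sequence number from that ACK field; otherwise
         * SEQ stays zero and we ACK everything the segment occupied (SYN and
         * FIN each count as one).
         */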
608         if (th->ack) {
609                 rep.th.seq = th->ack_seq;
610         } else {
611                 rep.th.ack = 1;
612                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
613                                        skb->len - (th->doff << 2));
614         }
615
616         memset(&arg, 0, sizeof(arg));
617         arg.iov[0].iov_base = (unsigned char *)&rep;
618         arg.iov[0].iov_len  = sizeof(rep.th);
619
620 #ifdef CONFIG_TCP_MD5SIG
621         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
622         if (key) {
623                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
624                                    (TCPOPT_NOP << 16) |
625                                    (TCPOPT_MD5SIG << 8) |
626                                    TCPOLEN_MD5SIG);
627                 /* Update the reply length and the header's data offset to cover the option */
628                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
629                 rep.th.doff = arg.iov[0].iov_len / 4;
630
631                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
632                                      key, ip_hdr(skb)->saddr,
633                                      ip_hdr(skb)->daddr, &rep.th);
634         }
635 #endif
636         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
637                                       ip_hdr(skb)->saddr, /* XXX */
638                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
639         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
640         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
641
642         net = dev_net(skb_dst(skb)->dev);
643         ip_send_reply(net->ipv4.tcp_sock, skb,
644                       &arg, arg.iov[0].iov_len);
645
646         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
647         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
648 }
649
650 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
651    outside of socket context, is certainly ugly. What can I do?
652  */
653
654 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
655                             u32 win, u32 ts, int oif,
656                             struct tcp_md5sig_key *key,
657                             int reply_flags)
658 {
659         struct tcphdr *th = tcp_hdr(skb);
660         struct {
661                 struct tcphdr th;
662                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
663 #ifdef CONFIG_TCP_MD5SIG
664                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
665 #endif
666                         ];
667         } rep;
668         struct ip_reply_arg arg;
669         struct net *net = dev_net(skb_dst(skb)->dev);
670
671         memset(&rep.th, 0, sizeof(struct tcphdr));
672         memset(&arg, 0, sizeof(arg));
673
674         arg.iov[0].iov_base = (unsigned char *)&rep;
675         arg.iov[0].iov_len  = sizeof(rep.th);
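        /* If the peer used timestamps, echo its value: NOP, NOP, TIMESTAMP,
         * our current clock as TSval and the peer's timestamp as TSecr.
         */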
676         if (ts) {
677                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
678                                    (TCPOPT_TIMESTAMP << 8) |
679                                    TCPOLEN_TIMESTAMP);
680                 rep.opt[1] = htonl(tcp_time_stamp);
681                 rep.opt[2] = htonl(ts);
682                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
683         }
684
685         /* Swap the send and the receive. */
686         rep.th.dest    = th->source;
687         rep.th.source  = th->dest;
688         rep.th.doff    = arg.iov[0].iov_len / 4;
689         rep.th.seq     = htonl(seq);
690         rep.th.ack_seq = htonl(ack);
691         rep.th.ack     = 1;
692         rep.th.window  = htons(win);
693
694 #ifdef CONFIG_TCP_MD5SIG
695         if (key) {
696                 int offset = (ts) ? 3 : 0;
697
698                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
699                                           (TCPOPT_NOP << 16) |
700                                           (TCPOPT_MD5SIG << 8) |
701                                           TCPOLEN_MD5SIG);
702                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
703                 rep.th.doff = arg.iov[0].iov_len/4;
704
705                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
706                                     key, ip_hdr(skb)->saddr,
707                                     ip_hdr(skb)->daddr, &rep.th);
708         }
709 #endif
710         arg.flags = reply_flags;
711         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
712                                       ip_hdr(skb)->saddr, /* XXX */
713                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
714         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
715         if (oif)
716                 arg.bound_dev_if = oif;
717
718         ip_send_reply(net->ipv4.tcp_sock, skb,
719                       &arg, arg.iov[0].iov_len);
720
721         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
722 }
723
724 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
725 {
726         struct inet_timewait_sock *tw = inet_twsk(sk);
727         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
728
729         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
730                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
731                         tcptw->tw_ts_recent,
732                         tw->tw_bound_dev_if,
733                         tcp_twsk_md5_key(tcptw),
734                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
735                         );
736
737         inet_twsk_put(tw);
738 }
739
740 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
741                                   struct request_sock *req)
742 {
743         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
744                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
745                         req->ts_recent,
746                         0,
747                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
748                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
749 }
750
751 /*
752  *      Send a SYN-ACK after having received a SYN.
753  *      This still operates on a request_sock only, not on a big
754  *      socket.
755  */
756 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
757                               struct request_sock *req,
758                               struct request_values *rvp)
759 {
760         const struct inet_request_sock *ireq = inet_rsk(req);
761         int err = -1;
762         struct sk_buff * skb;
763
764         /* First, grab a route. */
765         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
766                 return -1;
767
768         skb = tcp_make_synack(sk, dst, req, rvp);
769
770         if (skb) {
771                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
772
773                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
774                                             ireq->rmt_addr,
775                                             ireq->opt);
776                 err = net_xmit_eval(err);
777         }
778
779         dst_release(dst);
780         return err;
781 }
782
783 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
784                               struct request_values *rvp)
785 {
786         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
787         return tcp_v4_send_synack(sk, NULL, req, rvp);
788 }
789
790 /*
791  *      IPv4 request_sock destructor.
792  */
793 static void tcp_v4_reqsk_destructor(struct request_sock *req)
794 {
795         kfree(inet_rsk(req)->opt);
796 }
797
798 static void syn_flood_warning(const struct sk_buff *skb)
799 {
800         const char *msg;
801
802 #ifdef CONFIG_SYN_COOKIES
803         if (sysctl_tcp_syncookies)
804                 msg = "Sending cookies";
805         else
806 #endif
807                 msg = "Dropping request";
808
809         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
810                                 ntohs(tcp_hdr(skb)->dest), msg);
811 }
812
813 /*
814  * Save and compile IPv4 options into the request_sock if needed.
815  */
816 static struct ip_options *tcp_v4_save_options(struct sock *sk,
817                                               struct sk_buff *skb)
818 {
819         struct ip_options *opt = &(IPCB(skb)->opt);
820         struct ip_options *dopt = NULL;
821
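        /* ip_options_echo() builds the option set for replies, reversing any
         * recorded source route so that the SYN-ACK can retrace the path the
         * SYN arrived on.
         */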
822         if (opt && opt->optlen) {
823                 int opt_size = optlength(opt);
824                 dopt = kmalloc(opt_size, GFP_ATOMIC);
825                 if (dopt) {
826                         if (ip_options_echo(dopt, skb)) {
827                                 kfree(dopt);
828                                 dopt = NULL;
829                         }
830                 }
831         }
832         return dopt;
833 }
834
835 #ifdef CONFIG_TCP_MD5SIG
836 /*
837  * RFC2385 MD5 checksumming requires a mapping of
838  * IP address->MD5 Key.
839  * We need to maintain these in the sk structure.
840  */
841
842 /* Find the Key structure for an address.  */
843 static struct tcp_md5sig_key *
844                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
845 {
846         struct tcp_sock *tp = tcp_sk(sk);
847         int i;
848
849         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
850                 return NULL;
851         for (i = 0; i < tp->md5sig_info->entries4; i++) {
852                 if (tp->md5sig_info->keys4[i].addr == addr)
853                         return &tp->md5sig_info->keys4[i].base;
854         }
855         return NULL;
856 }
857
858 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
859                                          struct sock *addr_sk)
860 {
861         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
862 }
863
864 EXPORT_SYMBOL(tcp_v4_md5_lookup);
865
866 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
867                                                       struct request_sock *req)
868 {
869         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
870 }
871
872 /* This can be called on a newly created socket, from other files */
873 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
874                       u8 *newkey, u8 newkeylen)
875 {
876         /* Add Key to the list */
877         struct tcp_md5sig_key *key;
878         struct tcp_sock *tp = tcp_sk(sk);
879         struct tcp4_md5sig_key *keys;
880
881         key = tcp_v4_md5_do_lookup(sk, addr);
882         if (key) {
883                 /* Pre-existing entry - just update that one. */
884                 kfree(key->key);
885                 key->key = newkey;
886                 key->keylen = newkeylen;
887         } else {
888                 struct tcp_md5sig_info *md5sig;
889
890                 if (!tp->md5sig_info) {
891                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
892                                                   GFP_ATOMIC);
893                         if (!tp->md5sig_info) {
894                                 kfree(newkey);
895                                 return -ENOMEM;
896                         }
897                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
898                 }
899                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
900                         kfree(newkey);
901                         return -ENOMEM;
902                 }
903                 md5sig = tp->md5sig_info;
904
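                /* Grow the key array by one slot when it is full; a socket is
                 * expected to carry only a handful of MD5 keys.
                 */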
905                 if (md5sig->alloced4 == md5sig->entries4) {
906                         keys = kmalloc((sizeof(*keys) *
907                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
908                         if (!keys) {
909                                 kfree(newkey);
910                                 tcp_free_md5sig_pool();
911                                 return -ENOMEM;
912                         }
913
914                         if (md5sig->entries4)
915                                 memcpy(keys, md5sig->keys4,
916                                        sizeof(*keys) * md5sig->entries4);
917
918                         /* Free old key list, and reference new one */
919                         kfree(md5sig->keys4);
920                         md5sig->keys4 = keys;
921                         md5sig->alloced4++;
922                 }
923                 md5sig->entries4++;
924                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
925                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
926                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
927         }
928         return 0;
929 }
930
931 EXPORT_SYMBOL(tcp_v4_md5_do_add);
932
933 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
934                                u8 *newkey, u8 newkeylen)
935 {
936         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
937                                  newkey, newkeylen);
938 }
939
940 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
941 {
942         struct tcp_sock *tp = tcp_sk(sk);
943         int i;
944
945         for (i = 0; i < tp->md5sig_info->entries4; i++) {
946                 if (tp->md5sig_info->keys4[i].addr == addr) {
947                         /* Free the key */
948                         kfree(tp->md5sig_info->keys4[i].base.key);
949                         tp->md5sig_info->entries4--;
950
951                         if (tp->md5sig_info->entries4 == 0) {
952                                 kfree(tp->md5sig_info->keys4);
953                                 tp->md5sig_info->keys4 = NULL;
954                                 tp->md5sig_info->alloced4 = 0;
955                         } else if (tp->md5sig_info->entries4 != i) {
956                                 /* Shift the remaining keys down over the removed slot */
957                                 memmove(&tp->md5sig_info->keys4[i],
958                                         &tp->md5sig_info->keys4[i+1],
959                                         (tp->md5sig_info->entries4 - i) *
960                                          sizeof(struct tcp4_md5sig_key));
961                         }
962                         tcp_free_md5sig_pool();
963                         return 0;
964                 }
965         }
966         return -ENOENT;
967 }
968
969 EXPORT_SYMBOL(tcp_v4_md5_do_del);
970
971 static void tcp_v4_clear_md5_list(struct sock *sk)
972 {
973         struct tcp_sock *tp = tcp_sk(sk);
974
975         /* Free each key, then the key array itself,
976          * and finally drop our reference on the MD5
977          * signature pool.
978          */
979         if (tp->md5sig_info->entries4) {
980                 int i;
981                 for (i = 0; i < tp->md5sig_info->entries4; i++)
982                         kfree(tp->md5sig_info->keys4[i].base.key);
983                 tp->md5sig_info->entries4 = 0;
984                 tcp_free_md5sig_pool();
985         }
986         if (tp->md5sig_info->keys4) {
987                 kfree(tp->md5sig_info->keys4);
988                 tp->md5sig_info->keys4 = NULL;
989                 tp->md5sig_info->alloced4  = 0;
990         }
991 }
992
993 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
994                                  int optlen)
995 {
996         struct tcp_md5sig cmd;
997         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
998         u8 *newkey;
999
1000         if (optlen < sizeof(cmd))
1001                 return -EINVAL;
1002
1003         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004                 return -EFAULT;
1005
1006         if (sin->sin_family != AF_INET)
1007                 return -EINVAL;
1008
1009         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1010                 if (!tcp_sk(sk)->md5sig_info)
1011                         return -ENOENT;
1012                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1013         }
1014
1015         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1016                 return -EINVAL;
1017
1018         if (!tcp_sk(sk)->md5sig_info) {
1019                 struct tcp_sock *tp = tcp_sk(sk);
1020                 struct tcp_md5sig_info *p;
1021
1022                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1023                 if (!p)
1024                         return -ENOMEM;
1025
1026                 tp->md5sig_info = p;
1027                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028         }
1029
1030         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1031         if (!newkey)
1032                 return -ENOMEM;
1033         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1034                                  newkey, cmd.tcpm_keylen);
1035 }
1036
1037 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1038                                         __be32 daddr, __be32 saddr, int nbytes)
1039 {
1040         struct tcp4_pseudohdr *bp;
1041         struct scatterlist sg;
1042
1043         bp = &hp->md5_blk.ip4;
1044
1045         /*
1046          * 1. the TCP pseudo-header (in the order: source IP address,
1047          * destination IP address, zero-padded protocol number, and
1048          * segment length)
1049          */
1050         bp->saddr = saddr;
1051         bp->daddr = daddr;
1052         bp->pad = 0;
1053         bp->protocol = IPPROTO_TCP;
1054         bp->len = cpu_to_be16(nbytes);
1055
1056         sg_init_one(&sg, bp, sizeof(*bp));
1057         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1058 }
1059
1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1061                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1062 {
1063         struct tcp_md5sig_pool *hp;
1064         struct hash_desc *desc;
1065
1066         hp = tcp_get_md5sig_pool();
1067         if (!hp)
1068                 goto clear_hash_noput;
1069         desc = &hp->md5_desc;
1070
1071         if (crypto_hash_init(desc))
1072                 goto clear_hash;
1073         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_header(hp, th))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_key(hp, key))
1078                 goto clear_hash;
1079         if (crypto_hash_final(desc, md5_hash))
1080                 goto clear_hash;
1081
1082         tcp_put_md5sig_pool();
1083         return 0;
1084
1085 clear_hash:
1086         tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088         memset(md5_hash, 0, 16);
1089         return 1;
1090 }
1091
1092 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1093                         struct sock *sk, struct request_sock *req,
1094                         struct sk_buff *skb)
1095 {
1096         struct tcp_md5sig_pool *hp;
1097         struct hash_desc *desc;
1098         struct tcphdr *th = tcp_hdr(skb);
1099         __be32 saddr, daddr;
1100
1101         if (sk) {
1102                 saddr = inet_sk(sk)->inet_saddr;
1103                 daddr = inet_sk(sk)->inet_daddr;
1104         } else if (req) {
1105                 saddr = inet_rsk(req)->loc_addr;
1106                 daddr = inet_rsk(req)->rmt_addr;
1107         } else {
1108                 const struct iphdr *iph = ip_hdr(skb);
1109                 saddr = iph->saddr;
1110                 daddr = iph->daddr;
1111         }
1112
1113         hp = tcp_get_md5sig_pool();
1114         if (!hp)
1115                 goto clear_hash_noput;
1116         desc = &hp->md5_desc;
1117
1118         if (crypto_hash_init(desc))
1119                 goto clear_hash;
1120
1121         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_header(hp, th))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1126                 goto clear_hash;
1127         if (tcp_md5_hash_key(hp, key))
1128                 goto clear_hash;
1129         if (crypto_hash_final(desc, md5_hash))
1130                 goto clear_hash;
1131
1132         tcp_put_md5sig_pool();
1133         return 0;
1134
1135 clear_hash:
1136         tcp_put_md5sig_pool();
1137 clear_hash_noput:
1138         memset(md5_hash, 0, 16);
1139         return 1;
1140 }
1141
1142 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1143
1144 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1145 {
1146         /*
1147          * This gets called for each TCP segment that arrives
1148          * so we want to be efficient.
1149          * We have 3 drop cases:
1150          * o No MD5 hash and one expected.
1151          * o MD5 hash and we're not expecting one.
1152          * o MD5 hash and it's wrong.
1153          */
1154         __u8 *hash_location = NULL;
1155         struct tcp_md5sig_key *hash_expected;
1156         const struct iphdr *iph = ip_hdr(skb);
1157         struct tcphdr *th = tcp_hdr(skb);
1158         int genhash;
1159         unsigned char newhash[16];
1160
1161         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1162         hash_location = tcp_parse_md5sig_option(th);
1163
1164         /* We've parsed the options - do we have a hash? */
1165         if (!hash_expected && !hash_location)
1166                 return 0;
1167
1168         if (hash_expected && !hash_location) {
1169                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1170                 return 1;
1171         }
1172
1173         if (!hash_expected && hash_location) {
1174                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1175                 return 1;
1176         }
1177
1178         /* Okay, so this is hash_expected and hash_location -
1179          * so we need to calculate the MD5 hash.
1180          */
1181         genhash = tcp_v4_md5_hash_skb(newhash,
1182                                       hash_expected,
1183                                       NULL, NULL, skb);
1184
1185         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1186                 if (net_ratelimit()) {
1187                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1188                                &iph->saddr, ntohs(th->source),
1189                                &iph->daddr, ntohs(th->dest),
1190                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1191                 }
1192                 return 1;
1193         }
1194         return 0;
1195 }
1196
1197 #endif
1198
1199 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1200         .family         =       PF_INET,
1201         .obj_size       =       sizeof(struct tcp_request_sock),
1202         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1203         .send_ack       =       tcp_v4_reqsk_send_ack,
1204         .destructor     =       tcp_v4_reqsk_destructor,
1205         .send_reset     =       tcp_v4_send_reset,
1206         .syn_ack_timeout =      tcp_syn_ack_timeout,
1207 };
1208
1209 #ifdef CONFIG_TCP_MD5SIG
1210 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1211         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1212         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1213 };
1214 #endif
1215
1216 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1217         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1218         .twsk_unique    = tcp_twsk_unique,
1219         .twsk_destructor= tcp_twsk_destructor,
1220 };
1221
1222 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1223 {
1224         struct tcp_extend_values tmp_ext;
1225         struct tcp_options_received tmp_opt;
1226         u8 *hash_location;
1227         struct request_sock *req;
1228         struct inet_request_sock *ireq;
1229         struct tcp_sock *tp = tcp_sk(sk);
1230         struct dst_entry *dst = NULL;
1231         __be32 saddr = ip_hdr(skb)->saddr;
1232         __be32 daddr = ip_hdr(skb)->daddr;
1233         __u32 isn = TCP_SKB_CB(skb)->when;
1234 #ifdef CONFIG_SYN_COOKIES
1235         int want_cookie = 0;
1236 #else
1237 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1238 #endif
1239
1240         /* Never answer SYNs sent to broadcast or multicast addresses */
1241         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1242                 goto drop;
1243
1244         /* TW buckets are converted to open requests without
1245          * limitation: they conserve resources and the peer is
1246          * evidently a real one.
1247          */
1248         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1249                 if (net_ratelimit())
1250                         syn_flood_warning(skb);
1251 #ifdef CONFIG_SYN_COOKIES
1252                 if (sysctl_tcp_syncookies) {
1253                         want_cookie = 1;
1254                 } else
1255 #endif
1256                 goto drop;
1257         }
1258
1259         /* Accept backlog is full. If we have already queued enough
1260          * warm entries in the syn queue, drop the request. It is better
1261          * than clogging the syn queue with openreqs whose timeouts
1262          * increase exponentially.
1263          */
1264         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1265                 goto drop;
1266
1267         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1268         if (!req)
1269                 goto drop;
1270
1271 #ifdef CONFIG_TCP_MD5SIG
1272         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1273 #endif
1274
1275         tcp_clear_options(&tmp_opt);
1276         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1277         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1278         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1279
1280         if (tmp_opt.cookie_plus > 0 &&
1281             tmp_opt.saw_tstamp &&
1282             !tp->rx_opt.cookie_out_never &&
1283             (sysctl_tcp_cookie_size > 0 ||
1284              (tp->cookie_values != NULL &&
1285               tp->cookie_values->cookie_desired > 0))) {
1286                 u8 *c;
1287                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1288                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1289
1290                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1291                         goto drop_and_release;
1292
1293                 /* Secret recipe starts with IP addresses */
1294                 *mess++ ^= (__force u32)daddr;
1295                 *mess++ ^= (__force u32)saddr;
1296
1297                 /* plus variable length Initiator Cookie */
1298                 c = (u8 *)mess;
1299                 while (l-- > 0)
1300                         *c++ ^= *hash_location++;
1301
1302 #ifdef CONFIG_SYN_COOKIES
1303                 want_cookie = 0;        /* not our kind of cookie */
1304 #endif
1305                 tmp_ext.cookie_out_never = 0; /* false */
1306                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1307         } else if (!tp->rx_opt.cookie_in_always) {
1308                 /* redundant indications, but ensure initialization. */
1309                 tmp_ext.cookie_out_never = 1; /* true */
1310                 tmp_ext.cookie_plus = 0;
1311         } else {
1312                 goto drop_and_release;
1313         }
1314         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1315
1316         if (want_cookie && !tmp_opt.saw_tstamp)
1317                 tcp_clear_options(&tmp_opt);
1318
1319         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1320         tcp_openreq_init(req, &tmp_opt, skb);
1321
1322         ireq = inet_rsk(req);
1323         ireq->loc_addr = daddr;
1324         ireq->rmt_addr = saddr;
1325         ireq->no_srccheck = inet_sk(sk)->transparent;
1326         ireq->opt = tcp_v4_save_options(sk, skb);
1327
1328         if (security_inet_conn_request(sk, skb, req))
1329                 goto drop_and_free;
1330
1331         if (!want_cookie)
1332                 TCP_ECN_create_request(req, tcp_hdr(skb));
1333
1334         if (want_cookie) {
1335 #ifdef CONFIG_SYN_COOKIES
1336                 req->cookie_ts = tmp_opt.tstamp_ok;
1337 #endif
1338                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1339         } else if (!isn) {
1340                 struct inet_peer *peer = NULL;
1341
1342                 /* VJ's idea. We save the last timestamp seen
1343                  * from the destination in the peer table when entering
1344                  * state TIME-WAIT, and check against it before
1345                  * accepting a new connection request.
1346                  *
1347                  * If "isn" is not zero, this request hit an alive
1348                  * timewait bucket, so all the necessary checks were
1349                  * already made in the function processing timewait state.
1350                  */
1351                 if (tmp_opt.saw_tstamp &&
1352                     tcp_death_row.sysctl_tw_recycle &&
1353                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1354                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1355                     peer->v4daddr == saddr) {
1356                         inet_peer_refcheck(peer);
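                        /* If the peer's last recorded timestamp is both recent
                         * and newer than the one this SYN carries, treat the
                         * SYN as an old duplicate (PAWS) and reject it.
                         */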
1357                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1358                             (s32)(peer->tcp_ts - req->ts_recent) >
1359                                                         TCP_PAWS_WINDOW) {
1360                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1361                                 goto drop_and_release;
1362                         }
1363                 }
1364                 /* Kill the following clause, if you dislike this way. */
1365                 else if (!sysctl_tcp_syncookies &&
1366                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1367                           (sysctl_max_syn_backlog >> 2)) &&
1368                          (!peer || !peer->tcp_ts_stamp) &&
1369                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1370                         /* Without syncookies, the last quarter of the
1371                          * backlog is reserved for destinations proven
1372                          * to be alive.
1373                          * It means that during a synflood we keep
1374                          * communicating with destinations that were
1375                          * already remembered before the flood started.
1376                          */
1377                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1378                                        &saddr, ntohs(tcp_hdr(skb)->source));
1379                         goto drop_and_release;
1380                 }
1381
1382                 isn = tcp_v4_init_sequence(skb);
1383         }
1384         tcp_rsk(req)->snt_isn = isn;
1385
1386         if (tcp_v4_send_synack(sk, dst, req,
1387                                (struct request_values *)&tmp_ext) ||
1388             want_cookie)
1389                 goto drop_and_free;
1390
1391         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1392         return 0;
1393
1394 drop_and_release:
1395         dst_release(dst);
1396 drop_and_free:
1397         reqsk_free(req);
1398 drop:
1399         return 0;
1400 }
1401
1402
1403 /*
1404  * The three-way handshake has completed - we received the final ACK -
1405  * now create the new socket.
1406  */
1407 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1408                                   struct request_sock *req,
1409                                   struct dst_entry *dst)
1410 {
1411         struct inet_request_sock *ireq;
1412         struct inet_sock *newinet;
1413         struct tcp_sock *newtp;
1414         struct sock *newsk;
1415 #ifdef CONFIG_TCP_MD5SIG
1416         struct tcp_md5sig_key *key;
1417 #endif
1418
1419         if (sk_acceptq_is_full(sk))
1420                 goto exit_overflow;
1421
1422         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1423                 goto exit;
1424
1425         newsk = tcp_create_openreq_child(sk, req, skb);
1426         if (!newsk)
1427                 goto exit;
1428
1429         newsk->sk_gso_type = SKB_GSO_TCPV4;
1430         sk_setup_caps(newsk, dst);
1431
1432         newtp                 = tcp_sk(newsk);
1433         newinet               = inet_sk(newsk);
1434         ireq                  = inet_rsk(req);
1435         newinet->inet_daddr   = ireq->rmt_addr;
1436         newinet->inet_rcv_saddr = ireq->loc_addr;
1437         newinet->inet_saddr           = ireq->loc_addr;
1438         newinet->opt          = ireq->opt;
1439         ireq->opt             = NULL;
1440         newinet->mc_index     = inet_iif(skb);
1441         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1442         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1443         if (newinet->opt)
1444                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1445         newinet->inet_id = newtp->write_seq ^ jiffies;
1446
1447         tcp_mtup_init(newsk);
1448         tcp_sync_mss(newsk, dst_mtu(dst));
1449         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1450         if (tcp_sk(sk)->rx_opt.user_mss &&
1451             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1452                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1453
1454         tcp_initialize_rcv_mss(newsk);
1455
1456 #ifdef CONFIG_TCP_MD5SIG
1457         /* Copy over the MD5 key from the original socket */
1458         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1459         if (key != NULL) {
1460                 /*
1461                  * We're using one, so create a matching key
1462                  * on the newsk structure. If we fail to get
1463                  * memory, then we end up not copying the key
1464                  * across. Shucks.
1465                  */
1466                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1467                 if (newkey != NULL)
1468                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1469                                           newkey, key->keylen);
1470                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1471         }
1472 #endif
1473
1474         __inet_hash_nolisten(newsk, NULL);
1475         __inet_inherit_port(sk, newsk);
1476
1477         return newsk;
1478
1479 exit_overflow:
1480         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1481 exit:
1482         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1483         dst_release(dst);
1484         return NULL;
1485 }
1486
1487 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1488 {
1489         struct tcphdr *th = tcp_hdr(skb);
1490         const struct iphdr *iph = ip_hdr(skb);
1491         struct sock *nsk;
1492         struct request_sock **prev;
1493         /* Find possible connection requests. */
1494         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1495                                                        iph->saddr, iph->daddr);
1496         if (req)
1497                 return tcp_check_req(sk, skb, req, prev);
1498
1499         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1500                         th->source, iph->daddr, th->dest, inet_iif(skb));
1501
1502         if (nsk) {
1503                 if (nsk->sk_state != TCP_TIME_WAIT) {
1504                         bh_lock_sock(nsk);
1505                         return nsk;
1506                 }
1507                 inet_twsk_put(inet_twsk(nsk));
1508                 return NULL;
1509         }
1510
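             /*
              * No request_sock and no established socket matched.  If this is
              * a bare ACK it may still carry a valid SYN cookie, so (when
              * syncookies are compiled in) let cookie_v4_check() try to
              * reconstruct the connection request from it.
              */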
1511 #ifdef CONFIG_SYN_COOKIES
1512         if (!th->syn)
1513                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1514 #endif
1515         return sk;
1516 }
1517
1518 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1519 {
1520         const struct iphdr *iph = ip_hdr(skb);
1521
1522         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1523                 if (!tcp_v4_check(skb->len, iph->saddr,
1524                                   iph->daddr, skb->csum)) {
1525                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1526                         return 0;
1527                 }
1528         }
1529
1530         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1531                                        skb->len, IPPROTO_TCP, 0);
1532
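             /*
              * For short segments it is presumably cheaper to finish the
              * checksum verification right away; longer ones keep the
              * pseudo-header sum in skb->csum and are verified later,
              * e.g. while the payload is copied to user space.
              */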
1533         if (skb->len <= 76) {
1534                 return __skb_checksum_complete(skb);
1535         }
1536         return 0;
1537 }
1538
1539
1540 /* The socket must have its spinlock held when we get
1541  * here.
1542  *
1543  * We have a potential double-lock case here, so even when
1544  * doing backlog processing we use the BH locking scheme.
1545  * This is because we cannot sleep with the original spinlock
1546  * held.
1547  */
1548 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1549 {
1550         struct sock *rsk;
1551 #ifdef CONFIG_TCP_MD5SIG
1552         /*
1553          * We really want to reject the packet as early as possible
1554          * if:
1555          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1556          *  o There is an MD5 option and we're not expecting one
1557          */
1558         if (tcp_v4_inbound_md5_hash(sk, skb))
1559                 goto discard;
1560 #endif
1561
1562         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1563                 sock_rps_save_rxhash(sk, skb->rxhash);
1564                 TCP_CHECK_TIMER(sk);
1565                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1566                         rsk = sk;
1567                         goto reset;
1568                 }
1569                 TCP_CHECK_TIMER(sk);
1570                 return 0;
1571         }
1572
1573         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1574                 goto csum_err;
1575
1576         if (sk->sk_state == TCP_LISTEN) {
1577                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1578                 if (!nsk)
1579                         goto discard;
1580
1581                 if (nsk != sk) {
1582                         if (tcp_child_process(sk, nsk, skb)) {
1583                                 rsk = nsk;
1584                                 goto reset;
1585                         }
1586                         return 0;
1587                 }
1588         } else
1589                 sock_rps_save_rxhash(sk, skb->rxhash);
1590
1592         TCP_CHECK_TIMER(sk);
1593         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1594                 rsk = sk;
1595                 goto reset;
1596         }
1597         TCP_CHECK_TIMER(sk);
1598         return 0;
1599
1600 reset:
1601         tcp_v4_send_reset(rsk, skb);
1602 discard:
1603         kfree_skb(skb);
1604         /* Be careful here. If this function gets more complicated and
1605          * gcc suffers from register pressure on the x86, sk (in %ebx)
1606          * might be destroyed here. This current version compiles correctly,
1607          * but you have been warned.
1608          */
1609         return 0;
1610
1611 csum_err:
1612         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1613         goto discard;
1614 }
1615
1616 /*
1617  *      From tcp_input.c
1618  */
1619
1620 int tcp_v4_rcv(struct sk_buff *skb)
1621 {
1622         const struct iphdr *iph;
1623         struct tcphdr *th;
1624         struct sock *sk;
1625         int ret;
1626         struct net *net = dev_net(skb->dev);
1627
1628         if (skb->pkt_type != PACKET_HOST)
1629                 goto discard_it;
1630
1631         /* Count it even if it's bad */
1632         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1633
1634         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1635                 goto discard_it;
1636
1637         th = tcp_hdr(skb);
1638
1639         if (th->doff < sizeof(struct tcphdr) / 4)
1640                 goto bad_packet;
1641         if (!pskb_may_pull(skb, th->doff * 4))
1642                 goto discard_it;
1643
1644         /* An explanation is required here, I think.
1645          * Packet length and doff are validated by header prediction,
1646          * provided the case of th->doff == 0 is eliminated.
1647          * So, we defer the checks. */
1648         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1649                 goto bad_packet;
1650
1651         th = tcp_hdr(skb);
1652         iph = ip_hdr(skb);
1653         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1654         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1655                                     skb->len - th->doff * 4);
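             /* SYN and FIN each consume one sequence number, hence the
              * th->syn + th->fin terms in end_seq above.
              */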
1656         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1657         TCP_SKB_CB(skb)->when    = 0;
1658         TCP_SKB_CB(skb)->flags   = iph->tos;
1659         TCP_SKB_CB(skb)->sacked  = 0;
1660
1661         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1662         if (!sk)
1663                 goto no_tcp_socket;
1664
1665 process:
1666         if (sk->sk_state == TCP_TIME_WAIT)
1667                 goto do_time_wait;
1668
1669         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1670                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1671                 goto discard_and_relse;
1672         }
1673
1674         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1675                 goto discard_and_relse;
1676         nf_reset(skb);
1677
1678         if (sk_filter(sk, skb))
1679                 goto discard_and_relse;
1680
1681         skb->dev = NULL;
1682
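             /*
              * If the socket is not owned by a user context, process the
              * segment now, either directly or via the prequeue (or, with
              * NET_DMA, an async copy channel).  Otherwise queue it on the
              * socket backlog, which is drained through tcp_v4_do_rcv()
              * once the owner releases the lock.
              */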
1683         bh_lock_sock_nested(sk);
1684         ret = 0;
1685         if (!sock_owned_by_user(sk)) {
1686 #ifdef CONFIG_NET_DMA
1687                 struct tcp_sock *tp = tcp_sk(sk);
1688                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1689                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1690                 if (tp->ucopy.dma_chan)
1691                         ret = tcp_v4_do_rcv(sk, skb);
1692                 else
1693 #endif
1694                 {
1695                         if (!tcp_prequeue(sk, skb))
1696                                 ret = tcp_v4_do_rcv(sk, skb);
1697                 }
1698         } else if (unlikely(sk_add_backlog(sk, skb))) {
1699                 bh_unlock_sock(sk);
1700                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1701                 goto discard_and_relse;
1702         }
1703         bh_unlock_sock(sk);
1704
1705         sock_put(sk);
1706
1707         return ret;
1708
1709 no_tcp_socket:
1710         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1711                 goto discard_it;
1712
1713         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1714 bad_packet:
1715                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1716         } else {
1717                 tcp_v4_send_reset(NULL, skb);
1718         }
1719
1720 discard_it:
1721         /* Discard frame. */
1722         kfree_skb(skb);
1723         return 0;
1724
1725 discard_and_relse:
1726         sock_put(sk);
1727         goto discard_it;
1728
1729 do_time_wait:
1730         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1731                 inet_twsk_put(inet_twsk(sk));
1732                 goto discard_it;
1733         }
1734
1735         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1737                 inet_twsk_put(inet_twsk(sk));
1738                 goto discard_it;
1739         }
1740         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1741         case TCP_TW_SYN: {
1742                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1743                                                         &tcp_hashinfo,
1744                                                         iph->daddr, th->dest,
1745                                                         inet_iif(skb));
1746                 if (sk2) {
1747                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1748                         inet_twsk_put(inet_twsk(sk));
1749                         sk = sk2;
1750                         goto process;
1751                 }
1752                 /* Fall through to ACK */
1753         }
1754         case TCP_TW_ACK:
1755                 tcp_v4_timewait_ack(sk, skb);
1756                 break;
1757         case TCP_TW_RST:
1758                 goto no_tcp_socket;
1759         case TCP_TW_SUCCESS:;
1760         }
1761         goto discard_it;
1762 }
1763
1764 /* VJ's idea. Save last timestamp seen from this destination
1765  * and hold it at least for the normal timewait interval, to use for duplicate
1766  * segment detection in subsequent connections before they enter the
1767  * synchronized state.
1768  */
1769
1770 int tcp_v4_remember_stamp(struct sock *sk)
1771 {
1772         struct inet_sock *inet = inet_sk(sk);
1773         struct tcp_sock *tp = tcp_sk(sk);
1774         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775         struct inet_peer *peer = NULL;
1776         int release_it = 0;
1777
1778         if (!rt || rt->rt_dst != inet->inet_daddr) {
1779                 peer = inet_getpeer(inet->inet_daddr, 1);
1780                 release_it = 1;
1781         } else {
1782                 if (!rt->peer)
1783                         rt_bind_peer(rt, 1);
1784                 peer = rt->peer;
1785         }
1786
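             /*
              * Only update the cached peer entry if our timestamp is at
              * least as recent as the stored one, or if the stored entry
              * is older than TCP_PAWS_MSL and was recorded no later than
              * our own ts_recent_stamp.
              */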
1787         if (peer) {
1788                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1789                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1792                         peer->tcp_ts = tp->rx_opt.ts_recent;
1793                 }
1794                 if (release_it)
1795                         inet_putpeer(peer);
1796                 return 1;
1797         }
1798
1799         return 0;
1800 }
1801
1802 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1803 {
1804         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1805
1806         if (peer) {
1807                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1808
1809                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1810                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1811                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1812                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1813                         peer->tcp_ts       = tcptw->tw_ts_recent;
1814                 }
1815                 inet_putpeer(peer);
1816                 return 1;
1817         }
1818
1819         return 0;
1820 }
1821
1822 const struct inet_connection_sock_af_ops ipv4_specific = {
1823         .queue_xmit        = ip_queue_xmit,
1824         .send_check        = tcp_v4_send_check,
1825         .rebuild_header    = inet_sk_rebuild_header,
1826         .conn_request      = tcp_v4_conn_request,
1827         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1828         .remember_stamp    = tcp_v4_remember_stamp,
1829         .net_header_len    = sizeof(struct iphdr),
1830         .setsockopt        = ip_setsockopt,
1831         .getsockopt        = ip_getsockopt,
1832         .addr2sockaddr     = inet_csk_addr2sockaddr,
1833         .sockaddr_len      = sizeof(struct sockaddr_in),
1834         .bind_conflict     = inet_csk_bind_conflict,
1835 #ifdef CONFIG_COMPAT
1836         .compat_setsockopt = compat_ip_setsockopt,
1837         .compat_getsockopt = compat_ip_getsockopt,
1838 #endif
1839 };
1840
1841 #ifdef CONFIG_TCP_MD5SIG
1842 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1843         .md5_lookup             = tcp_v4_md5_lookup,
1844         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1845         .md5_add                = tcp_v4_md5_add_func,
1846         .md5_parse              = tcp_v4_parse_md5_keys,
1847 };
1848 #endif
1849
1850 /* NOTE: A lot of things are set to zero explicitly by the call to
1851  *       sk_alloc(), so they need not be done here.
1852  */
1853 static int tcp_v4_init_sock(struct sock *sk)
1854 {
1855         struct inet_connection_sock *icsk = inet_csk(sk);
1856         struct tcp_sock *tp = tcp_sk(sk);
1857
1858         skb_queue_head_init(&tp->out_of_order_queue);
1859         tcp_init_xmit_timers(sk);
1860         tcp_prequeue_init(tp);
1861
1862         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1863         tp->mdev = TCP_TIMEOUT_INIT;
1864
1865         /* So many TCP implementations out there (incorrectly) count the
1866          * initial SYN frame in their delayed-ACK and congestion control
1867          * algorithms that we must have the following bandaid to talk
1868          * efficiently to them.  -DaveM
1869          */
1870         tp->snd_cwnd = 2;
1871
1872         /* See draft-stevens-tcpca-spec-01 for discussion of the
1873          * initialization of these values.
1874          */
1875         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1876         tp->snd_cwnd_clamp = ~0;
1877         tp->mss_cache = TCP_MSS_DEFAULT;
1878
1879         tp->reordering = sysctl_tcp_reordering;
1880         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1881
1882         sk->sk_state = TCP_CLOSE;
1883
1884         sk->sk_write_space = sk_stream_write_space;
1885         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1886
1887         icsk->icsk_af_ops = &ipv4_specific;
1888         icsk->icsk_sync_mss = tcp_sync_mss;
1889 #ifdef CONFIG_TCP_MD5SIG
1890         tp->af_specific = &tcp_sock_ipv4_specific;
1891 #endif
1892
1893         /* TCP Cookie Transactions */
1894         if (sysctl_tcp_cookie_size > 0) {
1895                 /* Default, cookies without s_data_payload. */
1896                 tp->cookie_values =
1897                         kzalloc(sizeof(*tp->cookie_values),
1898                                 sk->sk_allocation);
1899                 if (tp->cookie_values != NULL)
1900                         kref_init(&tp->cookie_values->kref);
1901         }
1902         /* Presumed zeroed, in order of appearance:
1903          *      cookie_in_always, cookie_out_never,
1904          *      s_data_constant, s_data_in, s_data_out
1905          */
1906         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1907         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1908
1909         local_bh_disable();
1910         percpu_counter_inc(&tcp_sockets_allocated);
1911         local_bh_enable();
1912
1913         return 0;
1914 }
1915
1916 void tcp_v4_destroy_sock(struct sock *sk)
1917 {
1918         struct tcp_sock *tp = tcp_sk(sk);
1919
1920         tcp_clear_xmit_timers(sk);
1921
1922         tcp_cleanup_congestion_control(sk);
1923
1924         /* Clean up the write buffer. */
1925         tcp_write_queue_purge(sk);
1926
1927         /* Cleans up our, hopefully empty, out_of_order_queue. */
1928         __skb_queue_purge(&tp->out_of_order_queue);
1929
1930 #ifdef CONFIG_TCP_MD5SIG
1931         /* Clean up the MD5 key list, if any */
1932         if (tp->md5sig_info) {
1933                 tcp_v4_clear_md5_list(sk);
1934                 kfree(tp->md5sig_info);
1935                 tp->md5sig_info = NULL;
1936         }
1937 #endif
1938
1939 #ifdef CONFIG_NET_DMA
1940         /* Cleans up our sk_async_wait_queue */
1941         __skb_queue_purge(&sk->sk_async_wait_queue);
1942 #endif
1943
1944         /* Clean up the prequeue; it really should be empty already. */
1945         __skb_queue_purge(&tp->ucopy.prequeue);
1946
1947         /* Clean up a referenced TCP bind bucket. */
1948         if (inet_csk(sk)->icsk_bind_hash)
1949                 inet_put_port(sk);
1950
1951         /*
1952          * If a sendmsg cached page exists, toss it.
1953          */
1954         if (sk->sk_sndmsg_page) {
1955                 __free_page(sk->sk_sndmsg_page);
1956                 sk->sk_sndmsg_page = NULL;
1957         }
1958
1959         /* TCP Cookie Transactions */
1960         if (tp->cookie_values != NULL) {
1961                 kref_put(&tp->cookie_values->kref,
1962                          tcp_cookie_values_release);
1963                 tp->cookie_values = NULL;
1964         }
1965
1966         percpu_counter_dec(&tcp_sockets_allocated);
1967 }
1968
1969 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1970
1971 #ifdef CONFIG_PROC_FS
1972 /* Proc filesystem TCP sock list dumping. */
1973
1974 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1975 {
1976         return hlist_nulls_empty(head) ? NULL :
1977                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1978 }
1979
1980 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1981 {
1982         return !is_a_nulls(tw->tw_node.next) ?
1983                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1984 }
1985
1986 /*
1987  * Get the next listener socket following cur.  If cur is NULL, get the first
1988  * socket starting from the bucket given in st->bucket; when st->bucket is zero,
1989  * the very first socket in the hash table is returned.
1990  */
1991 static void *listening_get_next(struct seq_file *seq, void *cur)
1992 {
1993         struct inet_connection_sock *icsk;
1994         struct hlist_nulls_node *node;
1995         struct sock *sk = cur;
1996         struct inet_listen_hashbucket *ilb;
1997         struct tcp_iter_state *st = seq->private;
1998         struct net *net = seq_file_net(seq);
1999
2000         if (!sk) {
2001                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2002                 spin_lock_bh(&ilb->lock);
2003                 sk = sk_nulls_head(&ilb->head);
2004                 st->offset = 0;
2005                 goto get_sk;
2006         }
2007         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2008         ++st->num;
2009         ++st->offset;
2010
2011         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2012                 struct request_sock *req = cur;
2013
2014                 icsk = inet_csk(st->syn_wait_sk);
2015                 req = req->dl_next;
2016                 while (1) {
2017                         while (req) {
2018                                 if (req->rsk_ops->family == st->family) {
2019                                         cur = req;
2020                                         goto out;
2021                                 }
2022                                 req = req->dl_next;
2023                         }
2024                         st->offset = 0;
2025                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2026                                 break;
2027 get_req:
2028                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2029                 }
2030                 sk        = sk_next(st->syn_wait_sk);
2031                 st->state = TCP_SEQ_STATE_LISTENING;
2032                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033         } else {
2034                 icsk = inet_csk(sk);
2035                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2037                         goto start_req;
2038                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039                 sk = sk_next(sk);
2040         }
2041 get_sk:
2042         sk_nulls_for_each_from(sk, node) {
2043                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2044                         cur = sk;
2045                         goto out;
2046                 }
2047                 icsk = inet_csk(sk);
2048                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2049                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2050 start_req:
2051                         st->uid         = sock_i_uid(sk);
2052                         st->syn_wait_sk = sk;
2053                         st->state       = TCP_SEQ_STATE_OPENREQ;
2054                         st->sbucket     = 0;
2055                         goto get_req;
2056                 }
2057                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2058         }
2059         spin_unlock_bh(&ilb->lock);
2060         st->offset = 0;
2061         if (++st->bucket < INET_LHTABLE_SIZE) {
2062                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2063                 spin_lock_bh(&ilb->lock);
2064                 sk = sk_nulls_head(&ilb->head);
2065                 goto get_sk;
2066         }
2067         cur = NULL;
2068 out:
2069         return cur;
2070 }
2071
2072 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2073 {
2074         struct tcp_iter_state *st = seq->private;
2075         void *rc;
2076
2077         st->bucket = 0;
2078         st->offset = 0;
2079         rc = listening_get_next(seq, NULL);
2080
2081         while (rc && *pos) {
2082                 rc = listening_get_next(seq, rc);
2083                 --*pos;
2084         }
2085         return rc;
2086 }
2087
2088 static inline int empty_bucket(struct tcp_iter_state *st)
2089 {
2090         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2091                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2092 }
2093
2094 /*
2095  * Get the first established socket, starting from the bucket given in st->bucket.
2096  * If st->bucket is zero, the very first socket in the hash is returned.
2097  */
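     /*
      * Locking note: the bucket lock taken while walking a chain is left
      * held when an entry is returned to the seq_file core; it is dropped
      * either when iteration moves on to the next bucket or in
      * tcp_seq_stop().
      */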
2098 static void *established_get_first(struct seq_file *seq)
2099 {
2100         struct tcp_iter_state *st = seq->private;
2101         struct net *net = seq_file_net(seq);
2102         void *rc = NULL;
2103
2104         st->offset = 0;
2105         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2106                 struct sock *sk;
2107                 struct hlist_nulls_node *node;
2108                 struct inet_timewait_sock *tw;
2109                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2110
2111                 /* Lockless fast path for the common case of empty buckets */
2112                 if (empty_bucket(st))
2113                         continue;
2114
2115                 spin_lock_bh(lock);
2116                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2117                         if (sk->sk_family != st->family ||
2118                             !net_eq(sock_net(sk), net)) {
2119                                 continue;
2120                         }
2121                         rc = sk;
2122                         goto out;
2123                 }
2124                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2125                 inet_twsk_for_each(tw, node,
2126                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2127                         if (tw->tw_family != st->family ||
2128                             !net_eq(twsk_net(tw), net)) {
2129                                 continue;
2130                         }
2131                         rc = tw;
2132                         goto out;
2133                 }
2134                 spin_unlock_bh(lock);
2135                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2136         }
2137 out:
2138         return rc;
2139 }
2140
2141 static void *established_get_next(struct seq_file *seq, void *cur)
2142 {
2143         struct sock *sk = cur;
2144         struct inet_timewait_sock *tw;
2145         struct hlist_nulls_node *node;
2146         struct tcp_iter_state *st = seq->private;
2147         struct net *net = seq_file_net(seq);
2148
2149         ++st->num;
2150         ++st->offset;
2151
2152         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2153                 tw = cur;
2154                 tw = tw_next(tw);
2155 get_tw:
2156                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2157                         tw = tw_next(tw);
2158                 }
2159                 if (tw) {
2160                         cur = tw;
2161                         goto out;
2162                 }
2163                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2164                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2165
2166                 /* Look for the next non-empty bucket */
2167                 st->offset = 0;
2168                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2169                                 empty_bucket(st))
2170                         ;
2171                 if (st->bucket > tcp_hashinfo.ehash_mask)
2172                         return NULL;
2173
2174                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2175                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2176         } else
2177                 sk = sk_nulls_next(sk);
2178
2179         sk_nulls_for_each_from(sk, node) {
2180                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2181                         goto found;
2182         }
2183
2184         st->state = TCP_SEQ_STATE_TIME_WAIT;
2185         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2186         goto get_tw;
2187 found:
2188         cur = sk;
2189 out:
2190         return cur;
2191 }
2192
2193 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2194 {
2195         struct tcp_iter_state *st = seq->private;
2196         void *rc;
2197
2198         st->bucket = 0;
2199         rc = established_get_first(seq);
2200
2201         while (rc && pos) {
2202                 rc = established_get_next(seq, rc);
2203                 --pos;
2204         }
2205         return rc;
2206 }
2207
2208 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2209 {
2210         void *rc;
2211         struct tcp_iter_state *st = seq->private;
2212
2213         st->state = TCP_SEQ_STATE_LISTENING;
2214         rc        = listening_get_idx(seq, &pos);
2215
2216         if (!rc) {
2217                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2218                 rc        = established_get_idx(seq, pos);
2219         }
2220
2221         return rc;
2222 }
2223
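     /*
      * tcp_seek_last_pos() lets a reader that consumes /proc/net/tcp in
      * several read() calls resume from the remembered bucket and offset
      * instead of rescanning the hash tables from the start each time.
      */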
2224 static void *tcp_seek_last_pos(struct seq_file *seq)
2225 {
2226         struct tcp_iter_state *st = seq->private;
2227         int offset = st->offset;
2228         int orig_num = st->num;
2229         void *rc = NULL;
2230
2231         switch (st->state) {
2232         case TCP_SEQ_STATE_OPENREQ:
2233         case TCP_SEQ_STATE_LISTENING:
2234                 if (st->bucket >= INET_LHTABLE_SIZE)
2235                         break;
2236                 st->state = TCP_SEQ_STATE_LISTENING;
2237                 rc = listening_get_next(seq, NULL);
2238                 while (offset-- && rc)
2239                         rc = listening_get_next(seq, rc);
2240                 if (rc)
2241                         break;
2242                 st->bucket = 0;
2243                 /* Fallthrough */
2244         case TCP_SEQ_STATE_ESTABLISHED:
2245         case TCP_SEQ_STATE_TIME_WAIT:
2246                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2247                 if (st->bucket > tcp_hashinfo.ehash_mask)
2248                         break;
2249                 rc = established_get_first(seq);
2250                 while (offset-- && rc)
2251                         rc = established_get_next(seq, rc);
2252         }
2253
2254         st->num = orig_num;
2255
2256         return rc;
2257 }
2258
2259 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2260 {
2261         struct tcp_iter_state *st = seq->private;
2262         void *rc;
2263
2264         if (*pos && *pos == st->last_pos) {
2265                 rc = tcp_seek_last_pos(seq);
2266                 if (rc)
2267                         goto out;
2268         }
2269
2270         st->state = TCP_SEQ_STATE_LISTENING;
2271         st->num = 0;
2272         st->bucket = 0;
2273         st->offset = 0;
2274         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2275
2276 out:
2277         st->last_pos = *pos;
2278         return rc;
2279 }
2280
2281 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 {
2283         struct tcp_iter_state *st = seq->private;
2284         void *rc = NULL;
2285
2286         if (v == SEQ_START_TOKEN) {
2287                 rc = tcp_get_idx(seq, 0);
2288                 goto out;
2289         }
2290
2291         switch (st->state) {
2292         case TCP_SEQ_STATE_OPENREQ:
2293         case TCP_SEQ_STATE_LISTENING:
2294                 rc = listening_get_next(seq, v);
2295                 if (!rc) {
2296                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2297                         st->bucket = 0;
2298                         st->offset = 0;
2299                         rc        = established_get_first(seq);
2300                 }
2301                 break;
2302         case TCP_SEQ_STATE_ESTABLISHED:
2303         case TCP_SEQ_STATE_TIME_WAIT:
2304                 rc = established_get_next(seq, v);
2305                 break;
2306         }
2307 out:
2308         ++*pos;
2309         st->last_pos = *pos;
2310         return rc;
2311 }
2312
2313 static void tcp_seq_stop(struct seq_file *seq, void *v)
2314 {
2315         struct tcp_iter_state *st = seq->private;
2316
2317         switch (st->state) {
2318         case TCP_SEQ_STATE_OPENREQ:
2319                 if (v) {
2320                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2321                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2322                 }
2323         case TCP_SEQ_STATE_LISTENING:
2324                 if (v != SEQ_START_TOKEN)
2325                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2326                 break;
2327         case TCP_SEQ_STATE_TIME_WAIT:
2328         case TCP_SEQ_STATE_ESTABLISHED:
2329                 if (v)
2330                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2331                 break;
2332         }
2333 }
2334
2335 static int tcp_seq_open(struct inode *inode, struct file *file)
2336 {
2337         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2338         struct tcp_iter_state *s;
2339         int err;
2340
2341         err = seq_open_net(inode, file, &afinfo->seq_ops,
2342                           sizeof(struct tcp_iter_state));
2343         if (err < 0)
2344                 return err;
2345
2346         s = ((struct seq_file *)file->private_data)->private;
2347         s->family               = afinfo->family;
2348         s->last_pos             = 0;
2349         return 0;
2350 }
2351
2352 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2353 {
2354         int rc = 0;
2355         struct proc_dir_entry *p;
2356
2357         afinfo->seq_fops.open           = tcp_seq_open;
2358         afinfo->seq_fops.read           = seq_read;
2359         afinfo->seq_fops.llseek         = seq_lseek;
2360         afinfo->seq_fops.release        = seq_release_net;
2361
2362         afinfo->seq_ops.start           = tcp_seq_start;
2363         afinfo->seq_ops.next            = tcp_seq_next;
2364         afinfo->seq_ops.stop            = tcp_seq_stop;
2365
2366         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2367                              &afinfo->seq_fops, afinfo);
2368         if (!p)
2369                 rc = -ENOMEM;
2370         return rc;
2371 }
2372
2373 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2374 {
2375         proc_net_remove(net, afinfo->name);
2376 }
2377
2378 static void get_openreq4(struct sock *sk, struct request_sock *req,
2379                          struct seq_file *f, int i, int uid, int *len)
2380 {
2381         const struct inet_request_sock *ireq = inet_rsk(req);
2382         int ttd = req->expires - jiffies;
2383
2384         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2385                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2386                 i,
2387                 ireq->loc_addr,
2388                 ntohs(inet_sk(sk)->inet_sport),
2389                 ireq->rmt_addr,
2390                 ntohs(ireq->rmt_port),
2391                 TCP_SYN_RECV,
2392                 0, 0, /* could print option size, but that is af dependent. */
2393                 1,    /* timers active (only the expire timer) */
2394                 jiffies_to_clock_t(ttd),
2395                 req->retrans,
2396                 uid,
2397                 0,  /* non-standard timer */
2398                 0, /* open_requests have no inode */
2399                 atomic_read(&sk->sk_refcnt),
2400                 req,
2401                 len);
2402 }
2403
2404 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2405 {
2406         int timer_active;
2407         unsigned long timer_expires;
2408         struct tcp_sock *tp = tcp_sk(sk);
2409         const struct inet_connection_sock *icsk = inet_csk(sk);
2410         struct inet_sock *inet = inet_sk(sk);
2411         __be32 dest = inet->inet_daddr;
2412         __be32 src = inet->inet_rcv_saddr;
2413         __u16 destp = ntohs(inet->inet_dport);
2414         __u16 srcp = ntohs(inet->inet_sport);
2415         int rx_queue;
2416
2417         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2418                 timer_active    = 1;
2419                 timer_expires   = icsk->icsk_timeout;
2420         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2421                 timer_active    = 4;
2422                 timer_expires   = icsk->icsk_timeout;
2423         } else if (timer_pending(&sk->sk_timer)) {
2424                 timer_active    = 2;
2425                 timer_expires   = sk->sk_timer.expires;
2426         } else {
2427                 timer_active    = 0;
2428                 timer_expires = jiffies;
2429         }
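             /*
              * timer_active encodes the "tr" field of /proc/net/tcp:
              * 0 no timer, 1 retransmit, 2 another timer (e.g. keepalive),
              * 3 TIME_WAIT (see get_timewait4_sock()), 4 zero window probe.
              */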
2430
2431         if (sk->sk_state == TCP_LISTEN)
2432                 rx_queue = sk->sk_ack_backlog;
2433         else
2434                 /*
2435                  * Because we don't lock the socket, we might find a transient negative value.
2436                  */
2437                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2438
2439         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2440                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2441                 i, src, srcp, dest, destp, sk->sk_state,
2442                 tp->write_seq - tp->snd_una,
2443                 rx_queue,
2444                 timer_active,
2445                 jiffies_to_clock_t(timer_expires - jiffies),
2446                 icsk->icsk_retransmits,
2447                 sock_i_uid(sk),
2448                 icsk->icsk_probes_out,
2449                 sock_i_ino(sk),
2450                 atomic_read(&sk->sk_refcnt), sk,
2451                 jiffies_to_clock_t(icsk->icsk_rto),
2452                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2453                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2454                 tp->snd_cwnd,
2455                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2456                 len);
2457 }
2458
2459 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2460                                struct seq_file *f, int i, int *len)
2461 {
2462         __be32 dest, src;
2463         __u16 destp, srcp;
2464         int ttd = tw->tw_ttd - jiffies;
2465
2466         if (ttd < 0)
2467                 ttd = 0;
2468
2469         dest  = tw->tw_daddr;
2470         src   = tw->tw_rcv_saddr;
2471         destp = ntohs(tw->tw_dport);
2472         srcp  = ntohs(tw->tw_sport);
2473
2474         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2475                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2476                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2477                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2478                 atomic_read(&tw->tw_refcnt), tw, len);
2479 }
2480
2481 #define TMPSZ 150
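     /*
      * Each record below is padded to TMPSZ - 1 characters plus a newline,
      * so /proc/net/tcp consists of fixed-width lines; the trailing
      * seq_printf() in tcp4_seq_show() supplies the padding.
      */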
2482
2483 static int tcp4_seq_show(struct seq_file *seq, void *v)
2484 {
2485         struct tcp_iter_state *st;
2486         int len;
2487
2488         if (v == SEQ_START_TOKEN) {
2489                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2490                            "  sl  local_address rem_address   st tx_queue "
2491                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2492                            "inode");
2493                 goto out;
2494         }
2495         st = seq->private;
2496
2497         switch (st->state) {
2498         case TCP_SEQ_STATE_LISTENING:
2499         case TCP_SEQ_STATE_ESTABLISHED:
2500                 get_tcp4_sock(v, seq, st->num, &len);
2501                 break;
2502         case TCP_SEQ_STATE_OPENREQ:
2503                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2504                 break;
2505         case TCP_SEQ_STATE_TIME_WAIT:
2506                 get_timewait4_sock(v, seq, st->num, &len);
2507                 break;
2508         }
2509         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2510 out:
2511         return 0;
2512 }
2513
2514 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2515         .name           = "tcp",
2516         .family         = AF_INET,
2517         .seq_fops       = {
2518                 .owner          = THIS_MODULE,
2519         },
2520         .seq_ops        = {
2521                 .show           = tcp4_seq_show,
2522         },
2523 };
2524
2525 static int __net_init tcp4_proc_init_net(struct net *net)
2526 {
2527         return tcp_proc_register(net, &tcp4_seq_afinfo);
2528 }
2529
2530 static void __net_exit tcp4_proc_exit_net(struct net *net)
2531 {
2532         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2533 }
2534
2535 static struct pernet_operations tcp4_net_ops = {
2536         .init = tcp4_proc_init_net,
2537         .exit = tcp4_proc_exit_net,
2538 };
2539
2540 int __init tcp4_proc_init(void)
2541 {
2542         return register_pernet_subsys(&tcp4_net_ops);
2543 }
2544
2545 void tcp4_proc_exit(void)
2546 {
2547         unregister_pernet_subsys(&tcp4_net_ops);
2548 }
2549 #endif /* CONFIG_PROC_FS */
2550
2551 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2552 {
2553         struct iphdr *iph = skb_gro_network_header(skb);
2554
2555         switch (skb->ip_summed) {
2556         case CHECKSUM_COMPLETE:
2557                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2558                                   skb->csum)) {
2559                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2560                         break;
2561                 }
2562
2563                 /* fall through */
2564         case CHECKSUM_NONE:
2565                 NAPI_GRO_CB(skb)->flush = 1;
2566                 return NULL;
2567         }
2568
2569         return tcp_gro_receive(head, skb);
2570 }
2571 EXPORT_SYMBOL(tcp4_gro_receive);
2572
2573 int tcp4_gro_complete(struct sk_buff *skb)
2574 {
2575         struct iphdr *iph = ip_hdr(skb);
2576         struct tcphdr *th = tcp_hdr(skb);
2577
2578         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2579                                   iph->saddr, iph->daddr, 0);
2580         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2581
2582         return tcp_gro_complete(skb);
2583 }
2584 EXPORT_SYMBOL(tcp4_gro_complete);
2585
2586 struct proto tcp_prot = {
2587         .name                   = "TCP",
2588         .owner                  = THIS_MODULE,
2589         .close                  = tcp_close,
2590         .connect                = tcp_v4_connect,
2591         .disconnect             = tcp_disconnect,
2592         .accept                 = inet_csk_accept,
2593         .ioctl                  = tcp_ioctl,
2594         .init                   = tcp_v4_init_sock,
2595         .destroy                = tcp_v4_destroy_sock,
2596         .shutdown               = tcp_shutdown,
2597         .setsockopt             = tcp_setsockopt,
2598         .getsockopt             = tcp_getsockopt,
2599         .recvmsg                = tcp_recvmsg,
2600         .backlog_rcv            = tcp_v4_do_rcv,
2601         .hash                   = inet_hash,
2602         .unhash                 = inet_unhash,
2603         .get_port               = inet_csk_get_port,
2604         .enter_memory_pressure  = tcp_enter_memory_pressure,
2605         .sockets_allocated      = &tcp_sockets_allocated,
2606         .orphan_count           = &tcp_orphan_count,
2607         .memory_allocated       = &tcp_memory_allocated,
2608         .memory_pressure        = &tcp_memory_pressure,
2609         .sysctl_mem             = sysctl_tcp_mem,
2610         .sysctl_wmem            = sysctl_tcp_wmem,
2611         .sysctl_rmem            = sysctl_tcp_rmem,
2612         .max_header             = MAX_TCP_HEADER,
2613         .obj_size               = sizeof(struct tcp_sock),
2614         .slab_flags             = SLAB_DESTROY_BY_RCU,
2615         .twsk_prot              = &tcp_timewait_sock_ops,
2616         .rsk_prot               = &tcp_request_sock_ops,
2617         .h.hashinfo             = &tcp_hashinfo,
2618 #ifdef CONFIG_COMPAT
2619         .compat_setsockopt      = compat_tcp_setsockopt,
2620         .compat_getsockopt      = compat_tcp_getsockopt,
2621 #endif
2622 };
2623
2624
2625 static int __net_init tcp_sk_init(struct net *net)
2626 {
2627         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2628                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2629 }
2630
2631 static void __net_exit tcp_sk_exit(struct net *net)
2632 {
2633         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2634 }
2635
2636 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2637 {
2638         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2639 }
2640
2641 static struct pernet_operations __net_initdata tcp_sk_ops = {
2642        .init       = tcp_sk_init,
2643        .exit       = tcp_sk_exit,
2644        .exit_batch = tcp_sk_exit_batch,
2645 };
2646
2647 void __init tcp_v4_init(void)
2648 {
2649         inet_hashinfo_init(&tcp_hashinfo);
2650         if (register_pernet_subsys(&tcp_sk_ops))
2651                 panic("Failed to create the TCP control socket.\n");
2652 }
2653
2654 EXPORT_SYMBOL(ipv4_specific);
2655 EXPORT_SYMBOL(tcp_hashinfo);
2656 EXPORT_SYMBOL(tcp_prot);
2657 EXPORT_SYMBOL(tcp_v4_conn_request);
2658 EXPORT_SYMBOL(tcp_v4_connect);
2659 EXPORT_SYMBOL(tcp_v4_do_rcv);
2660 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2661 EXPORT_SYMBOL(tcp_v4_send_check);
2662 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2663
2664 #ifdef CONFIG_PROC_FS
2665 EXPORT_SYMBOL(tcp_proc_register);
2666 EXPORT_SYMBOL(tcp_proc_unregister);
2667 #endif
2668 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2669