net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63
  64 #include <net/net_namespace.h>
  65 #include <net/icmp.h>
  66 #include <net/inet_hashtables.h>
  67 #include <net/tcp.h>
  68 #include <net/transp_v6.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/timewait_sock.h>
  72 #include <net/xfrm.h>
  73 #include <net/netdma.h>
  74
  75 #include <linux/inet.h>
  76 #include <linux/ipv6.h>
  77 #include <linux/stddef.h>
  78 #include <linux/proc_fs.h>
  79 #include <linux/seq_file.h>
  80
  81 #include <linux/crypto.h>
  82 #include <linux/scatterlist.h>
  83
   84 int sysctl_tcp_tw_reuse __read_mostly;      /* tunable: when non-zero, tcp_twsk_unique() may hand a TIME-WAIT bucket to a new connection */
   85 int sysctl_tcp_low_latency __read_mostly;   /* tunable defined here; its readers are outside this part of the file */
  86
  87
   88 #ifdef CONFIG_TCP_MD5SIG /* MD5 signatures built in: forward-declare helpers that are used before they are defined */
   89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
   90                                                    __be32 addr);
   91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
   92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
   93 #else /* no MD5 support: the lookup collapses to a stub so callers need no #ifdef */
   94 static inline
   95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
   96 {
   97         return NULL; /* never any key for any address */
   98 }
   99 #endif
 100
  101 struct inet_hashinfo tcp_hashinfo; /* TCP socket lookup tables; e.g. passed to inet_lookup() by tcp_v4_err() */
 102
 103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 104 {
 105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 106                                           ip_hdr(skb)->saddr,
 107                                           tcp_hdr(skb)->dest,
 108                                           tcp_hdr(skb)->source);
 109 }
 110
 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112 {
 113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114         struct tcp_sock *tp = tcp_sk(sk);
 115
 116         /* With PAWS, it is safe from the viewpoint
 117            of data integrity. Even without PAWS it is safe provided sequence
 118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 119
 120            Actually, the idea is close to VJ's one, only timestamp cache is
 121            held not per host, but per port pair and TW bucket is used as state
 122            holder.
 123
 124            If TW bucket has been already destroyed we fall back to VJ's scheme
 125            and use initial timestamp retrieved from peer table.
 126          */
 127         if (tcptw->tw_ts_recent_stamp &&
 128             (twp == NULL || (sysctl_tcp_tw_reuse &&
 129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 131                 if (tp->write_seq == 0)
 132                         tp->write_seq = 1;
 133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 135                 sock_hold(sktw);
 136                 return 1;
 137         }
 138
 139         return 0;
 140 }
 141
 142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 143
 144 /* This will initiate an outgoing connection. */
 145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 146 {
 147         struct inet_sock *inet = inet_sk(sk);
 148         struct tcp_sock *tp = tcp_sk(sk);
 149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 150         struct rtable *rt;
 151         __be32 daddr, nexthop;
 152         int tmp;
 153         int err;
 154
 155         if (addr_len < sizeof(struct sockaddr_in))
 156                 return -EINVAL;
 157
 158         if (usin->sin_family != AF_INET)
 159                 return -EAFNOSUPPORT;
 160
 161         nexthop = daddr = usin->sin_addr.s_addr;
 162         if (inet->opt && inet->opt->srr) {
 163                 if (!daddr)
 164                         return -EINVAL;
 165                 nexthop = inet->opt->faddr;
 166         }
 167
 168         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 170                                IPPROTO_TCP,
 171                                inet->inet_sport, usin->sin_port, sk, 1);
 172         if (tmp < 0) {
 173                 if (tmp == -ENETUNREACH)
 174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 175                 return tmp;
 176         }
 177
 178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 179                 ip_rt_put(rt);
 180                 return -ENETUNREACH;
 181         }
 182
 183         if (!inet->opt || !inet->opt->srr)
 184                 daddr = rt->rt_dst;
 185
 186         if (!inet->inet_saddr)
 187                 inet->inet_saddr = rt->rt_src;
 188         inet->inet_rcv_saddr = inet->inet_saddr;
 189
 190         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 191                 /* Reset inherited state */
 192                 tp->rx_opt.ts_recent       = 0;
 193                 tp->rx_opt.ts_recent_stamp = 0;
 194                 tp->write_seq              = 0;
 195         }
 196
 197         if (tcp_death_row.sysctl_tw_recycle &&
 198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 199                 struct inet_peer *peer = rt_get_peer(rt);
 200                 /*
 201                  * VJ's idea. We save last timestamp seen from
 202                  * the destination in peer table, when entering state
 203                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 204                  * when trying new connection.
 205                  */
 206                 if (peer != NULL &&
 207                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 209                         tp->rx_opt.ts_recent = peer->tcp_ts;
 210                 }
 211         }
 212
 213         inet->inet_dport = usin->sin_port;
 214         inet->inet_daddr = daddr;
 215
 216         inet_csk(sk)->icsk_ext_hdr_len = 0;
 217         if (inet->opt)
 218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 219
 220         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 221
 222         /* Socket identity is still unknown (sport may be zero).
 223          * However we set state to SYN-SENT and not releasing socket
 224          * lock select source port, enter ourselves into the hash tables and
 225          * complete initialization after this.
 226          */
 227         tcp_set_state(sk, TCP_SYN_SENT);
 228         err = inet_hash_connect(&tcp_death_row, sk);
 229         if (err)
 230                 goto failure;
 231
 232         err = ip_route_newports(&rt, IPPROTO_TCP,
 233                                 inet->inet_sport, inet->inet_dport, sk);
 234         if (err)
 235                 goto failure;
 236
 237         /* OK, now commit destination to socket.  */
 238         sk->sk_gso_type = SKB_GSO_TCPV4;
 239         sk_setup_caps(sk, &rt->u.dst);
 240
 241         if (!tp->write_seq)
 242                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 243                                                            inet->inet_daddr,
 244                                                            inet->inet_sport,
 245                                                            usin->sin_port);
 246
 247         inet->inet_id = tp->write_seq ^ jiffies;
 248
 249         err = tcp_connect(sk);
 250         rt = NULL;
 251         if (err)
 252                 goto failure;
 253
 254         return 0;
 255
 256 failure:
 257         /*
 258          * This unhashes the socket and releases the local port,
 259          * if necessary.
 260          */
 261         tcp_set_state(sk, TCP_CLOSE);
 262         ip_rt_put(rt);
 263         sk->sk_route_caps = 0;
 264         inet->inet_dport = 0;
 265         return err;
 266 }
 267
 268 /*
 269  * This routine does path mtu discovery as defined in RFC1191.
 270  */
 271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 272 {
 273         struct dst_entry *dst;
 274         struct inet_sock *inet = inet_sk(sk);
 275
 276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 277          * send out by Linux are always <576bytes so they should go through
 278          * unfragmented).
 279          */
 280         if (sk->sk_state == TCP_LISTEN)
 281                 return;
 282
 283         /* We don't check in the destentry if pmtu discovery is forbidden
 284          * on this route. We just assume that no packet_to_big packets
 285          * are send back when pmtu discovery is not active.
 286          * There is a small race when the user changes this flag in the
 287          * route, but I think that's acceptable.
 288          */
 289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 290                 return;
 291
 292         dst->ops->update_pmtu(dst, mtu);
 293
 294         /* Something is about to be wrong... Remember soft error
 295          * for the case, if this connection will not able to recover.
 296          */
 297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 298                 sk->sk_err_soft = EMSGSIZE;
 299
 300         mtu = dst_mtu(dst);
 301
 302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 304                 tcp_sync_mss(sk, mtu);
 305
 306                 /* Resend the TCP packet because it's
 307                  * clear that the old packet has been
 308                  * dropped. This is the new "fast" path mtu
 309                  * discovery.
 310                  */
 311                 tcp_simple_retransmit(sk);
 312         } /* else let the usual retransmit timer handle it */
 313 }
 314
 315 /*
 316  * This routine is called by the ICMP module when it gets some
 317  * sort of error condition.  If err < 0 then the socket should
 318  * be closed and the error returned to the user.  If err > 0
 319  * it's just the icmp type << 8 | icmp code.  After adjustment
 320  * header points to the first 8 bytes of the tcp header.  We need
 321  * to find the appropriate port.
 322  *
 323  * The locking strategy used here is very "optimistic". When
 324  * someone else accesses the socket the ICMP is just dropped
 325  * and for some paths there is no check at all.
 326  * A more general error queue to queue errors for later handling
 327  * is probably better.
 328  *
 329  */
 330
 331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 332 {
 333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 335         struct inet_connection_sock *icsk;
 336         struct tcp_sock *tp;
 337         struct inet_sock *inet;
 338         const int type = icmp_hdr(icmp_skb)->type;
 339         const int code = icmp_hdr(icmp_skb)->code;
 340         struct sock *sk;
 341         struct sk_buff *skb;
 342         __u32 seq;
 343         __u32 remaining;
 344         int err;
 345         struct net *net = dev_net(icmp_skb->dev);
 346
 347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 349                 return;
 350         }
 351
 352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 353                         iph->saddr, th->source, inet_iif(icmp_skb));
 354         if (!sk) {
 355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 356                 return;
 357         }
 358         if (sk->sk_state == TCP_TIME_WAIT) {
 359                 inet_twsk_put(inet_twsk(sk));
 360                 return;
 361         }
 362
 363         bh_lock_sock(sk);
 364         /* If too many ICMPs get dropped on busy
 365          * servers this needs to be solved differently.
 366          */
 367         if (sock_owned_by_user(sk))
 368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 369
 370         if (sk->sk_state == TCP_CLOSE)
 371                 goto out;
 372
 373         icsk = inet_csk(sk);
 374         tp = tcp_sk(sk);
 375         seq = ntohl(th->seq);
 376         if (sk->sk_state != TCP_LISTEN &&
 377             !between(seq, tp->snd_una, tp->snd_nxt)) {
 378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 379                 goto out;
 380         }
 381
 382         switch (type) {
 383         case ICMP_SOURCE_QUENCH:
 384                 /* Just silently ignore these. */
 385                 goto out;
 386         case ICMP_PARAMETERPROB:
 387                 err = EPROTO;
 388                 break;
 389         case ICMP_DEST_UNREACH:
 390                 if (code > NR_ICMP_UNREACH)
 391                         goto out;
 392
 393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 394                         if (!sock_owned_by_user(sk))
 395                                 do_pmtu_discovery(sk, iph, info);
 396                         goto out;
 397                 }
 398
 399                 err = icmp_err_convert[code].errno;
 400                 /* check if icmp_skb allows revert of backoff
 401                  * (see draft-zimmermann-tcp-lcd) */
 402                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 403                         break;
 404                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 405                     !icsk->icsk_backoff)
 406                         break;
 407
 408                 icsk->icsk_backoff--;
 409                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 410                                          icsk->icsk_backoff;
 411                 tcp_bound_rto(sk);
 412
 413                 skb = tcp_write_queue_head(sk);
 414                 BUG_ON(!skb);
 415
 416                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 417                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 418
 419                 if (remaining) {
 420                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 421                                                   remaining, TCP_RTO_MAX);
 422                 } else if (sock_owned_by_user(sk)) {
 423                         /* RTO revert clocked out retransmission,
 424                          * but socket is locked. Will defer. */
 425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 426                                                   HZ/20, TCP_RTO_MAX);
 427                 } else {
 428                         /* RTO revert clocked out retransmission.
 429                          * Will retransmit now */
 430                         tcp_retransmit_timer(sk);
 431                 }
 432
 433                 break;
 434         case ICMP_TIME_EXCEEDED:
 435                 err = EHOSTUNREACH;
 436                 break;
 437         default:
 438                 goto out;
 439         }
 440
 441         switch (sk->sk_state) {
 442                 struct request_sock *req, **prev;
 443         case TCP_LISTEN:
 444                 if (sock_owned_by_user(sk))
 445                         goto out;
 446
 447                 req = inet_csk_search_req(sk, &prev, th->dest,
 448                                           iph->daddr, iph->saddr);
 449                 if (!req)
 450                         goto out;
 451
 452                 /* ICMPs are not backlogged, hence we cannot get
 453                    an established socket here.
 454                  */
 455                 WARN_ON(req->sk);
 456
 457                 if (seq != tcp_rsk(req)->snt_isn) {
 458                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 459                         goto out;
 460                 }
 461
 462                 /*
 463                  * Still in SYN_RECV, just remove it silently.
 464                  * There is no good way to pass the error to the newly
 465                  * created socket, and POSIX does not want network
 466                  * errors returned from accept().
 467                  */
 468                 inet_csk_reqsk_queue_drop(sk, req, prev);
 469                 goto out;
 470
 471         case TCP_SYN_SENT:
 472         case TCP_SYN_RECV:  /* Cannot happen.
 473                                It can f.e. if SYNs crossed.
 474                              */
 475                 if (!sock_owned_by_user(sk)) {
 476                         sk->sk_err = err;
 477
 478                         sk->sk_error_report(sk);
 479
 480                         tcp_done(sk);
 481                 } else {
 482                         sk->sk_err_soft = err;
 483                 }
 484                 goto out;
 485         }
 486
 487         /* If we've already connected we will keep trying
 488          * until we time out, or the user gives up.
 489          *
 490          * rfc1122 4.2.3.9 allows to consider as hard errors
 491          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 492          * but it is obsoleted by pmtu discovery).
 493          *
 494          * Note, that in modern internet, where routing is unreliable
 495          * and in each dark corner broken firewalls sit, sending random
 496          * errors ordered by their masters even this two messages finally lose
 497          * their original sense (even Linux sends invalid PORT_UNREACHs)
 498          *
 499          * Now we are in compliance with RFCs.
 500          *                                                      --ANK (980905)
 501          */
 502
 503         inet = inet_sk(sk);
 504         if (!sock_owned_by_user(sk) && inet->recverr) {
 505                 sk->sk_err = err;
 506                 sk->sk_error_report(sk);
 507         } else  { /* Only an error on timeout */
 508                 sk->sk_err_soft = err;
 509         }
 510
 511 out:
 512         bh_unlock_sock(sk);
 513         sock_put(sk);
 514 }
 515
 516 /* This routine computes an IPv4 TCP checksum. */
 517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 518 {
 519         struct inet_sock *inet = inet_sk(sk);
 520         struct tcphdr *th = tcp_hdr(skb);
 521
 522         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 523                 th->check = ~tcp_v4_check(len, inet->inet_saddr,
 524                                           inet->inet_daddr, 0);
 525                 skb->csum_start = skb_transport_header(skb) - skb->head;
 526                 skb->csum_offset = offsetof(struct tcphdr, check);
 527         } else {
 528                 th->check = tcp_v4_check(len, inet->inet_saddr,
 529                                          inet->inet_daddr,
 530                                          csum_partial(th,
 531                                                       th->doff << 2,
 532                                                       skb->csum));
 533         }
 534 }
 535
 536 int tcp_v4_gso_send_check(struct sk_buff *skb)
 537 {
 538         const struct iphdr *iph;
 539         struct tcphdr *th;
 540
 541         if (!pskb_may_pull(skb, sizeof(*th)))
 542                 return -EINVAL;
 543
 544         iph = ip_hdr(skb);
 545         th = tcp_hdr(skb);
 546
 547         th->check = 0;
 548         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 549         skb->csum_start = skb_transport_header(skb) - skb->head;
 550         skb->csum_offset = offsetof(struct tcphdr, check);
 551         skb->ip_summed = CHECKSUM_PARTIAL;
 552         return 0;
 553 }
 554
 555 /*
 556  *      This routine will send an RST to the other tcp.
 557  *
 558  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 559  *                    for reset.
 560  *      Answer: if a packet caused RST, it is not for a socket
 561  *              existing in our system, if it is matched to a socket,
 562  *              it is just duplicate segment or bug in other side's TCP.
 563  *              So that we build reply only basing on parameters
 564  *              arrived with segment.
 565  *      Exception: precedence violation. We do not implement it in any case.
 566  */
 567
 568 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 569 {
 570         struct tcphdr *th = tcp_hdr(skb);
 571         struct {
 572                 struct tcphdr th;
 573 #ifdef CONFIG_TCP_MD5SIG
 574                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 575 #endif
 576         } rep;
 577         struct ip_reply_arg arg;
 578 #ifdef CONFIG_TCP_MD5SIG
 579         struct tcp_md5sig_key *key;
 580 #endif
 581         struct net *net;
 582
 583         /* Never send a reset in response to a reset. */
 584         if (th->rst)
 585                 return;
 586
 587         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 588                 return;
 589
 590         /* Swap the send and the receive. */
 591         memset(&rep, 0, sizeof(rep));
 592         rep.th.dest   = th->source;
 593         rep.th.source = th->dest;
 594         rep.th.doff   = sizeof(struct tcphdr) / 4;
 595         rep.th.rst    = 1;
 596
 597         if (th->ack) {
 598                 rep.th.seq = th->ack_seq;
 599         } else {
 600                 rep.th.ack = 1;
 601                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 602                                        skb->len - (th->doff << 2));
 603         }
 604
 605         memset(&arg, 0, sizeof(arg));
 606         arg.iov[0].iov_base = (unsigned char *)&rep;
 607         arg.iov[0].iov_len  = sizeof(rep.th);
 608
 609 #ifdef CONFIG_TCP_MD5SIG
 610         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 611         if (key) {
 612                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 613                                    (TCPOPT_NOP << 16) |
 614                                    (TCPOPT_MD5SIG << 8) |
 615                                    TCPOLEN_MD5SIG);
 616                 /* Update length and the length the header thinks exists */
 617                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 618                 rep.th.doff = arg.iov[0].iov_len / 4;
 619
 620                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 621                                      key, ip_hdr(skb)->saddr,
 622                                      ip_hdr(skb)->daddr, &rep.th);
 623         }
 624 #endif
 625         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 626                                       ip_hdr(skb)->saddr, /* XXX */
 627                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 628         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 629         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 630
 631         net = dev_net(skb_dst(skb)->dev);
 632         ip_send_reply(net->ipv4.tcp_sock, skb,
 633                       &arg, arg.iov[0].iov_len);
 634
 635         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 636         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 637 }
 638
 639 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 640    outside socket context is ugly, certainly. What can I do?
 641  */
 642
 643 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 644                             u32 win, u32 ts, int oif,
 645                             struct tcp_md5sig_key *key,
 646                             int reply_flags)
 647 {
 648         struct tcphdr *th = tcp_hdr(skb);
 649         struct {
 650                 struct tcphdr th;
 651                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 652 #ifdef CONFIG_TCP_MD5SIG
 653                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 654 #endif
 655                         ];
 656         } rep;
 657         struct ip_reply_arg arg;
 658         struct net *net = dev_net(skb_dst(skb)->dev);
 659
 660         memset(&rep.th, 0, sizeof(struct tcphdr));
 661         memset(&arg, 0, sizeof(arg));
 662
 663         arg.iov[0].iov_base = (unsigned char *)&rep;
 664         arg.iov[0].iov_len  = sizeof(rep.th);
 665         if (ts) {
 666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 667                                    (TCPOPT_TIMESTAMP << 8) |
 668                                    TCPOLEN_TIMESTAMP);
 669                 rep.opt[1] = htonl(tcp_time_stamp);
 670                 rep.opt[2] = htonl(ts);
 671                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 672         }
 673
 674         /* Swap the send and the receive. */
 675         rep.th.dest    = th->source;
 676         rep.th.source  = th->dest;
 677         rep.th.doff    = arg.iov[0].iov_len / 4;
 678         rep.th.seq     = htonl(seq);
 679         rep.th.ack_seq = htonl(ack);
 680         rep.th.ack     = 1;
 681         rep.th.window  = htons(win);
 682
 683 #ifdef CONFIG_TCP_MD5SIG
 684         if (key) {
 685                 int offset = (ts) ? 3 : 0;
 686
 687                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 688                                           (TCPOPT_NOP << 16) |
 689                                           (TCPOPT_MD5SIG << 8) |
 690                                           TCPOLEN_MD5SIG);
 691                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 692                 rep.th.doff = arg.iov[0].iov_len/4;
 693
 694                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 695                                     key, ip_hdr(skb)->saddr,
 696                                     ip_hdr(skb)->daddr, &rep.th);
 697         }
 698 #endif
 699         arg.flags = reply_flags;
 700         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 701                                       ip_hdr(skb)->saddr, /* XXX */
 702                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 703         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 704         if (oif)
 705                 arg.bound_dev_if = oif;
 706
 707         ip_send_reply(net->ipv4.tcp_sock, skb,
 708                       &arg, arg.iov[0].iov_len);
 709
 710         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 711 }
 712
 713 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 714 {
 715         struct inet_timewait_sock *tw = inet_twsk(sk);
 716         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 717
 718         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 719                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 720                         tcptw->tw_ts_recent,
 721                         tw->tw_bound_dev_if,
 722                         tcp_twsk_md5_key(tcptw),
 723                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 724                         );
 725
 726         inet_twsk_put(tw);
 727 }
 728
 729 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 730                                   struct request_sock *req)
 731 {
 732         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 734                         req->ts_recent,
 735                         0,
 736                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 737                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 738 }
 739
 740 /*
 741  *      Send a SYN-ACK after having received a SYN.
 742  *      This still operates on a request_sock only, not on a big
 743  *      socket.
      *
      *      If dst is NULL a route is looked up with inet_csk_route_req().
      *      Once a dst is held it is released with dst_release() before
      *      returning, including a dst that was passed in by the caller.
      *
      *      Returns -1 when no route or no skb could be obtained, otherwise
      *      the value of net_xmit_eval() applied to the result of
      *      ip_build_and_send_pkt().
 744  */
 745 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 746                                 struct dst_entry *dst)
 747 {
 748         const struct inet_request_sock *ireq = inet_rsk(req);
 749         int err = -1;
 750         struct sk_buff * skb;
 751
 752         /* First, grab a route. */
 753         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 754                 return -1;
 755
 756         skb = tcp_make_synack(sk, dst, req);
 757
 758         if (skb) {
 759                 struct tcphdr *th = tcp_hdr(skb);
 760
                     /* Checksum over skb->len bytes, using the addresses
                      * stored in the request rather than in a socket. */
 761                 th->check = tcp_v4_check(skb->len,
 762                                          ireq->loc_addr,
 763                                          ireq->rmt_addr,
 764                                          csum_partial(th, skb->len,
 765                                                       skb->csum));
 766
 767                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 768                                             ireq->rmt_addr,
 769                                             ireq->opt);
 770                 err = net_xmit_eval(err);
 771         }
 772
             /* err is still -1 here when tcp_make_synack() returned NULL. */
 773         dst_release(dst);
 774         return err;
 775 }
 776
 777 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 778 {
             /*
              * SYN-ACK sender used when no route is at hand: passing a
              * NULL dst makes __tcp_v4_send_synack() look the route up
              * itself.  Installed as .rtx_syn_ack in tcp_request_sock_ops.
              */
 779         return __tcp_v4_send_synack(sk, req, NULL);
 780 }
 781
 782 /*
 783  *      IPv4 request_sock destructor.
      *      Frees the IP options pointer stored in the request (opt).
 784  */
 785 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 786 {
 787         kfree(inet_rsk(req)->opt);
 788 }
 789
 790 #ifdef CONFIG_SYN_COOKIES
 791 static void syn_flood_warning(struct sk_buff *skb)
 792 {
             /*
              * Log a notice naming the destination port of skb, at most
              * once every HZ * 60 jiffies.  warntime is a function-local
              * static, so the rate limit is shared by all callers; it is
              * read and written here without any locking.
              */
 793         static unsigned long warntime;
 794
 795         if (time_after(jiffies, (warntime + HZ * 60))) {
 796                 warntime = jiffies;
 797                 printk(KERN_INFO
 798                        "possible SYN flooding on port %d. Sending cookies.\n",
 799                        ntohs(tcp_hdr(skb)->dest));
 800         }
 801 }
 802 #endif
 803
 804 /*
 805  * Save and compile IPv4 options into the request_sock if needed.
      *
      * Returns a buffer allocated with kmalloc(GFP_ATOMIC) and filled in by
      * ip_options_echo(), or NULL when skb carries no options (optlen == 0),
      * the allocation fails, or ip_options_echo() returns non-zero.
      * sk is not used by this function.
 806  */
 807 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 808                                               struct sk_buff *skb)
 809 {
 810         struct ip_options *opt = &(IPCB(skb)->opt);
 811         struct ip_options *dopt = NULL;
 812
             /* opt is the address of a field inside IPCB(skb). */
 813         if (opt && opt->optlen) {
 814                 int opt_size = optlength(opt);
 815                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 816                 if (dopt) {
 817                         if (ip_options_echo(dopt, skb)) {
 818                                 kfree(dopt);
 819                                 dopt = NULL;
 820                         }
 821                 }
 822         }
 823         return dopt;
 824 }
 825
 826 #ifdef CONFIG_TCP_MD5SIG
 827 /*
 828  * RFC2385 MD5 checksumming requires a mapping of
 829  * IP address->MD5 Key.
 830  * We need to maintain these in the sk structure.
 831  */
 832
 833 /* Find the Key structure for an address.
      *
      * Linear scan of the socket's IPv4 key array (keys4, entries4 items).
      * Returns a pointer to the base key of the first entry whose addr
      * equals addr, or NULL when the socket has no md5sig_info, no IPv4
      * entries, or no matching entry.
      */
 834 static struct tcp_md5sig_key *
 835                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 836 {
 837         struct tcp_sock *tp = tcp_sk(sk);
 838         int i;
 839
 840         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 841                 return NULL;
 842         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 843                 if (tp->md5sig_info->keys4[i].addr == addr)
 844                         return &tp->md5sig_info->keys4[i].base;
 845         }
 846         return NULL;
 847 }
 848
 849 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 850                                          struct sock *addr_sk)
 851 {
             /*
              * Look up the key held by sk for the address stored in
              * addr_sk's inet_daddr.  Returns NULL when there is none.
              */
 852         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 853 }
 854
 855 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 856
 857 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 858                                                       struct request_sock *req)
 859 {
             /*
              * Look up the key held by sk for the remote address recorded
              * in the request (rmt_addr).  Returns NULL when there is none.
              */
 860         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 861 }
 862
 863 /* This can be called on a newly created socket, from other files */
 864 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 865                       u8 *newkey, u8 newkeylen)
 866 {
             /*
              * Install newkey/newkeylen as the MD5 key for addr on sk.
              *
              * Ownership of newkey passes to this function: it is either
              * stored in the key table or kfree()d on every failure path,
              * so the caller must not free it afterwards.
              *
              * Returns 0 on success and -ENOMEM when an allocation or
              * tcp_alloc_md5sig_pool() fails.
              */
 867         /* Add Key to the list */
 868         struct tcp_md5sig_key *key;
 869         struct tcp_sock *tp = tcp_sk(sk);
 870         struct tcp4_md5sig_key *keys;
 871
 872         key = tcp_v4_md5_do_lookup(sk, addr);
 873         if (key) {
 874                 /* Pre-existing entry - just update that one. */
 875                 kfree(key->key);
 876                 key->key = newkey;
 877                 key->keylen = newkeylen;
 878         } else {
 879                 struct tcp_md5sig_info *md5sig;
 880
                     /* First key on this socket: create the container and
                      * clear the GSO bits from the socket's route caps. */
 881                 if (!tp->md5sig_info) {
 882                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 883                                                   GFP_ATOMIC);
 884                         if (!tp->md5sig_info) {
 885                                 kfree(newkey);
 886                                 return -ENOMEM;
 887                         }
 888                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 889                 }
                     /* Called once for every entry that gets added. */
 890                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 891                         kfree(newkey);
 892                         return -ENOMEM;
 893                 }
 894                 md5sig = tp->md5sig_info;
 895
                     /* Array is full: grow it by exactly one slot. */
 896                 if (md5sig->alloced4 == md5sig->entries4) {
 897                         keys = kmalloc((sizeof(*keys) *
 898                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 899                         if (!keys) {
 900                                 kfree(newkey);
                                     /* undo the pool call made above */
 901                                 tcp_free_md5sig_pool();
 902                                 return -ENOMEM;
 903                         }
 904
 905                         if (md5sig->entries4)
 906                                 memcpy(keys, md5sig->keys4,
 907                                        sizeof(*keys) * md5sig->entries4);
 908
 909                         /* Free old key list, and reference new one */
 910                         kfree(md5sig->keys4);
 911                         md5sig->keys4 = keys;
 912                         md5sig->alloced4++;
 913                 }
                     /* Append the new entry in the last slot. */
 914                 md5sig->entries4++;
 915                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 916                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 917                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 918         }
 919         return 0;
 920 }
 921
 922 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 923
 924 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 925                                u8 *newkey, u8 newkeylen)
 926 {
             /*
              * Socket-addressed wrapper around tcp_v4_md5_do_add(): the key
              * is registered on sk for addr_sk's inet_daddr.  The return
              * value and the handling of newkey are those of
              * tcp_v4_md5_do_add().
              */
 927         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 928                                  newkey, newkeylen);
 929 }
 930
 931 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 932 {
             /*
              * Remove the MD5 key registered on sk for addr.
              *
              * tp->md5sig_info is dereferenced without a NULL check, so the
              * caller must make sure it exists (tcp_v4_parse_md5_keys()
              * checks before calling).
              *
              * Returns 0 when an entry was removed, -ENOENT otherwise.
              */
 933         struct tcp_sock *tp = tcp_sk(sk);
 934         int i;
 935
 936         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 937                 if (tp->md5sig_info->keys4[i].addr == addr) {
 938                         /* Free the key */
 939                         kfree(tp->md5sig_info->keys4[i].base.key);
 940                         tp->md5sig_info->entries4--;
 941
                             /* entries4 now counts the remaining entries.
                              * If it equals i, the removed entry was the
                              * last one and nothing needs to move. */
 942                         if (tp->md5sig_info->entries4 == 0) {
 943                                 kfree(tp->md5sig_info->keys4);
 944                                 tp->md5sig_info->keys4 = NULL;
 945                                 tp->md5sig_info->alloced4 = 0;
 946                         } else if (tp->md5sig_info->entries4 != i) {
 947                                 /* Need to do some manipulation */
                                     /* Slide the entries after slot i down
                                      * by one; the array is not shrunk. */
 948                                 memmove(&tp->md5sig_info->keys4[i],
 949                                         &tp->md5sig_info->keys4[i+1],
 950                                         (tp->md5sig_info->entries4 - i) *
 951                                          sizeof(struct tcp4_md5sig_key));
 952                         }
                             /* Called once per removed entry. */
 953                         tcp_free_md5sig_pool();
 954                         return 0;
 955                 }
 956         }
 957         return -ENOENT;
 958 }
 959
 960 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 961
 962 static void tcp_v4_clear_md5_list(struct sock *sk)
 963 {
 964         struct tcp_sock *tp = tcp_sk(sk);
 965
 966         /* Free each key, then the set of keys.  tp->md5sig_info
 967          * must be non-NULL.  tcp_v4_md5_do_add() calls
 968          * tcp_alloc_md5sig_pool() once per entry it adds, so
 969          * tcp_free_md5sig_pool() is called once per entry. */
 970         while (tp->md5sig_info->entries4) {
 971                 int i = --tp->md5sig_info->entries4;
 972
 973                 kfree(tp->md5sig_info->keys4[i].base.key);
 974                 /* matches the pool call made when it was added */
 975                 tcp_free_md5sig_pool();
 976         }
 977         if (tp->md5sig_info->keys4) {
 978                 kfree(tp->md5sig_info->keys4);
 979                 tp->md5sig_info->keys4 = NULL;
 980                 tp->md5sig_info->alloced4  = 0;
 981         }
 982 }
 983
 984 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 985                                  int optlen)
 986 {
             /*
              * Handle a struct tcp_md5sig copied in from user space
              * (optval/optlen).  A request without a key or with a zero
              * key length deletes the entry for the given IPv4 address;
              * anything else adds or replaces that entry.
              *
              * Returns 0 on success, -EINVAL for a short buffer, an
              * address family other than AF_INET or an over-long key,
              * -EFAULT when the copy from user space fails, -ENOENT when
              * there is nothing to delete, and -ENOMEM when an
              * allocation fails.
              */
 987         struct tcp_md5sig cmd;
 988         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 989         u8 *newkey;
 990
 991         if (optlen < sizeof(cmd))
 992                 return -EINVAL;
 993
 994         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 995                 return -EFAULT;
 996
 997         if (sin->sin_family != AF_INET)
 998                 return -EINVAL;
 999
             /* Delete request. */
1000         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1001                 if (!tcp_sk(sk)->md5sig_info)
1002                         return -ENOENT;
1003                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1004         }
1005
1006         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1007                 return -EINVAL;
1008
             /* First key on this socket: create the container here, using
              * the socket's own allocation mode. */
1009         if (!tcp_sk(sk)->md5sig_info) {
1010                 struct tcp_sock *tp = tcp_sk(sk);
1011                 struct tcp_md5sig_info *p;
1012
1013                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1014                 if (!p)
1015                         return -ENOMEM;
1016
1017                 tp->md5sig_info = p;
1018                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1019         }
1020
             /* tcp_v4_md5_do_add() takes ownership of newkey and frees it
              * on failure, so there is nothing to clean up here. */
1021         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1022         if (!newkey)
1023                 return -ENOMEM;
1024         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1025                                  newkey, cmd.tcpm_keylen);
1026 }
1027
1028 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1029                                         __be32 daddr, __be32 saddr, int nbytes)
1030 {
             /*
              * Build the IPv4 pseudo-header in hp->md5_blk.ip4 and feed it
              * to the hash in hp->md5_desc.  Note the parameter order:
              * daddr comes before saddr.  nbytes is stored as a big-endian
              * 16-bit length.  Returns the result of crypto_hash_update().
              */
1031         struct tcp4_pseudohdr *bp;
1032         struct scatterlist sg;
1033
1034         bp = &hp->md5_blk.ip4;
1035
1036         /*
1037          * 1. the TCP pseudo-header (in the order: source IP address,
1038          * destination IP address, zero-padded protocol number, and
1039          * segment length)
1040          */
1041         bp->saddr = saddr;
1042         bp->daddr = daddr;
1043         bp->pad = 0;
1044         bp->protocol = IPPROTO_TCP;
1045         bp->len = cpu_to_be16(nbytes);
1046
1047         sg_init_one(&sg, bp, sizeof(*bp));
1048         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1049 }
1050
1051 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1052                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1053 {
             /*
              * Compute the MD5 signature for a segment given only its TCP
              * header: the pseudo-header (with a length of th->doff << 2),
              * the TCP header and the key are hashed, in that order.
              *
              * Returns 0 with the digest in md5_hash, or 1 on any failure,
              * in which case the first 16 bytes of md5_hash are zeroed.
              */
1054         struct tcp_md5sig_pool *hp;
1055         struct hash_desc *desc;
1056
1057         hp = tcp_get_md5sig_pool();
1058         if (!hp)
1059                 goto clear_hash_noput;
1060         desc = &hp->md5_desc;
1061
1062         if (crypto_hash_init(desc))
1063                 goto clear_hash;
1064         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1065                 goto clear_hash;
1066         if (tcp_md5_hash_header(hp, th))
1067                 goto clear_hash;
1068         if (tcp_md5_hash_key(hp, key))
1069                 goto clear_hash;
1070         if (crypto_hash_final(desc, md5_hash))
1071                 goto clear_hash;
1072
1073         tcp_put_md5sig_pool();
1074         return 0;
1075
             /* clear_hash: a pool was obtained and must be put back.
              * clear_hash_noput: no pool was obtained. */
1076 clear_hash:
1077         tcp_put_md5sig_pool();
1078 clear_hash_noput:
1079         memset(md5_hash, 0, 16);
1080         return 1;
1081 }
1082
1083 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1084                         struct sock *sk, struct request_sock *req,
1085                         struct sk_buff *skb)
1086 {
             /*
              * Compute the MD5 signature of a whole segment held in skb:
              * the pseudo-header (with a length of skb->len), the TCP
              * header, the skb data starting at th->doff << 2, and the
              * key are hashed, in that order.
              *
              * The addresses for the pseudo-header come from sk when it is
              * non-NULL, else from req when it is non-NULL, else from the
              * IP header of skb.
              *
              * Returns 0 with the digest in md5_hash, or 1 on any failure,
              * in which case the first 16 bytes of md5_hash are zeroed.
              */
1087         struct tcp_md5sig_pool *hp;
1088         struct hash_desc *desc;
1089         struct tcphdr *th = tcp_hdr(skb);
1090         __be32 saddr, daddr;
1091
1092         if (sk) {
1093                 saddr = inet_sk(sk)->inet_saddr;
1094                 daddr = inet_sk(sk)->inet_daddr;
1095         } else if (req) {
1096                 saddr = inet_rsk(req)->loc_addr;
1097                 daddr = inet_rsk(req)->rmt_addr;
1098         } else {
1099                 const struct iphdr *iph = ip_hdr(skb);
1100                 saddr = iph->saddr;
1101                 daddr = iph->daddr;
1102         }
1103
1104         hp = tcp_get_md5sig_pool();
1105         if (!hp)
1106                 goto clear_hash_noput;
1107         desc = &hp->md5_desc;
1108
1109         if (crypto_hash_init(desc))
1110                 goto clear_hash;
1111
1112         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1113                 goto clear_hash;
1114         if (tcp_md5_hash_header(hp, th))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_key(hp, key))
1119                 goto clear_hash;
1120         if (crypto_hash_final(desc, md5_hash))
1121                 goto clear_hash;
1122
1123         tcp_put_md5sig_pool();
1124         return 0;
1125
             /* clear_hash: a pool was obtained and must be put back.
              * clear_hash_noput: no pool was obtained. */
1126 clear_hash:
1127         tcp_put_md5sig_pool();
1128 clear_hash_noput:
1129         memset(md5_hash, 0, 16);
1130         return 1;
1131 }
1132
1133 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1134
1135 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1136 {
1137         /*
1138          * This gets called for each TCP segment that arrives
1139          * so we want to be efficient.
1140          * We have 3 drop cases:
1141          * o No MD5 hash and one expected.
1142          * o MD5 hash and we're not expecting one.
1143          * o MD5 hash and it's wrong.
              *
              * Returns 0 when the segment may be processed further and 1
              * when it has to be dropped.  The expected key is looked up
              * on sk by the source address of the incoming IP header.
1144          */
1145         __u8 *hash_location = NULL;
1146         struct tcp_md5sig_key *hash_expected;
1147         const struct iphdr *iph = ip_hdr(skb);
1148         struct tcphdr *th = tcp_hdr(skb);
1149         int genhash;
1150         unsigned char newhash[16];
1151
1152         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1153         hash_location = tcp_parse_md5sig_option(th);
1154
1155         /* We've parsed the options - do we have a hash? */
1156         if (!hash_expected && !hash_location)
1157                 return 0;
1158
1159         if (hash_expected && !hash_location) {
1160                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1161                 return 1;
1162         }
1163
1164         if (!hash_expected && hash_location) {
1165                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1166                 return 1;
1167         }
1168
1169         /* Okay, so this is hash_expected and hash_location -
1170          * so we need to calculate the checksum.
              * sk and req are passed as NULL, so the addresses of the IP
              * header are the ones that get hashed.
1171          */
1172         genhash = tcp_v4_md5_hash_skb(newhash,
1173                                       hash_expected,
1174                                       NULL, NULL, skb);
1175
             /* Drop when the hash could not be computed or does not match
              * the 16 bytes carried in the option; log it rate-limited. */
1176         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1177                 if (net_ratelimit()) {
1178                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179                                &iph->saddr, ntohs(th->source),
1180                                &iph->daddr, ntohs(th->dest),
1181                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1182                 }
1183                 return 1;
1184         }
1185         return 0;
1186 }
1187
1188 #endif
1189
1190 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
             /*
              * request_sock operations for IPv4 TCP.  Objects are sized
              * as struct tcp_request_sock.  This table is handed to
              * inet_reqsk_alloc() in tcp_v4_conn_request().
              */
1191         .family         =       PF_INET,
1192         .obj_size       =       sizeof(struct tcp_request_sock),
1193         .rtx_syn_ack    =       tcp_v4_send_synack,
1194         .send_ack       =       tcp_v4_reqsk_send_ack,
1195         .destructor     =       tcp_v4_reqsk_destructor,
1196         .send_reset     =       tcp_v4_send_reset,
1197 };
1198
1199 #ifdef CONFIG_TCP_MD5SIG
1200 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
             /*
              * IPv4 MD5 hooks for request socks; only built with
              * CONFIG_TCP_MD5SIG.  Installed as af_specific on each new
              * request in tcp_v4_conn_request().
              */
1201         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1202         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1203 };
1204 #endif
1205
1206 static struct timewait_sock_ops tcp_timewait_sock_ops = {
             /* Timewait operations for TCP; objects are sized as
              * struct tcp_timewait_sock. */
1207         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1208         .twsk_unique    = tcp_twsk_unique,
1209         .twsk_destructor= tcp_twsk_destructor,
1210 };
1211
1212 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1213 {
             /*
              * Handle an incoming connection request (skb) on sk: allocate
              * a request_sock, fill it in from the packet, choose the
              * initial sequence number, send the SYN-ACK and, unless a
              * syncookie is being used, add the request to the queue with
              * inet_csk_reqsk_queue_hash_add().
              *
              * Every path returns 0, including the ones that drop the
              * request.
              */
1214         struct inet_request_sock *ireq;
1215         struct tcp_options_received tmp_opt;
1216         struct request_sock *req;
1217         __be32 saddr = ip_hdr(skb)->saddr;
1218         __be32 daddr = ip_hdr(skb)->daddr;
1219         __u32 isn = TCP_SKB_CB(skb)->when;
1220         struct dst_entry *dst = NULL;
1221 #ifdef CONFIG_SYN_COOKIES
1222         int want_cookie = 0;
1223 #else
1224 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1225 #endif
1226
1227         /* Never answer to SYNs sent to broadcast or multicast */
1228         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1229                 goto drop;
1230
1231         /* TW buckets are converted to open requests without
1232          * limitations, they conserve resources and peer is
1233          * evidently real one.
1234          */
1235         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1236 #ifdef CONFIG_SYN_COOKIES
1237                 if (sysctl_tcp_syncookies) {
1238                         want_cookie = 1;
1239                 } else
1240 #endif
1241                 goto drop;
1242         }
1243
1244         /* Accept backlog is full. If we have already queued enough
1245          * of warm entries in syn queue, drop request. It is better than
1246          * clogging syn queue with openreqs with exponentially increasing
1247          * timeout.
1248          */
1249         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1250                 goto drop;
1251
1252         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1253         if (!req)
1254                 goto drop;
1255
1256 #ifdef CONFIG_TCP_MD5SIG
1257         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1258 #endif
1259
             /* The request's local address is the packet's destination and
              * its remote address is the packet's source. */
1260         ireq = inet_rsk(req);
1261         ireq->loc_addr = daddr;
1262         ireq->rmt_addr = saddr;
1263         ireq->no_srccheck = inet_sk(sk)->transparent;
1264         ireq->opt = tcp_v4_save_options(sk, skb);
1265
1266         dst = inet_csk_route_req(sk, req);
1267         if(!dst)
1268                 goto drop_and_free;
1269
1270         tcp_clear_options(&tmp_opt);
1271         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1272         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1273
1274         tcp_parse_options(skb, &tmp_opt, 0, dst);
1275
             /* With a cookie and no timestamp option seen, the parsed
              * options are cleared again. */
1276         if (want_cookie && !tmp_opt.saw_tstamp)
1277                 tcp_clear_options(&tmp_opt);
1278
1279         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1280
1281         tcp_openreq_init(req, &tmp_opt, skb);
1282
1283         if (security_inet_conn_request(sk, skb, req))
1284                 goto drop_and_release;
1285
1286         if (!want_cookie)
1287                 TCP_ECN_create_request(req, tcp_hdr(skb));
1288
1289         if (want_cookie) {
1290 #ifdef CONFIG_SYN_COOKIES
1291                 syn_flood_warning(skb);
1292                 req->cookie_ts = tmp_opt.tstamp_ok;
1293 #endif
1294                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1295         } else if (!isn) {
1296                 struct inet_peer *peer = NULL;
1297
1298                 /* VJ's idea. We save last timestamp seen
1299                  * from the destination in peer table, when entering
1300                  * state TIME-WAIT, and check against it before
1301                  * accepting new connection request.
1302                  *
1303                  * If "isn" is not zero, this request hit alive
1304                  * timewait bucket, so that all the necessary checks
1305                  * are made in the function processing timewait state.
1306                  */
1307                 if (tmp_opt.saw_tstamp &&
1308                     tcp_death_row.sysctl_tw_recycle &&
1309                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1310                     peer->v4daddr == saddr) {
1311                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1312                             (s32)(peer->tcp_ts - req->ts_recent) >
1313                                                         TCP_PAWS_WINDOW) {
1314                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1315                                 goto drop_and_release;
1316                         }
1317                 }
1318                 /* Kill the following clause, if you dislike this way. */
1319                 else if (!sysctl_tcp_syncookies &&
1320                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1321                           (sysctl_max_syn_backlog >> 2)) &&
1322                          (!peer || !peer->tcp_ts_stamp) &&
1323                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1324                         /* Without syncookies last quarter of
1325                          * backlog is filled with destinations,
1326                          * proven to be alive.
1327                          * It means that we continue to communicate
1328                          * to destinations, already remembered
1329                          * to the moment of synflood.
1330                          */
1331                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1332                                        &saddr, ntohs(tcp_hdr(skb)->source));
1333                         goto drop_and_release;
1334                 }
1335
1336                 isn = tcp_v4_init_sequence(skb);
1337         }
1338         tcp_rsk(req)->snt_isn = isn;
1339
             /* __tcp_v4_send_synack() releases dst itself, hence the jump
              * to drop_and_free rather than drop_and_release.  With a
              * cookie the request is freed after sending instead of being
              * queued. */
1340         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1341                 goto drop_and_free;
1342
1343         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1344         return 0;
1345
             /* drop_and_release: dst is still held and req is allocated.
              * drop_and_free: only req needs freeing.
              * drop: nothing was allocated. */
1346 drop_and_release:
1347         dst_release(dst);
1348 drop_and_free:
1349         reqsk_free(req);
1350 drop:
1351         return 0;
1352 }
1353
1354
1355 /*
1356  * The three way handshake has completed - we got a valid synack -
1357  * now create the new socket.
      *
      * If dst is NULL a route is looked up with inet_csk_route_req().
      *
      * Returns the new child socket, or NULL when the accept queue of sk is
      * full, no route could be found, or tcp_create_openreq_child() fails.
      * On the NULL paths LINUX_MIB_LISTENDROPS is incremented (plus
      * LINUX_MIB_LISTENOVERFLOWS for a full accept queue) and dst_release()
      * is called on dst.
1358  */
1359 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1360                                   struct request_sock *req,
1361                                   struct dst_entry *dst)
1362 {
1363         struct inet_request_sock *ireq;
1364         struct inet_sock *newinet;
1365         struct tcp_sock *newtp;
1366         struct sock *newsk;
1367 #ifdef CONFIG_TCP_MD5SIG
1368         struct tcp_md5sig_key *key;
1369 #endif
1370
1371         if (sk_acceptq_is_full(sk))
1372                 goto exit_overflow;
1373
1374         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1375                 goto exit;
1376
1377         newsk = tcp_create_openreq_child(sk, req, skb);
1378         if (!newsk)
1379                 goto exit;
1380
1381         newsk->sk_gso_type = SKB_GSO_TCPV4;
1382         sk_setup_caps(newsk, dst);
1383
             /* Copy the addresses from the request into the child.  The
              * saved IP options pointer is moved, not copied: ireq->opt is
              * set to NULL once newinet->opt holds it. */
1384         newtp                 = tcp_sk(newsk);
1385         newinet               = inet_sk(newsk);
1386         ireq                  = inet_rsk(req);
1387         newinet->inet_daddr   = ireq->rmt_addr;
1388         newinet->inet_rcv_saddr = ireq->loc_addr;
1389         newinet->inet_saddr           = ireq->loc_addr;
1390         newinet->opt          = ireq->opt;
1391         ireq->opt             = NULL;
1392         newinet->mc_index     = inet_iif(skb);
1393         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1394         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1395         if (newinet->opt)
1396                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1397         newinet->inet_id = newtp->write_seq ^ jiffies;
1398
1399         tcp_mtup_init(newsk);
1400         tcp_sync_mss(newsk, dst_mtu(dst));
             /* advmss starts from the route metric and is lowered to the
              * parent's user_mss when that is set and smaller. */
1401         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1402         if (tcp_sk(sk)->rx_opt.user_mss &&
1403             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1404                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1405
1406         tcp_initialize_rcv_mss(newsk);
1407
1408 #ifdef CONFIG_TCP_MD5SIG
1409         /* Copy over the MD5 key from the original socket */
1410         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1411         if (key != NULL) {
1412                 /*
1413                  * We're using one, so create a matching key
1414                  * on the newsk structure. If we fail to get
1415                  * memory, then we end up not copying the key
1416                  * across. Shucks.
                      *
                      * tcp_v4_md5_do_add() frees newkey itself when it
                      * fails; its return value is not checked here.
1417                  */
1418                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1419                 if (newkey != NULL)
1420                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1421                                           newkey, key->keylen);
1422                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1423         }
1424 #endif
1425
1426         __inet_hash_nolisten(newsk);
1427         __inet_inherit_port(sk, newsk);
1428
1429         return newsk;
1430
             /* exit_overflow falls through into exit, so a full accept
              * queue bumps both counters. */
1431 exit_overflow:
1432         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1433 exit:
1434         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1435         dst_release(dst);
1436         return NULL;
1437 }
1438
1439 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1440 {
             /*
              * Decide which socket should handle skb, which arrived on sk.
              *
              * Returns, in order of precedence:
              *  - the result of tcp_check_req() when a pending request
              *    matches the packet;
              *  - a matching established socket, returned locked with
              *    bh_lock_sock(), or NULL when that match is in
              *    TCP_TIME_WAIT (its reference is dropped first);
              *  - otherwise sk itself, or, with CONFIG_SYN_COOKIES and an
              *    ACK without RST or SYN, the result of cookie_v4_check().
              */
1441         struct tcphdr *th = tcp_hdr(skb);
1442         const struct iphdr *iph = ip_hdr(skb);
1443         struct sock *nsk;
1444         struct request_sock **prev;
1445         /* Find possible connection requests. */
1446         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1447                                                        iph->saddr, iph->daddr);
1448         if (req)
1449                 return tcp_check_req(sk, skb, req, prev);
1450
1451         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1452                         th->source, iph->daddr, th->dest, inet_iif(skb));
1453
1454         if (nsk) {
1455                 if (nsk->sk_state != TCP_TIME_WAIT) {
1456                         bh_lock_sock(nsk);
1457                         return nsk;
1458                 }
1459                 inet_twsk_put(inet_twsk(nsk));
1460                 return NULL;
1461         }
1462
1463 #ifdef CONFIG_SYN_COOKIES
1464         if (!th->rst && !th->syn && th->ack)
1465                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1466 #endif
1467         return sk;
1468 }
1469
1470 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1471 {
             /*
              * Prepare checksum state for an incoming segment.
              *
              * With CHECKSUM_COMPLETE, skb->csum is checked against the
              * pseudo-header; on success the skb is marked
              * CHECKSUM_UNNECESSARY and 0 is returned.
              *
              * Otherwise skb->csum is set to the pseudo-header sum.
              * Segments of at most 76 bytes are verified right away with
              * __skb_checksum_complete(), whose result is returned.
              * Longer segments are not verified here and 0 is returned.
              */
1472         const struct iphdr *iph = ip_hdr(skb);
1473
1474         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1475                 if (!tcp_v4_check(skb->len, iph->saddr,
1476                                   iph->daddr, skb->csum)) {
1477                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1478                         return 0;
1479                 }
1480         }
1481
1482         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1483                                        skb->len, IPPROTO_TCP, 0);
1484
1485         if (skb->len <= 76) {
1486                 return __skb_checksum_complete(skb);
1487         }
1488         return 0;
1489 }
1490
1491
1492 /* The socket must have its spinlock held when we get
1493  * here.
1494  *
1495  * We have a potential double-lock case here, so even when
1496  * doing backlog processing we use the BH locking scheme.
1497  * This is because we cannot sleep with the original spinlock
1498  * held.
      *
      * Always returns 0.  The skb is freed here on the reset, discard and
      * csum_err paths; on the other paths it has been handed to the tcp_*
      * receive routine that returned 0.
1499  */
1500 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1501 {
1502         struct sock *rsk;
1503 #ifdef CONFIG_TCP_MD5SIG
1504         /*
1505          * We really want to reject the packet as early as possible
1506          * if:
1507          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1508          *  o There is an MD5 option and we're not expecting one
1509          */
1510         if (tcp_v4_inbound_md5_hash(sk, skb))
1511                 goto discard;
1512 #endif
1513
1514         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1515                 TCP_CHECK_TIMER(sk);
1516                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1517                         rsk = sk;
1518                         goto reset;
1519                 }
1520                 TCP_CHECK_TIMER(sk);
1521                 return 0;
1522         }
1523
             /* Length and checksum are checked here only for sockets that
              * are not in TCP_ESTABLISHED. */
1524         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1525                 goto csum_err;
1526
1527         if (sk->sk_state == TCP_LISTEN) {
1528                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1529                 if (!nsk)
1530                         goto discard;
1531
                     /* A different socket came back: let it process the
                      * segment and send any reset on its behalf. */
1532                 if (nsk != sk) {
1533                         if (tcp_child_process(sk, nsk, skb)) {
1534                                 rsk = nsk;
1535                                 goto reset;
1536                         }
1537                         return 0;
1538                 }
1539         }
1540
1541         TCP_CHECK_TIMER(sk);
1542         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1543                 rsk = sk;
1544                 goto reset;
1545         }
1546         TCP_CHECK_TIMER(sk);
1547         return 0;
1548
             /* rsk is the socket the reset is sent for. */
1549 reset:
1550         tcp_v4_send_reset(rsk, skb);
1551 discard:
1552         kfree_skb(skb);
1553         /* Be careful here. If this function gets more complicated and
1554          * gcc suffers from register pressure on the x86, sk (in %ebx)
1555          * might be destroyed here. This current version compiles correctly,
1556          * but you have been warned.
1557          */
1558         return 0;
1559
1560 csum_err:
1561         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1562         goto discard;
1563 }
1564
1565 /*
1566  *      From tcp_input.c
      *
      *      tcp_v4_rcv - process one incoming IPv4 TCP segment.
      *      @skb: received buffer; released with kfree_skb() on every
      *            discard path of this function
      *
      *      Validates the TCP header (data offset, checksum), fills in
      *      TCP_SKB_CB(skb) and looks the segment up in tcp_hashinfo.
      *      A segment that matches a socket is passed to tcp_v4_do_rcv()
      *      (directly, or after tcp_prequeue() declined it), or is put on
      *      the socket backlog when the socket is owned by user context.
      *      A segment without a socket is answered through
      *      tcp_v4_send_reset() unless it is too short or fails the
      *      checksum.  Sockets in TCP_TIME_WAIT are handled at the
      *      do_time_wait label.
      *
      *      Returns the result of tcp_v4_do_rcv() when it was called
      *      here, otherwise 0.
1567  */
1568
1569 int tcp_v4_rcv(struct sk_buff *skb)
1570 {
1571         const struct iphdr *iph;
1572         struct tcphdr *th;
1573         struct sock *sk;
1574         int ret;
1575         struct net *net = dev_net(skb->dev);
1576
             /* Anything whose pkt_type is not PACKET_HOST is dropped uncounted. */
1577         if (skb->pkt_type != PACKET_HOST)
1578                 goto discard_it;
1579
1580         /* Count it even if it's bad */
1581         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1582
             /* The fixed-size header must be pullable before th is read. */
1583         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1584                 goto discard_it;
1585
1586         th = tcp_hdr(skb);
1587
             /* doff is in 32-bit words; less than the fixed header is malformed. */
1588         if (th->doff < sizeof(struct tcphdr) / 4)
1589                 goto bad_packet;
             /* Now require the full header, options included. */
1590         if (!pskb_may_pull(skb, th->doff * 4))
1591                 goto discard_it;
1592
1593         /* An explanation is required here, I think.
1594          * Packet length and doff are validated by header prediction,
1595          * provided case of th->doff==0 is eliminated.
1596          * So, we defer the checks. */
1597         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1598                 goto bad_packet;
1599
             /* th is fetched again after the calls above - presumably because
              * pskb_may_pull() may move the header data; confirm. */
1600         th = tcp_hdr(skb);
1601         iph = ip_hdr(skb);
             /* Sequence numbers are kept in host order in the control block.
              * SYN and FIN each add one to end_seq on top of the payload
              * length (skb->len minus the header length). */
1602         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1603         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1604                                     skb->len - th->doff * 4);
1605         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1606         TCP_SKB_CB(skb)->when    = 0;
             /* The control block's flags field is loaded with the IP TOS byte. */
1607         TCP_SKB_CB(skb)->flags   = iph->tos;
1608         TCP_SKB_CB(skb)->sacked  = 0;
1609
1610         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1611         if (!sk)
1612                 goto no_tcp_socket;
1613
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1614 process:
1615         if (sk->sk_state == TCP_TIME_WAIT)
1616                 goto do_time_wait;
1617
1618         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1619                 goto discard_and_relse;
1620         nf_reset(skb);
1621
1622         if (sk_filter(sk, skb))
1623                 goto discard_and_relse;
1624
             /* skb->dev is cleared before the segment is handed to the socket. */
1625         skb->dev = NULL;
1626
             /* With the socket spinlock held: process the segment right away when
              * no user context owns the socket, otherwise defer it to the backlog.
              * Under CONFIG_NET_DMA a socket with a DMA channel skips the
              * prequeue and goes straight to tcp_v4_do_rcv(). */
1627         bh_lock_sock_nested(sk);
1628         ret = 0;
1629         if (!sock_owned_by_user(sk)) {
1630 #ifdef CONFIG_NET_DMA
1631                 struct tcp_sock *tp = tcp_sk(sk);
1632                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1633                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1634                 if (tp->ucopy.dma_chan)
1635                         ret = tcp_v4_do_rcv(sk, skb);
1636                 else
1637 #endif
1638                 {
                             /* tcp_prequeue() returning zero means it did not take
                              * the segment, so it is processed here. */
1639                         if (!tcp_prequeue(sk, skb))
1640                                 ret = tcp_v4_do_rcv(sk, skb);
1641                 }
1642         } else
1643                 sk_add_backlog(sk, skb);
1644         bh_unlock_sock(sk);
1645
             /* Drop the socket reference - presumably the one taken by the lookup. */
1646         sock_put(sk);
1647
1648         return ret;
1649
1650 no_tcp_socket:
1651         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1652                 goto discard_it;
1653
             /* Short or corrupt segments only bump the error counter; everything
              * else gets a reset.  The bad_packet label sits inside the if-body so
              * that jumps from the header checks above share the counter update. */
1654         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1655 bad_packet:
1656                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1657         } else {
1658                 tcp_v4_send_reset(NULL, skb);
1659         }
1660
1661 discard_it:
1662         /* Discard frame. */
1663         kfree_skb(skb);
1664         return 0;
1665
             /* Same as discard_it, but first drops the reference on a full socket. */
1666 discard_and_relse:
1667         sock_put(sk);
1668         goto discard_it;
1669
             /* sk is really an inet_timewait_sock here; it is reached before
              * skb->dev is cleared, so dev_net(skb->dev) below is still usable. */
1670 do_time_wait:
1671         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1672                 inet_twsk_put(inet_twsk(sk));
1673                 goto discard_it;
1674         }
1675
1676         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1677                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1678                 inet_twsk_put(inet_twsk(sk));
1679                 goto discard_it;
1680         }
             /* NOTE(review): the TCP_TW_ACK, TCP_TW_RST and TCP_TW_SUCCESS paths
              * below do not call inet_twsk_put() themselves - presumably the
              * callees release the timewait reference; confirm. */
1681         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1682         case TCP_TW_SYN: {
                     /* A SYN hit the TIME_WAIT socket: if a listener exists for
                      * the destination, retire the timewait socket and restart
                      * processing with the listener. */
1683                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1684                                                         &tcp_hashinfo,
1685                                                         iph->daddr, th->dest,
1686                                                         inet_iif(skb));
1687                 if (sk2) {
1688                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1689                         inet_twsk_put(inet_twsk(sk));
1690                         sk = sk2;
1691                         goto process;
1692                 }
1693                 /* Fall through to ACK */
1694         }
1695         case TCP_TW_ACK:
1696                 tcp_v4_timewait_ack(sk, skb);
1697                 break;
1698         case TCP_TW_RST:
1699                 goto no_tcp_socket;
1700         case TCP_TW_SUCCESS:;
1701         }
1702         goto discard_it;
1703 }
1704
1705 /* VJ's idea. Save last timestamp seen from this destination
1706  * and hold it at least for normal timewait interval to use for duplicate
1707  * segment detection in subsequent connections, before they enter synchronized
1708  * state.
1709  */
1710
1711 int tcp_v4_remember_stamp(struct sock *sk)
1712 {
1713         struct inet_sock *inet = inet_sk(sk);
1714         struct tcp_sock *tp = tcp_sk(sk);
1715         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1716         struct inet_peer *peer = NULL;
1717         int release_it = 0;
1718
1719         if (!rt || rt->rt_dst != inet->inet_daddr) {
1720                 peer = inet_getpeer(inet->inet_daddr, 1);
1721                 release_it = 1;
1722         } else {
1723                 if (!rt->peer)
1724                         rt_bind_peer(rt, 1);
1725                 peer = rt->peer;
1726         }
1727
1728         if (peer) {
1729                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1730                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1731                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1732                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1733                         peer->tcp_ts = tp->rx_opt.ts_recent;
1734                 }
1735                 if (release_it)
1736                         inet_putpeer(peer);
1737                 return 1;
1738         }
1739
1740         return 0;
1741 }
1742
1743 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1744 {
1745         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1746
1747         if (peer) {
1748                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1749
1750                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1751                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1752                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1753                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1754                         peer->tcp_ts       = tcptw->tw_ts_recent;
1755                 }
1756                 inet_putpeer(peer);
1757                 return 1;
1758         }
1759
1760         return 0;
1761 }
1762
1763 const struct inet_connection_sock_af_ops ipv4_specific = {
1764         .queue_xmit        = ip_queue_xmit,
1765         .send_check        = tcp_v4_send_check,
1766         .rebuild_header    = inet_sk_rebuild_header,
1767         .conn_request      = tcp_v4_conn_request,
1768         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1769         .remember_stamp    = tcp_v4_remember_stamp,
1770         .net_header_len    = sizeof(struct iphdr),
1771         .setsockopt        = ip_setsockopt,
1772         .getsockopt        = ip_getsockopt,
1773         .addr2sockaddr     = inet_csk_addr2sockaddr,
1774         .sockaddr_len      = sizeof(struct sockaddr_in),
1775         .bind_conflict     = inet_csk_bind_conflict,
1776 #ifdef CONFIG_COMPAT
1777         .compat_setsockopt = compat_ip_setsockopt,
1778         .compat_getsockopt = compat_ip_getsockopt,
1779 #endif
1780 };
1781
1782 #ifdef CONFIG_TCP_MD5SIG
1783 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1784         .md5_lookup             = tcp_v4_md5_lookup,
1785         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1786         .md5_add                = tcp_v4_md5_add_func,
1787         .md5_parse              = tcp_v4_parse_md5_keys,
1788 };
1789 #endif
1790
1791 /* NOTE: A lot of things set to zero explicitly by call to
1792  *       sk_alloc() so need not be done here.
1793  */
1794 static int tcp_v4_init_sock(struct sock *sk)
1795 {
1796         struct inet_connection_sock *icsk = inet_csk(sk);
1797         struct tcp_sock *tp = tcp_sk(sk);
1798
1799         skb_queue_head_init(&tp->out_of_order_queue);
1800         tcp_init_xmit_timers(sk);
1801         tcp_prequeue_init(tp);
1802
1803         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1804         tp->mdev = TCP_TIMEOUT_INIT;
1805
1806         /* So many TCP implementations out there (incorrectly) count the
1807          * initial SYN frame in their delayed-ACK and congestion control
1808          * algorithms that we must have the following bandaid to talk
1809          * efficiently to them.  -DaveM
1810          */
1811         tp->snd_cwnd = 2;
1812
1813         /* See draft-stevens-tcpca-spec-01 for discussion of the
1814          * initialization of these values.
1815          */
1816         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1817         tp->snd_cwnd_clamp = ~0;
1818         tp->mss_cache = TCP_MSS_DEFAULT;
1819
1820         tp->reordering = sysctl_tcp_reordering;
1821         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1822
1823         sk->sk_state = TCP_CLOSE;
1824
1825         sk->sk_write_space = sk_stream_write_space;
1826         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1827
1828         icsk->icsk_af_ops = &ipv4_specific;
1829         icsk->icsk_sync_mss = tcp_sync_mss;
1830 #ifdef CONFIG_TCP_MD5SIG
1831         tp->af_specific = &tcp_sock_ipv4_specific;
1832 #endif
1833
1834         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1835         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1836
1837         local_bh_disable();
1838         percpu_counter_inc(&tcp_sockets_allocated);
1839         local_bh_enable();
1840
1841         return 0;
1842 }
1843
1844 void tcp_v4_destroy_sock(struct sock *sk)
1845 {
1846         struct tcp_sock *tp = tcp_sk(sk);
1847
1848         tcp_clear_xmit_timers(sk);
1849
1850         tcp_cleanup_congestion_control(sk);
1851
1852         /* Cleanup up the write buffer. */
1853         tcp_write_queue_purge(sk);
1854
1855         /* Cleans up our, hopefully empty, out_of_order_queue. */
1856         __skb_queue_purge(&tp->out_of_order_queue);
1857
1858 #ifdef CONFIG_TCP_MD5SIG
1859         /* Clean up the MD5 key list, if any */
1860         if (tp->md5sig_info) {
1861                 tcp_v4_clear_md5_list(sk);
1862                 kfree(tp->md5sig_info);
1863                 tp->md5sig_info = NULL;
1864         }
1865 #endif
1866
1867 #ifdef CONFIG_NET_DMA
1868         /* Cleans up our sk_async_wait_queue */
1869         __skb_queue_purge(&sk->sk_async_wait_queue);
1870 #endif
1871
1872         /* Clean prequeue, it must be empty really */
1873         __skb_queue_purge(&tp->ucopy.prequeue);
1874
1875         /* Clean up a referenced TCP bind bucket. */
1876         if (inet_csk(sk)->icsk_bind_hash)
1877                 inet_put_port(sk);
1878
1879         /*
1880          * If sendmsg cached page exists, toss it.
1881          */
1882         if (sk->sk_sndmsg_page) {
1883                 __free_page(sk->sk_sndmsg_page);
1884                 sk->sk_sndmsg_page = NULL;
1885         }
1886
1887         percpu_counter_dec(&tcp_sockets_allocated);
1888 }
1889
1890 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1891
1892 #ifdef CONFIG_PROC_FS
1893 /* Proc filesystem TCP sock list dumping. */
1894
1895 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1896 {
1897         return hlist_nulls_empty(head) ? NULL :
1898                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1899 }
1900
1901 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1902 {
1903         return !is_a_nulls(tw->tw_node.next) ?
1904                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1905 }
1906
     /*
      * listening_get_next - advance the /proc iterator over listening sockets.
      * @seq: seq_file whose private data is the struct tcp_iter_state
      * @cur: element returned by the previous call, or NULL to start at
      *       bucket 0
      *
      * Walks tcp_hashinfo.listening_hash bucket by bucket.  When a listener
      * has a non-empty request queue the walk descends into its syn_table
      * (st->state becomes TCP_SEQ_STATE_OPENREQ) and yields the queued
      * request_socks of the iterator's family before moving on to the next
      * socket.
      *
      * Returns a struct sock, or a struct request_sock while st->state is
      * TCP_SEQ_STATE_OPENREQ, or NULL once every bucket has been visited.
      * On a non-NULL return the current bucket's spinlock is still held, and
      * in OPENREQ state the listener's syn_wait_lock is still read-locked:
      * the paths that jump to "out" release neither.
      */
1907 static void *listening_get_next(struct seq_file *seq, void *cur)
1908 {
1909         struct inet_connection_sock *icsk;
1910         struct hlist_nulls_node *node;
1911         struct sock *sk = cur;
1912         struct inet_listen_hashbucket *ilb;
1913         struct tcp_iter_state *st = seq->private;
1914         struct net *net = seq_file_net(seq);
1915
             /* First call: lock bucket 0 and start from its head. */
1916         if (!sk) {
1917                 st->bucket = 0;
1918                 ilb = &tcp_hashinfo.listening_hash[0];
1919                 spin_lock_bh(&ilb->lock);
1920                 sk = sk_nulls_head(&ilb->head);
1921                 goto get_sk;
1922         }
1923         ilb = &tcp_hashinfo.listening_hash[st->bucket];
             /* st->num is the row index printed by the show routine. */
1924         ++st->num;
1925
1926         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request_sock: continue along its chain, then
                      * through the remaining syn_table slots of the listener. */
1927                 struct request_sock *req = cur;
1928
1929                 icsk = inet_csk(st->syn_wait_sk);
1930                 req = req->dl_next;
1931                 while (1) {
1932                         while (req) {
1933                                 if (req->rsk_ops->family == st->family) {
1934                                         cur = req;
1935                                         goto out;
1936                                 }
1937                                 req = req->dl_next;
1938                         }
1939                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1940                                 break;
                             /* Also entered from start_req below, with icsk set and
                              * syn_wait_lock already read-locked. */
1941 get_req:
1942                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1943                 }
                     /* Requests exhausted: continue with the socket after the
                      * listener and drop the read lock taken at start_req. */
1944                 sk        = sk_next(st->syn_wait_sk);
1945                 st->state = TCP_SEQ_STATE_LISTENING;
1946                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1947         } else {
                     /* cur is a socket that was already returned: visit its
                      * request queue (if any) before stepping to the next one. */
1948                 icsk = inet_csk(sk);
1949                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1950                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1951                         goto start_req;
1952                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1953                 sk = sk_next(sk);
1954         }
1955 get_sk:
1956         sk_nulls_for_each_from(sk, node) {
                     /* A socket of the right family and namespace is returned
                      * as-is; its requests are visited on the following call. */
1957                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1958                         cur = sk;
1959                         goto out;
1960                 }
                     /* Non-matching socket: its request queue is still scanned. */
1961                 icsk = inet_csk(sk);
1962                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1963                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1964 start_req:
1965                         st->uid         = sock_i_uid(sk);
1966                         st->syn_wait_sk = sk;
1967                         st->state       = TCP_SEQ_STATE_OPENREQ;
1968                         st->sbucket     = 0;
1969                         goto get_req;
1970                 }
1971                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1972         }
             /* Bucket finished: unlock it and move on to the next one, if any. */
1973         spin_unlock_bh(&ilb->lock);
1974         if (++st->bucket < INET_LHTABLE_SIZE) {
1975                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1976                 spin_lock_bh(&ilb->lock);
1977                 sk = sk_nulls_head(&ilb->head);
1978                 goto get_sk;
1979         }
1980         cur = NULL;
1981 out:
1982         return cur;
1983 }
1984
1985 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1986 {
1987         void *rc = listening_get_next(seq, NULL);
1988
1989         while (rc && *pos) {
1990                 rc = listening_get_next(seq, rc);
1991                 --*pos;
1992         }
1993         return rc;
1994 }
1995
1996 static inline int empty_bucket(struct tcp_iter_state *st)
1997 {
1998         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1999                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2000 }
2001
     /*
      * established_get_first - find the first entry of the ehash walk.
      * @seq: seq_file whose private data is the struct tcp_iter_state
      *
      * Scans tcp_hashinfo.ehash from bucket 0 and returns the first socket
      * (regular chain first, then the timewait chain) whose family and
      * network namespace match the iterator.
      *
      * Returns a struct sock with st->state left as set by the caller, or a
      * struct inet_timewait_sock with st->state == TCP_SEQ_STATE_TIME_WAIT,
      * or NULL when nothing matches.  On a non-NULL return the lock of
      * st->bucket is still held (the "goto out" paths skip the unlock).
      */
2002 static void *established_get_first(struct seq_file *seq)
2003 {
2004         struct tcp_iter_state *st = seq->private;
2005         struct net *net = seq_file_net(seq);
2006         void *rc = NULL;
2007
2008         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2009                 struct sock *sk;
2010                 struct hlist_nulls_node *node;
2011                 struct inet_timewait_sock *tw;
2012                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2013
2014                 /* Lockless fast path for the common case of empty buckets */
2015                 if (empty_bucket(st))
2016                         continue;
2017
2018                 spin_lock_bh(lock);
2019                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2020                         if (sk->sk_family != st->family ||
2021                             !net_eq(sock_net(sk), net)) {
2022                                 continue;
2023                         }
2024                         rc = sk;
2025                         goto out;
2026                 }
                     /* No regular socket matched; try the bucket's timewait chain. */
2027                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2028                 inet_twsk_for_each(tw, node,
2029                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2030                         if (tw->tw_family != st->family ||
2031                             !net_eq(twsk_net(tw), net)) {
2032                                 continue;
2033                         }
2034                         rc = tw;
2035                         goto out;
2036                 }
                     /* Nothing in this bucket: release it and reset the state
                      * before looking at the next bucket. */
2037                 spin_unlock_bh(lock);
2038                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2039         }
2040 out:
2041         return rc;
2042 }
2043
     /*
      * established_get_next - step the ehash walk to the following entry.
      * @seq: seq_file whose private data is the struct tcp_iter_state
      * @cur: entry returned by the previous call; a struct sock, or a struct
      *       inet_timewait_sock when st->state is TCP_SEQ_STATE_TIME_WAIT
      *
      * Within a bucket the regular chain is walked first and the timewait
      * chain second; when the timewait chain ends the bucket lock is
      * dropped and the next non-empty bucket is locked.
      *
      * Returns the next matching entry (with the lock of st->bucket held),
      * or NULL after the last bucket, in which case no bucket lock is held.
      */
2044 static void *established_get_next(struct seq_file *seq, void *cur)
2045 {
2046         struct sock *sk = cur;
2047         struct inet_timewait_sock *tw;
2048         struct hlist_nulls_node *node;
2049         struct tcp_iter_state *st = seq->private;
2050         struct net *net = seq_file_net(seq);
2051
             /* st->num is the row index printed by the show routine. */
2052         ++st->num;
2053
2054         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2055                 tw = cur;
2056                 tw = tw_next(tw);
                     /* Also entered from the bottom of the function with tw set to
                      * the head of the current bucket's timewait chain. */
2057 get_tw:
2058                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2059                         tw = tw_next(tw);
2060                 }
2061                 if (tw) {
2062                         cur = tw;
2063                         goto out;
2064                 }
                     /* Timewait chain exhausted: this bucket is done. */
2065                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2066                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2067
2068                 /* Look for next non empty bucket */
2069                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2070                                 empty_bucket(st))
2071                         ;
2072                 if (st->bucket > tcp_hashinfo.ehash_mask)
2073                         return NULL;
2074
2075                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2076                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2077         } else
2078                 sk = sk_nulls_next(sk);
2079
2080         sk_nulls_for_each_from(sk, node) {
2081                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2082                         goto found;
2083         }
2084
             /* Regular chain exhausted: switch to the same bucket's timewait
              * chain, still under the same lock. */
2085         st->state = TCP_SEQ_STATE_TIME_WAIT;
2086         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2087         goto get_tw;
2088 found:
2089         cur = sk;
2090 out:
2091         return cur;
2092 }
2093
2094 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2095 {
2096         void *rc = established_get_first(seq);
2097
2098         while (rc && pos) {
2099                 rc = established_get_next(seq, rc);
2100                 --pos;
2101         }
2102         return rc;
2103 }
2104
2105 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2106 {
2107         void *rc;
2108         struct tcp_iter_state *st = seq->private;
2109
2110         st->state = TCP_SEQ_STATE_LISTENING;
2111         rc        = listening_get_idx(seq, &pos);
2112
2113         if (!rc) {
2114                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2115                 rc        = established_get_idx(seq, pos);
2116         }
2117
2118         return rc;
2119 }
2120
2121 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2122 {
2123         struct tcp_iter_state *st = seq->private;
2124         st->state = TCP_SEQ_STATE_LISTENING;
2125         st->num = 0;
2126         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2127 }
2128
2129 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2130 {
2131         void *rc = NULL;
2132         struct tcp_iter_state *st;
2133
2134         if (v == SEQ_START_TOKEN) {
2135                 rc = tcp_get_idx(seq, 0);
2136                 goto out;
2137         }
2138         st = seq->private;
2139
2140         switch (st->state) {
2141         case TCP_SEQ_STATE_OPENREQ:
2142         case TCP_SEQ_STATE_LISTENING:
2143                 rc = listening_get_next(seq, v);
2144                 if (!rc) {
2145                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2146                         rc        = established_get_first(seq);
2147                 }
2148                 break;
2149         case TCP_SEQ_STATE_ESTABLISHED:
2150         case TCP_SEQ_STATE_TIME_WAIT:
2151                 rc = established_get_next(seq, v);
2152                 break;
2153         }
2154 out:
2155         ++*pos;
2156         return rc;
2157 }
2158
     /*
      * tcp_seq_stop - seq_file ->stop callback.
      * @seq: iterator
      * @v:   element the walk stopped on, SEQ_START_TOKEN, or NULL
      *
      * Releases whatever locks the get_first/get_next helpers left held for
      * the current st->state.
      */
2159 static void tcp_seq_stop(struct seq_file *seq, void *v)
2160 {
2161         struct tcp_iter_state *st = seq->private;
2162
2163         switch (st->state) {
2164         case TCP_SEQ_STATE_OPENREQ:
                     /* Stopped on a request_sock: drop the listener's syn_wait_lock
                      * first, then fall through (no break) to release the bucket
                      * lock as well. */
2165                 if (v) {
2166                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2167                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2168                 }
2169         case TCP_SEQ_STATE_LISTENING:
                     /* No bucket lock is taken for the header token. */
2170                 if (v != SEQ_START_TOKEN)
2171                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2172                 break;
2173         case TCP_SEQ_STATE_TIME_WAIT:
2174         case TCP_SEQ_STATE_ESTABLISHED:
                     /* A NULL v means the ehash walk ended, and the helpers have
                      * already released the bucket lock in that case. */
2175                 if (v)
2176                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2177                 break;
2178         }
2179 }
2180
2181 static int tcp_seq_open(struct inode *inode, struct file *file)
2182 {
2183         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2184         struct tcp_iter_state *s;
2185         int err;
2186
2187         err = seq_open_net(inode, file, &afinfo->seq_ops,
2188                           sizeof(struct tcp_iter_state));
2189         if (err < 0)
2190                 return err;
2191
2192         s = ((struct seq_file *)file->private_data)->private;
2193         s->family               = afinfo->family;
2194         return 0;
2195 }
2196
2197 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2198 {
2199         int rc = 0;
2200         struct proc_dir_entry *p;
2201
2202         afinfo->seq_fops.open           = tcp_seq_open;
2203         afinfo->seq_fops.read           = seq_read;
2204         afinfo->seq_fops.llseek         = seq_lseek;
2205         afinfo->seq_fops.release        = seq_release_net;
2206
2207         afinfo->seq_ops.start           = tcp_seq_start;
2208         afinfo->seq_ops.next            = tcp_seq_next;
2209         afinfo->seq_ops.stop            = tcp_seq_stop;
2210
2211         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2212                              &afinfo->seq_fops, afinfo);
2213         if (!p)
2214                 rc = -ENOMEM;
2215         return rc;
2216 }
2217
2218 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2219 {
2220         proc_net_remove(net, afinfo->name);
2221 }
2222
2223 static void get_openreq4(struct sock *sk, struct request_sock *req,
2224                          struct seq_file *f, int i, int uid, int *len)
2225 {
2226         const struct inet_request_sock *ireq = inet_rsk(req);
2227         int ttd = req->expires - jiffies;
2228
2229         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2230                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2231                 i,
2232                 ireq->loc_addr,
2233                 ntohs(inet_sk(sk)->inet_sport),
2234                 ireq->rmt_addr,
2235                 ntohs(ireq->rmt_port),
2236                 TCP_SYN_RECV,
2237                 0, 0, /* could print option size, but that is af dependent. */
2238                 1,    /* timers active (only the expire timer) */
2239                 jiffies_to_clock_t(ttd),
2240                 req->retrans,
2241                 uid,
2242                 0,  /* non standard timer */
2243                 0, /* open_requests have no inode */
2244                 atomic_read(&sk->sk_refcnt),
2245                 req,
2246                 len);
2247 }
2248
2249 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2250 {
2251         int timer_active;
2252         unsigned long timer_expires;
2253         struct tcp_sock *tp = tcp_sk(sk);
2254         const struct inet_connection_sock *icsk = inet_csk(sk);
2255         struct inet_sock *inet = inet_sk(sk);
2256         __be32 dest = inet->inet_daddr;
2257         __be32 src = inet->inet_rcv_saddr;
2258         __u16 destp = ntohs(inet->inet_dport);
2259         __u16 srcp = ntohs(inet->inet_sport);
2260
2261         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2262                 timer_active    = 1;
2263                 timer_expires   = icsk->icsk_timeout;
2264         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2265                 timer_active    = 4;
2266                 timer_expires   = icsk->icsk_timeout;
2267         } else if (timer_pending(&sk->sk_timer)) {
2268                 timer_active    = 2;
2269                 timer_expires   = sk->sk_timer.expires;
2270         } else {
2271                 timer_active    = 0;
2272                 timer_expires = jiffies;
2273         }
2274
2275         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2276                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2277                 i, src, srcp, dest, destp, sk->sk_state,
2278                 tp->write_seq - tp->snd_una,
2279                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2280                                              (tp->rcv_nxt - tp->copied_seq),
2281                 timer_active,
2282                 jiffies_to_clock_t(timer_expires - jiffies),
2283                 icsk->icsk_retransmits,
2284                 sock_i_uid(sk),
2285                 icsk->icsk_probes_out,
2286                 sock_i_ino(sk),
2287                 atomic_read(&sk->sk_refcnt), sk,
2288                 jiffies_to_clock_t(icsk->icsk_rto),
2289                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2290                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2291                 tp->snd_cwnd,
2292                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2293                 len);
2294 }
2295
2296 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2297                                struct seq_file *f, int i, int *len)
2298 {
2299         __be32 dest, src;
2300         __u16 destp, srcp;
2301         int ttd = tw->tw_ttd - jiffies;
2302
2303         if (ttd < 0)
2304                 ttd = 0;
2305
2306         dest  = tw->tw_daddr;
2307         src   = tw->tw_rcv_saddr;
2308         destp = ntohs(tw->tw_dport);
2309         srcp  = ntohs(tw->tw_sport);
2310
2311         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2312                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2313                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2314                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2315                 atomic_read(&tw->tw_refcnt), tw, len);
2316 }
2317
2318 #define TMPSZ 150
2319
2320 static int tcp4_seq_show(struct seq_file *seq, void *v)
2321 {
2322         struct tcp_iter_state *st;
2323         int len;
2324
2325         if (v == SEQ_START_TOKEN) {
2326                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2327                            "  sl  local_address rem_address   st tx_queue "
2328                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2329                            "inode");
2330                 goto out;
2331         }
2332         st = seq->private;
2333
2334         switch (st->state) {
2335         case TCP_SEQ_STATE_LISTENING:
2336         case TCP_SEQ_STATE_ESTABLISHED:
2337                 get_tcp4_sock(v, seq, st->num, &len);
2338                 break;
2339         case TCP_SEQ_STATE_OPENREQ:
2340                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2341                 break;
2342         case TCP_SEQ_STATE_TIME_WAIT:
2343                 get_timewait4_sock(v, seq, st->num, &len);
2344                 break;
2345         }
2346         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2347 out:
2348         return 0;
2349 }
2350
2351 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2352         .name           = "tcp",
2353         .family         = AF_INET,
2354         .seq_fops       = {
2355                 .owner          = THIS_MODULE,
2356         },
2357         .seq_ops        = {
2358                 .show           = tcp4_seq_show,
2359         },
2360 };
2361
2362 static int tcp4_proc_init_net(struct net *net)
2363 {
2364         return tcp_proc_register(net, &tcp4_seq_afinfo);
2365 }
2366
2367 static void tcp4_proc_exit_net(struct net *net)
2368 {
2369         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2370 }
2371
2372 static struct pernet_operations tcp4_net_ops = {
2373         .init = tcp4_proc_init_net,
2374         .exit = tcp4_proc_exit_net,
2375 };
2376
2377 int __init tcp4_proc_init(void)
2378 {
2379         return register_pernet_subsys(&tcp4_net_ops);
2380 }
2381
2382 void tcp4_proc_exit(void)
2383 {
2384         unregister_pernet_subsys(&tcp4_net_ops);
2385 }
2386 #endif /* CONFIG_PROC_FS */
2387
2388 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2389 {
2390         struct iphdr *iph = skb_gro_network_header(skb);
2391
2392         switch (skb->ip_summed) {
2393         case CHECKSUM_COMPLETE:
2394                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2395                                   skb->csum)) {
2396                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2397                         break;
2398                 }
2399
2400                 /* fall through */
2401         case CHECKSUM_NONE:
2402                 NAPI_GRO_CB(skb)->flush = 1;
2403                 return NULL;
2404         }
2405
2406         return tcp_gro_receive(head, skb);
2407 }
2408 EXPORT_SYMBOL(tcp4_gro_receive);
2409
2410 int tcp4_gro_complete(struct sk_buff *skb)
2411 {
2412         struct iphdr *iph = ip_hdr(skb);
2413         struct tcphdr *th = tcp_hdr(skb);
2414
2415         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2416                                   iph->saddr, iph->daddr, 0);
2417         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2418
2419         return tcp_gro_complete(skb);
2420 }
2421 EXPORT_SYMBOL(tcp4_gro_complete);
2422
2423 struct proto tcp_prot = {
2424         .name                   = "TCP",
2425         .owner                  = THIS_MODULE,
2426         .close                  = tcp_close,
2427         .connect                = tcp_v4_connect,
2428         .disconnect             = tcp_disconnect,
2429         .accept                 = inet_csk_accept,
2430         .ioctl                  = tcp_ioctl,
2431         .init                   = tcp_v4_init_sock,
2432         .destroy                = tcp_v4_destroy_sock,
2433         .shutdown               = tcp_shutdown,
2434         .setsockopt             = tcp_setsockopt,
2435         .getsockopt             = tcp_getsockopt,
2436         .recvmsg                = tcp_recvmsg,
2437         .backlog_rcv            = tcp_v4_do_rcv,
2438         .hash                   = inet_hash,
2439         .unhash                 = inet_unhash,
2440         .get_port               = inet_csk_get_port,
2441         .enter_memory_pressure  = tcp_enter_memory_pressure,
2442         .sockets_allocated      = &tcp_sockets_allocated,
2443         .orphan_count           = &tcp_orphan_count,
2444         .memory_allocated       = &tcp_memory_allocated,
2445         .memory_pressure        = &tcp_memory_pressure,
2446         .sysctl_mem             = sysctl_tcp_mem,
2447         .sysctl_wmem            = sysctl_tcp_wmem,
2448         .sysctl_rmem            = sysctl_tcp_rmem,
2449         .max_header             = MAX_TCP_HEADER,
2450         .obj_size               = sizeof(struct tcp_sock),
2451         .slab_flags             = SLAB_DESTROY_BY_RCU,
2452         .twsk_prot              = &tcp_timewait_sock_ops,
2453         .rsk_prot               = &tcp_request_sock_ops,
2454         .h.hashinfo             = &tcp_hashinfo,
2455 #ifdef CONFIG_COMPAT
2456         .compat_setsockopt      = compat_tcp_setsockopt,
2457         .compat_getsockopt      = compat_tcp_getsockopt,
2458 #endif
2459 };
2460
2461
2462 static int __net_init tcp_sk_init(struct net *net)
2463 {
2464         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2465                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2466 }
2467
2468 static void __net_exit tcp_sk_exit(struct net *net)
2469 {
2470         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2471         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2472 }
2473
2474 static struct pernet_operations __net_initdata tcp_sk_ops = {
2475        .init = tcp_sk_init,
2476        .exit = tcp_sk_exit,
2477 };
2478
2479 void __init tcp_v4_init(void)
2480 {
2481         inet_hashinfo_init(&tcp_hashinfo);
2482         if (register_pernet_subsys(&tcp_sk_ops))
2483                 panic("Failed to create the TCP control socket.\n");
2484 }
2485
     /* Symbols defined in this file that are exported for use by modules. */
2486 EXPORT_SYMBOL(ipv4_specific);
2487 EXPORT_SYMBOL(tcp_hashinfo);
2488 EXPORT_SYMBOL(tcp_prot);
2489 EXPORT_SYMBOL(tcp_v4_conn_request);
2490 EXPORT_SYMBOL(tcp_v4_connect);
2491 EXPORT_SYMBOL(tcp_v4_do_rcv);
2492 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2493 EXPORT_SYMBOL(tcp_v4_send_check);
2494 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2495
     /* The proc registration helpers exist only when procfs is configured. */
2496 #ifdef CONFIG_PROC_FS
2497 EXPORT_SYMBOL(tcp_proc_register);
2498 EXPORT_SYMBOL(tcp_proc_unregister);
2499 #endif
2500 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2501