/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *      David S. Miller :       New socket lookup architecture.
 *                              This code is dedicated to John Dyson.
 *      David S. Miller :       Change semantics of established hash,
 *                              half is devoted to TIME_WAIT sockets
 *                              and the rest go in the other half.
 *      Andi Kleen :            Add support for syncookies and fixed
 *                              some bugs: ip options weren't passed to
 *                              the TCP layer, missed a check for an
 *                              ACK bit.
 *      Andi Kleen :            Implemented fast path mtu discovery.
 *                              Fixed many serious bugs in the
 *                              request_sock handling and moved
 *                              most of it into the af independent code.
 *                              Added tail drop and some other bugfixes.
 *                              Added new listen semantics.
 *      Mike McLagan :          Routing by source
 *      Juan Jose Ciarlante :   ip_dynaddr bits
 *      Andi Kleen :            various fixes.
 *      Vitaly E. Lavrov :      Transparent proxy revived after year
 *                              coma.
 *      Andi Kleen :            Fix new listen.
 *      Andi Kleen :            Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option,
 *      Alexey Kuznetsov                which allows both IPv4 and IPv6
 *                              sockets to bind a single port at the
 *                              same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        return NULL;
}
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

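/* tcp_twsk_unique() above is what lets a new connect() take over a port
 * pair still held in TIME-WAIT.  The 65535 + 2 offset puts write_seq past
 * the largest unscaled window the old peer could still be offering, so
 * the two incarnations' sequence spaces cannot overlap even if the peer
 * kept state; PAWS timestamps guard against any remaining stragglers.
 */
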
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state
                 * TIME-WAIT and initialize rx_opt.ts_recent from it,
                 * when trying new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

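/* The order of operations in tcp_v4_connect() above is deliberate: the
 * socket moves to SYN-SENT and is hashed (inet_hash_connect() picks the
 * ephemeral port) before the route is rebound with the final port pair
 * via ip_route_newports(), and only then does tcp_connect() actually
 * emit the SYN.  The failure path must therefore unhash the socket and
 * release the port again, which tcp_set_state(sk, TCP_CLOSE) takes care
 * of, per the comment above.
 */
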
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
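
/* A concrete example of the fast path above (illustrative numbers): if
 * this host sends DF-marked 1500-byte segments toward a 1400-byte tunnel
 * hop, the router's ICMP_FRAG_NEEDED (carrying mtu = 1400 in the "info"
 * argument passed down from tcp_v4_err()) lands here, tcp_sync_mss()
 * shrinks the cached MSS to fit, and tcp_simple_retransmit() resends the
 * dropped segment immediately instead of waiting for the retransmit
 * timer to fire.
 */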

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;
        struct net *net = dev_net(skb->dev);

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                         iph->saddr, th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can, f.e., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows considering as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         * --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(len, inet->saddr,
                                          inet->daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So we build the reply only based on the parameters that
 *              arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (skb->rtable->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                    key, ip_hdr(skb)->daddr,
                                    ip_hdr(skb)->saddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        net = dev_net(skb->dst->dev);
        ip_send_reply(net->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
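
/* The seq/ack choice above follows RFC 793's reset generation rules: if
 * the offending segment carried an ACK, the RST is sent with seq equal
 * to that ACK value and no ACK flag; otherwise seq stays 0 (rep was
 * zeroed by memset) and the RST ACKs exactly the sequence space the
 * segment occupied, with SYN and FIN each counting as one unit.
 */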

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
        struct net *net = dev_net(skb->dst->dev);

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;

        ip_send_reply(net->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent,
                        0,
                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                                struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = tcp_hdr(skb);

                th->check = tcp_v4_check(skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        dst_release(dst);
        return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
        return __tcp_v4_send_synack(sk, req, NULL);
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(tcp_hdr(skb)->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return &tp->md5sig_info->keys4[i].base;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].base.key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memmove(&tp->md5sig_info->keys4[i],
                                        &tp->md5sig_info->keys4[i+1],
                                        (tp->md5sig_info->entries4 - i) *
                                         sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of key keys,
         * the crypto element, and then decrement our
         * hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].base.key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4 = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -EINVAL;

                tp->md5sig_info = p;
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}

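/* For reference, a minimal userspace sketch of how this setsockopt path
 * is exercised (illustrative only; the peer address and key are made up):
 *
 *      struct tcp_md5sig md5;
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      memset(&md5, 0, sizeof(md5));
 *      sin->sin_family = AF_INET;
 *      sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *      md5.tcpm_keylen = 6;
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing a zero tcpm_keylen instead deletes the key for that address,
 * as handled by the tcp_v4_md5_do_del() branch above.
 */
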
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;

        bp = &hp->md5_blk.ip4;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        sg_init_one(&sg, bp, sizeof(*bp));
        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

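/* The tcp4_pseudohdr filled in above is the standard TCP/IPv4
 * pseudo-header, which RFC 2385 prepends to the signed data:
 *
 *      0      7 8     15 16              31
 *      +--------+--------+-----------------+
 *      |           source address          |
 *      +--------+--------+-----------------+
 *      |         destination address       |
 *      +--------+--------+-----------------+
 *      |  zero  | proto  |   TCP length    |
 *      +--------+--------+-----------------+
 */
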
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;
        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
                        struct sock *sk, struct request_sock *req,
                        struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;
        struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else if (req) {
                saddr = inet_rsk(req)->loc_addr;
                daddr = inet_rsk(req)->rmt_addr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;

        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return 1;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web server, which
                 * contains information interesting only for windows'
                 * users) do not send their stamp in SYN. It is easy case.
                 * We simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
                req->cookie_ts = tmp_opt.tstamp_ok;
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from " NIPQUAD_FMT "/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}

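/* Note how the two exit paths above differ: a normal request is hashed
 * into the SYN queue with inet_csk_reqsk_queue_hash_add() and lives until
 * the handshake completes, while a cookie-mode request is freed right
 * after the SYN-ACK goes out (the "|| want_cookie" falls through to
 * drop_and_free); all handshake state is encoded in the cookie ISN that
 * cookie_v4_init_sequence() computed, so nothing needs to be remembered.
 */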

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto exit;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(newsk, dst);

        newtp              = tcp_sk(newsk);
        newinet            = inet_sk(newsk);
        ireq               = inet_rsk(req);
        newinet->daddr     = ireq->rmt_addr;
        newinet->rcv_saddr = ireq->loc_addr;
        newinet->saddr     = ireq->loc_addr;
        newinet->opt       = ireq->opt;
        ireq->opt          = NULL;
        newinet->mc_index  = inet_iif(skb);
        newinet->mc_ttl    = ip_hdr(skb)->ttl;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (newinet->opt)
                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
        newinet->id = newtp->write_seq ^ jiffies;

        tcp_mtup_init(newsk);
        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
                if (newkey != NULL)
                        tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
                                          newkey, key->keylen);
                newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        }
#endif

        __inet_hash_nolisten(newsk);
        __inet_inherit_port(sk, newsk);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        const struct iphdr *iph = ip_hdr(skb);
        struct sock *nsk;
        struct request_sock **prev;
        /* Find possible connection requests. */
        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                       iph->saddr, iph->daddr);
        if (req)
                return tcp_check_req(sk, skb, req, prev);

        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                        th->source, iph->daddr, th->dest, inet_iif(skb));

        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
                        bh_lock_sock(nsk);
                        return nsk;
                }
                inet_twsk_put(inet_twsk(nsk));
                return NULL;
        }

#ifdef CONFIG_SYN_COOKIES
        if (!th->rst && !th->syn && th->ack)
                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
        return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                if (!tcp_v4_check(skb->len, iph->saddr,
                                  iph->daddr, skb->csum)) {
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                        return 0;
                }
        }

        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
                                       skb->len, IPPROTO_TCP, 0);

        if (skb->len <= 76) {
                return __skb_checksum_complete(skb);
        }
        return 0;
}
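
/* Two outcomes are possible above when the NIC did not already verify
 * the checksum: short segments (<= 76 bytes) are checksummed in full
 * right away, which is cheap; longer ones only have skb->csum seeded
 * with the pseudo-header sum here, with the full verification deferred
 * to tcp_checksum_complete() or the copy-to-user path later on.
 */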


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
        /*
         * We really want to reject the packet as early as possible
         * if:
         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
         *  o There is an MD5 option and we're not expecting one
         */
        if (tcp_v4_inbound_md5_hash(sk, skb))
                goto discard;
#endif

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                TCP_CHECK_TIMER(sk);
                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                        rsk = sk;
                        goto reset;
                }
                TCP_CHECK_TIMER(sk);
                return 0;
        }

        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb)) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        }

        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
                rsk = sk;
                goto reset;
        }
        TCP_CHECK_TIMER(sk);
        return 0;

reset:
        tcp_v4_send_reset(rsk, skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}

/*
 *      From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;
        struct sock *sk;
        int ret;
        struct net *net = dev_net(skb->dev);

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = tcp_hdr(skb);

        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
                goto bad_packet;

        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        TCP_SKB_CB(skb)->flags   = iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
                        th->source, iph->daddr, th->dest, inet_iif(skb));
        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
        nf_reset(skb);

        if (sk_filter(sk, skb))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock_nested(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
                struct tcp_sock *tp = tcp_sk(sk);
                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                        tp->ucopy.dma_chan = get_softnet_dma();
                if (tp->ucopy.dma_chan)
                        ret = tcp_v4_do_rcv(sk, skb);
                else
#endif
                {
                        if (!tcp_prequeue(sk, skb))
                                ret = tcp_v4_do_rcv(sk, skb);
                }
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                        &tcp_hashinfo,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}

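/* tcp_v4_rcv() above is the softirq entry point for every inbound IPv4
 * TCP segment.  Once the owning socket is found, the segment takes one
 * of three routes: processed directly via tcp_v4_do_rcv() when the
 * socket is unowned, handed to the prequeue for a sleeping reader to
 * process (tcp_prequeue()), or appended to the backlog with
 * sk_add_backlog() when a user context owns the socket, to be replayed
 * when that context releases it.
 */
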
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
        struct inet_peer *peer = NULL;
        int release_it = 0;

        if (!rt || rt->rt_dst != inet->daddr) {
                peer = inet_getpeer(inet->daddr, 1);
                release_it = 1;
        } else {
                if (!rt->peer)
                        rt_bind_peer(rt, 1);
                peer = rt->peer;
        }

        if (peer) {
                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
                        peer->tcp_ts = tp->rx_opt.ts_recent;
                }
                if (release_it)
                        inet_putpeer(peer);
                return 1;
        }

        return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

        if (peer) {
                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
                        peer->tcp_ts       = tcptw->tw_ts_recent;
                }
                inet_putpeer(peer);
                return 1;
        }

        return 0;
}

struct inet_connection_sock_af_ops ipv4_specific = {
        .queue_xmit        = ip_queue_xmit,
        .send_check        = tcp_v4_send_check,
        .rebuild_header    = inet_sk_rebuild_header,
        .conn_request      = tcp_v4_conn_request,
        .syn_recv_sock     = tcp_v4_syn_recv_sock,
        .remember_stamp    = tcp_v4_remember_stamp,
        .net_header_len    = sizeof(struct iphdr),
        .setsockopt        = ip_setsockopt,
        .getsockopt        = ip_getsockopt,
        .addr2sockaddr     = inet_csk_addr2sockaddr,
        .sockaddr_len      = sizeof(struct sockaddr_in),
        .bind_conflict     = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
        .compat_setsockopt = compat_ip_setsockopt,
        .compat_getsockopt = compat_ip_getsockopt,
#endif
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup     = tcp_v4_md5_lookup,
        .calc_md5_hash  = tcp_v4_md5_hash_skb,
        .md5_add        = tcp_v4_md5_add_func,
        .md5_parse      = tcp_v4_parse_md5_keys,
};
#endif

1da177e4
LT
1762/* NOTE: A lot of things set to zero explicitly by call to
1763 * sk_alloc() so need not be done here.
1764 */
1765static int tcp_v4_init_sock(struct sock *sk)
1766{
6687e988 1767 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4
LT
1768 struct tcp_sock *tp = tcp_sk(sk);
1769
1770 skb_queue_head_init(&tp->out_of_order_queue);
1771 tcp_init_xmit_timers(sk);
1772 tcp_prequeue_init(tp);
1773
6687e988 1774 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1da177e4
LT
1775 tp->mdev = TCP_TIMEOUT_INIT;
1776
1777 /* So many TCP implementations out there (incorrectly) count the
1778 * initial SYN frame in their delayed-ACK and congestion control
1779 * algorithms that we must have the following bandaid to talk
1780 * efficiently to them. -DaveM
1781 */
1782 tp->snd_cwnd = 2;
1783
1784 /* See draft-stevens-tcpca-spec-01 for discussion of the
1785 * initialization of these values.
1786 */
1787 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1788 tp->snd_cwnd_clamp = ~0;
c1b4a7e6 1789 tp->mss_cache = 536;
1da177e4
LT
1790
1791 tp->reordering = sysctl_tcp_reordering;
6687e988 1792 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1da177e4
LT
1793
1794 sk->sk_state = TCP_CLOSE;
1795
1796 sk->sk_write_space = sk_stream_write_space;
1797 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1798
8292a17a 1799 icsk->icsk_af_ops = &ipv4_specific;
d83d8461 1800 icsk->icsk_sync_mss = tcp_sync_mss;
cfb6eeb4
YH
1801#ifdef CONFIG_TCP_MD5SIG
1802 tp->af_specific = &tcp_sock_ipv4_specific;
1803#endif
1da177e4
LT
1804
1805 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1806 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1807
1808 atomic_inc(&tcp_sockets_allocated);
1809
1810 return 0;
1811}
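
/*
 * Editorial note (assumption, not in the original file): the 536-byte
 * mss_cache default above is the classic RFC 879 value, i.e. the
 * 576-byte minimum IPv4 datagram size minus 20 bytes of IP header and
 * 20 bytes of TCP header; it is only a conservative placeholder until
 * tcp_sync_mss() computes a real value from the route MTU.
 */
#if 0	/* example only: how the default relates to the minimum MTU */
enum {
	MIN_IPV4_DGRAM	= 576,				/* RFC 791/879 */
	DEFAULT_MSS	= MIN_IPV4_DGRAM - 20 - 20,	/* == 536 */
};
#endif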

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
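
/*
 * Editorial note (assumption, not in the original file): empty_bucket()
 * is deliberately called without the per-bucket lock (see the "lockless
 * fast path" in established_get_first() below).  The test may race with
 * concurrent inserts, which is acceptable for a /proc snapshot: a bucket
 * that becomes non-empty after the check is simply skipped by this pass.
 */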

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;
		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		read_lock_bh(lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for next non empty bucket */
		while (++st->bucket < tcp_hashinfo.ehash_size &&
				empty_bucket(st))
			;
		if (st->bucket >= tcp_hashinfo.ehash_size)
			return NULL;

		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
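
/*
 * Editorial sketch (assumption, not part of the original file): the
 * seq_file core drives the three callbacks above roughly like this for
 * every read() on /proc/net/tcp:
 */
#if 0	/* example only; buffer management and error paths omitted */
	void *p = tcp_seq_start(seq, &pos);	/* takes the needed lock */
	while (p) {
		tcp4_seq_show(seq, p);		/* format one row */
		p = tcp_seq_next(seq, p, &pos);	/* may advance LISTENING ->
						 * ESTABLISHED -> TIME_WAIT */
	}
	tcp_seq_stop(seq, p);			/* drops whatever lock is held */
#endif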

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->daddr;
	__be32 src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
					     (tp->rcv_nxt - tp->copied_seq),
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
		len);
}

static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
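
/*
 * Editorial sketch (assumption, not part of the original file): a
 * minimal userspace reader for the rows the functions above emit.  The
 * field layout follows the seq_printf() formats in get_tcp4_sock();
 * everything below is illustrative, not a kernel API.
 */
#if 0	/* example only; build separately as an ordinary C program */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/tcp", "r");
	char line[256];
	unsigned int laddr, lport, raddr, rport, state;

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		/* "  sl  local:port rem:port st ..." - addresses and
		 * ports are hex, addresses in network byte order. */
		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%04X -> %08X:%04X state %02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
#endif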

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
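
/*
 * Editorial sketch (assumption): tcp_prot is not registered in this
 * file; the INET socket layer hooks it up at boot, roughly as
 * net/ipv4/af_inet.c does:
 */
#if 0	/* example only */
	rc = proto_register(&tcp_prot, 1);	/* 1 => allocate slab caches */
#endif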

static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init = tcp_sk_init,
	.exit = tcp_sk_exit,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_device(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);