/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

int sysctl_tcp_abort_on_overflow __read_mostly;

struct inet_timewait_death_row tcp_death_row = {
	.sysctl_max_tw_buckets = NR_FILE * 2,
	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
	.hashinfo	= &tcp_hashinfo,
	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
					    (unsigned long)&tcp_death_row),
	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
					     inet_twdr_twkill_work),
/* Short-time timewait calendar */

	.twcal_hand	= -1,
	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
					    (unsigned long)&tcp_death_row),
};

EXPORT_SYMBOL_GPL(tcp_death_row);

static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return (seq == e_win && seq == end_seq);
}

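/* Added commentary (not from the original file): with s_win = 100 and
 * e_win = 110, a segment seq=95/end_seq=105 is accepted because it overlaps
 * the window, seq=100 is accepted as a probe at the left edge, and a
 * zero-length seq=110/end_seq=110 is accepted only via the final test,
 * since a bare segment exactly at the right edge still deserves an ACK;
 * seq=111 would be rejected.
 */
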
/*
 * * The main purpose of TIME-WAIT state is to close the connection
 *   gracefully, when one of the ends sits in LAST-ACK or CLOSING
 *   retransmitting FIN (and, probably, a tail of data) and one or more of
 *   our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which leads to the wrong conclusion that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   the maximal retransmission timeout by enough to allow for the loss of
 *   one (or more) segments sent by the peer and of our ACKs. This time may
 *   be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please, read the section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. It means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	struct tcp_options_received tmp_opt;
	int paws_reject = 0;

	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tmp_opt.tstamp_ok = 1;
		tcp_parse_options(skb, &tmp_opt, 1, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrives after the half-duplex
		 * close, reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}

		/* I am ashamed, but I failed to make it more elegant.
		 * Yes, it is a direct reference to IP, which is impossible
		 * to generalize to IPv6. Taking into account that IPv6
		 * does not understand recycling in any case, it is not
		 * a big problem in practice. --ANK */
		if (tw->tw_family == AF_INET &&
		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
		    tcp_v4_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 * Now real TIME-WAIT state.
	 *
	 * RFC 1122:
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1)  assigns its initial sequence number for the new
	 *      connection to be larger than the largest sequence
	 *      number it used on the previous connection incarnation,
	 *      and
	 *
	 * (2)  returns to TIME-WAIT state if the SYN turns out
	 *      to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* An in-window segment may only be a reset or a bare ack. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw, &tcp_death_row);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All the segments are ACKed immediately.

	   The only exception is a new SYN. We accept it, if it is
	   not an old duplicate and we are not in danger of being killed
	   by delayed old duplicates. The RFC check, that the SYN must carry
	   a newer sequence number, works only at rates <40Mbit/sec.
	   However, if PAWS works, it is reliable AND, even more,
	   we may even relax the silly seq space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns
	   out to be an old duplicate (i.e. we receive an RST in reply to
	   our SYN-ACK), we must return the socket to time-wait state. That
	   is not good, but not fatal yet.
	 */
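
	/* Added commentary: the TCP_TW_SYN path below implements the RFC 1122
	 * reopening rule quoted above. The new ISN is placed 65535 + 2 past
	 * tw_snd_nxt so that it clears even a maximal unscaled window of the
	 * old incarnation, satisfying condition (1); the wrap check then
	 * avoids handing out an ISN of zero.
	 */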
	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN it may be both an old duplicate
		 * and a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by the caller.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

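/*
 * Added illustration (not part of the original file): a minimal sketch of
 * how the ipv4 receive path is expected to dispatch on the status returned
 * above; the labels and helpers are assumptions modelled on tcp_v4_rcv(),
 * not verbatim kernel code.
 */
#if 0
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN:
		/* Acceptable reopening SYN: look up a listener and
		 * process the segment as a fresh connection request. */
		goto process;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);	/* ACK from the timewait bucket */
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;		/* answer with a reset */
	case TCP_TW_SUCCESS:
		break;				/* segment fully consumed */
	}
#endif
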
/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	struct inet_timewait_sock *tw = NULL;
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	int recycle_ok = 0;

	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);

	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
		tw = inet_twsk_alloc(sk, state);

	if (tw != NULL) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
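		/* Added commentary: (icsk_rto << 2) - (icsk_rto >> 1) is
		 * 3.5 * RTO, the "exceeds the maximal retransmission
		 * timeout" margin discussed in the TIME-WAIT comment above;
		 * it serves as the recycle-mode timeout and as a floor for
		 * timeo below.
		 */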
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);
			struct inet6_timewait_sock *tw6;

			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
			tw6 = inet6_twsk((struct sock *)tw);
			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
			tw->tw_ipv6only = np->ipv6only;
		}
#endif

#ifdef CONFIG_TCP_MD5SIG
		/*
		 * The timewait bucket does not have the key DB from the
		 * sock structure. We just make a quick copy of the
		 * md5 key being used (if indeed we are using one)
		 * so the timewait ack generating code has the key.
		 */
		do {
			struct tcp_md5sig_key *key;
			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
			tcptw->tw_md5_keylen = 0;
			key = tp->af_specific->md5_lookup(sk, sk);
			if (key != NULL) {
				memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
				tcptw->tw_md5_keylen = key->keylen;
				if (tcp_alloc_md5sig_pool(sk) == NULL)
					BUG();
			}
		} while (0);
#endif

		/* Linkage updates. */
		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (recycle_ok) {
			tw->tw_timeout = rto;
		} else {
			tw->tw_timeout = TCP_TIMEWAIT_LEN;
			if (state == TCP_TIME_WAIT)
				timeo = TCP_TIMEWAIT_LEN;
		}

		inet_twsk_schedule(tw, &tcp_death_row, timeo,
				   TCP_TIMEWAIT_LEN);
		inet_twsk_put(tw);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
	if (twsk->tw_md5_keylen)
		tcp_free_md5sig_pool();
#endif
}

EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
					 struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could save lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

	if (newsk != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp;

		/* Now setup tcp_sock */
		newtp = tcp_sk(newsk);
		newtp->pred_flags = 0;
		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
		newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
		newtp->snd_up = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);

		tcp_init_wl(newtp, treq->rcv_isn);

		newtp->srtt = 0;
		newtp->mdev = TCP_TIMEOUT_INIT;
		newicsk->icsk_rto = TCP_TIMEOUT_INIT;

		newtp->packets_out = 0;
		newtp->retrans_out = 0;
		newtp->sacked_out = 0;
		newtp->fackets_out = 0;
		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;

		/* So many TCP implementations out there (incorrectly) count the
		 * initial SYN frame in their delayed-ACK and congestion control
		 * algorithms that we must have the following bandaid to talk
		 * efficiently to them.  -DaveM
		 */
		newtp->snd_cwnd = 2;
		newtp->snd_cwnd_cnt = 0;
		newtp->bytes_acked = 0;

		newtp->frto_counter = 0;
		newtp->frto_highmark = 0;

		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

		tcp_set_ca_state(newsk, TCP_CA_Open);
		tcp_init_xmit_timers(newsk);
		skb_queue_head_init(&newtp->out_of_order_queue);
		newtp->write_seq = treq->snt_isn + 1;
		newtp->pushed_seq = newtp->write_seq;

		newtp->rx_opt.saw_tstamp = 0;

		newtp->rx_opt.dsack = 0;
		newtp->rx_opt.num_sacks = 0;

		newtp->urg_data = 0;

		if (sock_flag(newsk, SOCK_KEEPOPEN))
			inet_csk_reset_keepalive_timer(newsk,
						       keepalive_time_when(newtp));

		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
			if (sysctl_tcp_fack)
				tcp_enable_fack(newtp);
		}
		newtp->window_clamp = req->window_clamp;
		newtp->rcv_ssthresh = req->rcv_wnd;
		newtp->rcv_wnd = req->rcv_wnd;
		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
		if (newtp->rx_opt.wscale_ok) {
			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
		} else {
			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
			newtp->window_clamp = min(newtp->window_clamp, 65535U);
		}
		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
				  newtp->rx_opt.snd_wscale);
		newtp->max_window = newtp->snd_wnd;

		if (newtp->rx_opt.tstamp_ok) {
			newtp->rx_opt.ts_recent = req->ts_recent;
			newtp->rx_opt.ts_recent_stamp = get_seconds();
			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			newtp->rx_opt.ts_recent_stamp = 0;
			newtp->tcp_header_len = sizeof(struct tcphdr);
		}
#ifdef CONFIG_TCP_MD5SIG
		newtp->md5sig_info = NULL;	/*XXX*/
		if (newtp->af_specific->md5_lookup(sk, newsk))
			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
		newtp->rx_opt.mss_clamp = req->mss;
		TCP_ECN_openreq_child(newtp, req);

		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
	}
	return newsk;
}

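/*
 * Added illustration: a condensed sketch of how an af-specific
 * syn_recv_sock implementation (here modelled on tcp_v4_syn_recv_sock())
 * is expected to use the helper above; route lookup and error handling are
 * elided, and the exact calls are assumptions rather than verbatim code.
 */
#if 0
	struct sock *newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_overflow;
	/* ... fill in the af-specific parts: addresses, route, MTU ... */
	__inet_hash_nolisten(newsk);
	__inet_inherit_port(sk, newsk);
	return newsk;
#endif
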
/*
 * Process an incoming packet for SYN_RECV sockets represented
 * as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	struct tcp_options_received tmp_opt;
	struct sock *child;

	if ((th->doff > (sizeof(struct tcphdr)>>2)) && (req->ts_recent)) {
		tmp_opt.tstamp_ok = 1;
		tcp_parse_options(skb, &tmp_opt, 1, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			/* We do not store the true stamp, but it is not
			 * required, it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send an ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 * describe the SYN-RECV state. All the description
		 * is wrong, we cannot trust it and should
		 * rely only on common sense and implementation
		 * experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 */
		req->rsk_ops->rtx_syn_ack(sk, req);
		return NULL;
	}

	/* Further reproduces the section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however: it fails to work only
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party. We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid. Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   The malicious sender sends identical SYNs (and thus identical
	   sequence numbers) to both A and B:

	   A: gets SYN, seq=7
	   B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

	   A: sends SYN|ACK, seq=7, ack_seq=8
	   B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, and the ACK test passes. So
	   does the sequence test, the SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK. Otherwise, we create an established connection. Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless and rare. The possibility is about
	   the same as us discovering intelligent life on another planet
	   tomorrow.

	   But generally, we should (the RFC lies!) accept an ACK
	   from a SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating the protocol. All the checks must be made
	   before an attempt to create a socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 * and the incoming segment acknowledges something not yet
	 * sent (the segment carries an unacceptable ACK) ...
	 * a reset is sent."
	 *
	 * Invalid ACK: a reset will be sent by the listening socket.
	 */
	if ((flg & TCP_FLAG_ACK) &&
	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too early or too late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
		req->ts_recent = tmp_opt.rcv_tsval;

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK is not set, just silently drop the packet.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

	/* OK, ACK is valid, create the big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * the socket is created, wait for troubles.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);
	return child;

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
	if (!(flg & TCP_FLAG_RST))
		req->rsk_ops->send_reset(sk, skb);

	inet_csk_reqsk_queue_drop(sk, req, prev);
	return NULL;
}

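/*
 * Added illustration: a minimal sketch of the expected caller, modelled on
 * tcp_v4_hnd_req(); the lookup arguments and surrounding declarations are
 * assumptions and not verbatim kernel code.
 */
#if 0
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct request_sock **prev;
	/* Find a pending connection request matching this segment... */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		/* ...and let tcp_check_req() validate it and, if the ACK
		 * completes the handshake, create the child socket. */
		return tcp_check_req(sk, skb, req, prev);
	return sk;
}
#endif
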
/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	if (!sock_owned_by_user(child)) {
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
					    skb->len);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent, 0);
	} else {
		/* Alas, it is possible again, because we do the lookup
		 * in the main socket hash table and the lock on the
		 * listening socket does not protect us more.
		 */
		sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}

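/*
 * Added illustration: how tcp_v4_do_rcv() is expected to hand a segment
 * that completed the handshake to the child socket; a condensed sketch,
 * with the surrounding labels assumed rather than quoted.
 */
#if 0
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;	/* child asked for a reset */
				goto reset;
			}
			return 0;
		}
	}
#endif
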
EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);