]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/netfilter/ipvs/ip_vs_proto_tcp.c
net: use the macros defined for the members of flowi
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
CommitLineData
1da177e4
LT
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
1da177e4
LT
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
9aada7ac
HE
16#define KMSG_COMPONENT "IPVS"
17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
1da177e4
LT
19#include <linux/kernel.h>
20#include <linux/ip.h>
21#include <linux/tcp.h> /* for tcphdr */
22#include <net/ip.h>
23#include <net/tcp.h> /* for csum_tcpudp_magic */
63f2c046 24#include <net/ip6_checksum.h>
af1e1cf0 25#include <linux/netfilter.h>
1da177e4
LT
26#include <linux/netfilter_ipv4.h>
27
28#include <net/ip_vs.h>
29
1da177e4 30static int
51ef348b 31tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
1da177e4
LT
32 int *verdict, struct ip_vs_conn **cpp)
33{
34 struct ip_vs_service *svc;
35 struct tcphdr _tcph, *th;
3c2e0505 36 struct ip_vs_iphdr iph;
1da177e4 37
51ef348b 38 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
3c2e0505
JV
39
40 th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
1da177e4
LT
41 if (th == NULL) {
42 *verdict = NF_DROP;
43 return 0;
44 }
45
190ecd27 46 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
1da177e4 47 if (th->syn &&
51ef348b
JV
48 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
49 th->dest))) {
190ecd27
JA
50 int ignored;
51
1da177e4
LT
52 if (ip_vs_todrop()) {
53 /*
54 * It seems that we are very loaded.
55 * We have to drop this packet :(
56 */
57 ip_vs_service_put(svc);
58 *verdict = NF_DROP;
59 return 0;
60 }
61
62 /*
63 * Let the virtual server select a real server for the
64 * incoming connection, and create a connection entry.
65 */
190ecd27
JA
66 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
67 if (!*cpp && !ignored) {
1da177e4
LT
68 *verdict = ip_vs_leave(svc, skb, pp);
69 return 0;
70 }
71 ip_vs_service_put(svc);
72 }
73 return 1;
74}
75
76
77static inline void
0bbdd42b
JV
78tcp_fast_csum_update(int af, struct tcphdr *tcph,
79 const union nf_inet_addr *oldip,
80 const union nf_inet_addr *newip,
014d730d 81 __be16 oldport, __be16 newport)
1da177e4 82{
0bbdd42b
JV
83#ifdef CONFIG_IP_VS_IPV6
84 if (af == AF_INET6)
85 tcph->check =
86 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
87 ip_vs_check_diff2(oldport, newport,
88 ~csum_unfold(tcph->check))));
89 else
90#endif
1da177e4 91 tcph->check =
0bbdd42b 92 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
f9214b26
AV
93 ip_vs_check_diff2(oldport, newport,
94 ~csum_unfold(tcph->check))));
1da177e4
LT
95}
96
97
503e81f6
SH
98static inline void
99tcp_partial_csum_update(int af, struct tcphdr *tcph,
100 const union nf_inet_addr *oldip,
101 const union nf_inet_addr *newip,
102 __be16 oldlen, __be16 newlen)
103{
104#ifdef CONFIG_IP_VS_IPV6
105 if (af == AF_INET6)
106 tcph->check =
5bc9068e 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
503e81f6 108 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 109 csum_unfold(tcph->check))));
503e81f6
SH
110 else
111#endif
112 tcph->check =
5bc9068e 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
503e81f6 114 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 115 csum_unfold(tcph->check))));
503e81f6
SH
116}
117
118
1da177e4 119static int
3db05fea 120tcp_snat_handler(struct sk_buff *skb,
1da177e4
LT
121 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
122{
123 struct tcphdr *tcph;
0bbdd42b 124 unsigned int tcphoff;
503e81f6 125 int oldlen;
8b27b10f 126 int payload_csum = 0;
0bbdd42b
JV
127
128#ifdef CONFIG_IP_VS_IPV6
129 if (cp->af == AF_INET6)
130 tcphoff = sizeof(struct ipv6hdr);
131 else
132#endif
133 tcphoff = ip_hdrlen(skb);
503e81f6 134 oldlen = skb->len - tcphoff;
1da177e4
LT
135
136 /* csum_check requires unshared skb */
3db05fea 137 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
138 return 0;
139
140 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
141 int ret;
142
1da177e4 143 /* Some checks before mangling */
0bbdd42b 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
145 return 0;
146
147 /* Call application helper if needed */
8b27b10f 148 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
1da177e4 149 return 0;
8b27b10f
JA
150 /* ret=2: csum update is needed after payload mangling */
151 if (ret == 1)
152 oldlen = skb->len - tcphoff;
153 else
154 payload_csum = 1;
1da177e4
LT
155 }
156
0bbdd42b 157 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
158 tcph->source = cp->vport;
159
160 /* Adjust TCP checksums */
503e81f6
SH
161 if (skb->ip_summed == CHECKSUM_PARTIAL) {
162 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
ca62059b
HH
163 htons(oldlen),
164 htons(skb->len - tcphoff));
8b27b10f 165 } else if (!payload_csum) {
1da177e4 166 /* Only port and addr are changed, do fast csum update */
0bbdd42b 167 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
1da177e4 168 cp->dport, cp->vport);
3db05fea 169 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
170 skb->ip_summed = (cp->app && pp->csum_check) ?
171 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
172 } else {
173 /* full checksum calculation */
174 tcph->check = 0;
3db05fea 175 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
176#ifdef CONFIG_IP_VS_IPV6
177 if (cp->af == AF_INET6)
178 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
179 &cp->caddr.in6,
180 skb->len - tcphoff,
181 cp->protocol, skb->csum);
182 else
183#endif
184 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
185 cp->caddr.ip,
186 skb->len - tcphoff,
187 cp->protocol,
188 skb->csum);
8b27b10f 189 skb->ip_summed = CHECKSUM_UNNECESSARY;
0bbdd42b 190
1da177e4
LT
191 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
192 pp->name, tcph->check,
193 (char*)&(tcph->check) - (char*)tcph);
194 }
195 return 1;
196}
197
198
199static int
3db05fea 200tcp_dnat_handler(struct sk_buff *skb,
1da177e4
LT
201 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
202{
203 struct tcphdr *tcph;
0bbdd42b 204 unsigned int tcphoff;
503e81f6 205 int oldlen;
8b27b10f 206 int payload_csum = 0;
0bbdd42b
JV
207
208#ifdef CONFIG_IP_VS_IPV6
209 if (cp->af == AF_INET6)
210 tcphoff = sizeof(struct ipv6hdr);
211 else
212#endif
213 tcphoff = ip_hdrlen(skb);
503e81f6 214 oldlen = skb->len - tcphoff;
1da177e4
LT
215
216 /* csum_check requires unshared skb */
3db05fea 217 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
218 return 0;
219
220 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
221 int ret;
222
1da177e4 223 /* Some checks before mangling */
0bbdd42b 224 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
225 return 0;
226
227 /*
228 * Attempt ip_vs_app call.
229 * It will fix ip_vs_conn and iph ack_seq stuff
230 */
8b27b10f 231 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
1da177e4 232 return 0;
8b27b10f
JA
233 /* ret=2: csum update is needed after payload mangling */
234 if (ret == 1)
235 oldlen = skb->len - tcphoff;
236 else
237 payload_csum = 1;
1da177e4
LT
238 }
239
0bbdd42b 240 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
241 tcph->dest = cp->dport;
242
243 /*
244 * Adjust TCP checksums
245 */
503e81f6 246 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5bc9068e 247 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
ca62059b
HH
248 htons(oldlen),
249 htons(skb->len - tcphoff));
8b27b10f 250 } else if (!payload_csum) {
1da177e4 251 /* Only port and addr are changed, do fast csum update */
0bbdd42b 252 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
1da177e4 253 cp->vport, cp->dport);
3db05fea 254 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
255 skb->ip_summed = (cp->app && pp->csum_check) ?
256 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
257 } else {
258 /* full checksum calculation */
259 tcph->check = 0;
3db05fea 260 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
261#ifdef CONFIG_IP_VS_IPV6
262 if (cp->af == AF_INET6)
263 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
264 &cp->daddr.in6,
265 skb->len - tcphoff,
266 cp->protocol, skb->csum);
267 else
268#endif
269 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
270 cp->daddr.ip,
271 skb->len - tcphoff,
272 cp->protocol,
273 skb->csum);
3db05fea 274 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4
LT
275 }
276 return 1;
277}
278
279
280static int
51ef348b 281tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
1da177e4 282{
51ef348b
JV
283 unsigned int tcphoff;
284
285#ifdef CONFIG_IP_VS_IPV6
286 if (af == AF_INET6)
287 tcphoff = sizeof(struct ipv6hdr);
288 else
289#endif
290 tcphoff = ip_hdrlen(skb);
1da177e4
LT
291
292 switch (skb->ip_summed) {
293 case CHECKSUM_NONE:
294 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
84fa7933 295 case CHECKSUM_COMPLETE:
51ef348b
JV
296#ifdef CONFIG_IP_VS_IPV6
297 if (af == AF_INET6) {
298 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
299 &ipv6_hdr(skb)->daddr,
300 skb->len - tcphoff,
301 ipv6_hdr(skb)->nexthdr,
302 skb->csum)) {
0d79641a 303 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
304 "Failed checksum for");
305 return 0;
306 }
307 } else
308#endif
309 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
310 ip_hdr(skb)->daddr,
311 skb->len - tcphoff,
312 ip_hdr(skb)->protocol,
313 skb->csum)) {
0d79641a 314 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
315 "Failed checksum for");
316 return 0;
317 }
1da177e4
LT
318 break;
319 default:
84fa7933 320 /* No need to checksum. */
1da177e4
LT
321 break;
322 }
323
324 return 1;
325}
326
327
328#define TCP_DIR_INPUT 0
329#define TCP_DIR_OUTPUT 4
330#define TCP_DIR_INPUT_ONLY 8
331
9b5b5cff 332static const int tcp_state_off[IP_VS_DIR_LAST] = {
1da177e4
LT
333 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
334 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
335 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
336};
337
338/*
339 * Timeout table[state]
340 */
341static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
342 [IP_VS_TCP_S_NONE] = 2*HZ,
343 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
344 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
345 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
346 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
347 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
348 [IP_VS_TCP_S_CLOSE] = 10*HZ,
349 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
350 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
351 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
352 [IP_VS_TCP_S_SYNACK] = 120*HZ,
353 [IP_VS_TCP_S_LAST] = 2*HZ,
354};
355
36cbd3dc 356static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
1da177e4
LT
357 [IP_VS_TCP_S_NONE] = "NONE",
358 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
359 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
360 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
361 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
362 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
363 [IP_VS_TCP_S_CLOSE] = "CLOSE",
364 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
365 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
366 [IP_VS_TCP_S_LISTEN] = "LISTEN",
367 [IP_VS_TCP_S_SYNACK] = "SYNACK",
368 [IP_VS_TCP_S_LAST] = "BUG!",
369};
370
371#define sNO IP_VS_TCP_S_NONE
372#define sES IP_VS_TCP_S_ESTABLISHED
373#define sSS IP_VS_TCP_S_SYN_SENT
374#define sSR IP_VS_TCP_S_SYN_RECV
375#define sFW IP_VS_TCP_S_FIN_WAIT
376#define sTW IP_VS_TCP_S_TIME_WAIT
377#define sCL IP_VS_TCP_S_CLOSE
378#define sCW IP_VS_TCP_S_CLOSE_WAIT
379#define sLA IP_VS_TCP_S_LAST_ACK
380#define sLI IP_VS_TCP_S_LISTEN
381#define sSA IP_VS_TCP_S_SYNACK
382
383struct tcp_states_t {
384 int next_state[IP_VS_TCP_S_LAST];
385};
386
387static const char * tcp_state_name(int state)
388{
389 if (state >= IP_VS_TCP_S_LAST)
390 return "ERR!";
391 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
392}
393
394static struct tcp_states_t tcp_states [] = {
395/* INPUT */
396/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
397/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
398/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
399/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
400/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
401
402/* OUTPUT */
403/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
404/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
405/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
406/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
407/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
408
409/* INPUT-ONLY */
410/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
411/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
412/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
413/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
414/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
415};
416
417static struct tcp_states_t tcp_states_dos [] = {
418/* INPUT */
419/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
420/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
421/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
422/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
423/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
424
425/* OUTPUT */
426/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
427/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
428/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
429/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
430/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
431
432/* INPUT-ONLY */
433/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
434/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
435/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
436/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
437/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
438};
439
440static struct tcp_states_t *tcp_state_table = tcp_states;
441
442
443static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
444{
445 int on = (flags & 1); /* secure_tcp */
446
447 /*
448 ** FIXME: change secure_tcp to independent sysctl var
449 ** or make it per-service or per-app because it is valid
450 ** for most if not for all of the applications. Something
451 ** like "capabilities" (flags) for each object.
452 */
453 tcp_state_table = (on? tcp_states_dos : tcp_states);
454}
455
456static int
457tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
458{
459 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
460 tcp_state_name_table, sname, to);
461}
462
463static inline int tcp_state_idx(struct tcphdr *th)
464{
465 if (th->rst)
466 return 3;
467 if (th->syn)
468 return 0;
469 if (th->fin)
470 return 1;
471 if (th->ack)
472 return 2;
473 return -1;
474}
475
476static inline void
477set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
478 int direction, struct tcphdr *th)
479{
480 int state_idx;
481 int new_state = IP_VS_TCP_S_CLOSE;
482 int state_off = tcp_state_off[direction];
483
484 /*
485 * Update state offset to INPUT_ONLY if necessary
486 * or delete NO_OUTPUT flag if output packet detected
487 */
488 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
489 if (state_off == TCP_DIR_OUTPUT)
490 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
491 else
492 state_off = TCP_DIR_INPUT_ONLY;
493 }
494
495 if ((state_idx = tcp_state_idx(th)) < 0) {
496 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
497 goto tcp_state_out;
498 }
499
500 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
501
502 tcp_state_out:
503 if (new_state != cp->state) {
504 struct ip_vs_dest *dest = cp->dest;
505
cfc78c5a
JV
506 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
507 "%s:%d state: %s->%s conn->refcnt:%d\n",
508 pp->name,
509 ((state_off == TCP_DIR_OUTPUT) ?
510 "output " : "input "),
511 th->syn ? 'S' : '.',
512 th->fin ? 'F' : '.',
513 th->ack ? 'A' : '.',
514 th->rst ? 'R' : '.',
515 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
516 ntohs(cp->dport),
517 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
518 ntohs(cp->cport),
519 tcp_state_name(cp->state),
520 tcp_state_name(new_state),
521 atomic_read(&cp->refcnt));
522
1da177e4
LT
523 if (dest) {
524 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
525 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
526 atomic_dec(&dest->activeconns);
527 atomic_inc(&dest->inactconns);
528 cp->flags |= IP_VS_CONN_F_INACTIVE;
529 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
530 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
531 atomic_inc(&dest->activeconns);
532 atomic_dec(&dest->inactconns);
533 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
534 }
535 }
536 }
537
538 cp->timeout = pp->timeout_table[cp->state = new_state];
539}
540
541
542/*
543 * Handle state transitions
544 */
545static int
546tcp_state_transition(struct ip_vs_conn *cp, int direction,
547 const struct sk_buff *skb,
548 struct ip_vs_protocol *pp)
549{
550 struct tcphdr _tcph, *th;
551
0bbdd42b
JV
552#ifdef CONFIG_IP_VS_IPV6
553 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
554#else
555 int ihl = ip_hdrlen(skb);
556#endif
557
558 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
1da177e4
LT
559 if (th == NULL)
560 return 0;
561
562 spin_lock(&cp->lock);
563 set_tcp_state(pp, cp, direction, th);
564 spin_unlock(&cp->lock);
565
566 return 1;
567}
568
569
570/*
571 * Hash table for TCP application incarnations
572 */
573#define TCP_APP_TAB_BITS 4
574#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
575#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
576
577static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
578static DEFINE_SPINLOCK(tcp_app_lock);
579
75e7ce66 580static inline __u16 tcp_app_hashkey(__be16 port)
1da177e4 581{
75e7ce66
AV
582 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
583 & TCP_APP_TAB_MASK;
1da177e4
LT
584}
585
586
587static int tcp_register_app(struct ip_vs_app *inc)
588{
589 struct ip_vs_app *i;
75e7ce66
AV
590 __u16 hash;
591 __be16 port = inc->port;
1da177e4
LT
592 int ret = 0;
593
594 hash = tcp_app_hashkey(port);
595
596 spin_lock_bh(&tcp_app_lock);
597 list_for_each_entry(i, &tcp_apps[hash], p_list) {
598 if (i->port == port) {
599 ret = -EEXIST;
600 goto out;
601 }
602 }
603 list_add(&inc->p_list, &tcp_apps[hash]);
604 atomic_inc(&ip_vs_protocol_tcp.appcnt);
605
606 out:
607 spin_unlock_bh(&tcp_app_lock);
608 return ret;
609}
610
611
612static void
613tcp_unregister_app(struct ip_vs_app *inc)
614{
615 spin_lock_bh(&tcp_app_lock);
616 atomic_dec(&ip_vs_protocol_tcp.appcnt);
617 list_del(&inc->p_list);
618 spin_unlock_bh(&tcp_app_lock);
619}
620
621
622static int
623tcp_app_conn_bind(struct ip_vs_conn *cp)
624{
625 int hash;
626 struct ip_vs_app *inc;
627 int result = 0;
628
629 /* Default binding: bind app only for NAT */
630 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
631 return 0;
632
633 /* Lookup application incarnations and bind the right one */
634 hash = tcp_app_hashkey(cp->vport);
635
636 spin_lock(&tcp_app_lock);
637 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
638 if (inc->port == cp->vport) {
639 if (unlikely(!ip_vs_app_inc_get(inc)))
640 break;
641 spin_unlock(&tcp_app_lock);
642
1e3e238e 643 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
cfc78c5a
JV
644 "%s:%u to app %s on port %u\n",
645 __func__,
646 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
647 ntohs(cp->cport),
648 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
649 ntohs(cp->vport),
650 inc->name, ntohs(inc->port));
651
1da177e4
LT
652 cp->app = inc;
653 if (inc->init_conn)
654 result = inc->init_conn(inc, cp);
655 goto out;
656 }
657 }
658 spin_unlock(&tcp_app_lock);
659
660 out:
661 return result;
662}
663
664
665/*
666 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
667 */
668void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
669{
670 spin_lock(&cp->lock);
671 cp->state = IP_VS_TCP_S_LISTEN;
672 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
673 spin_unlock(&cp->lock);
674}
675
676
ba602a81 677static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
1da177e4
LT
678{
679 IP_VS_INIT_HASH_TABLE(tcp_apps);
680 pp->timeout_table = tcp_timeouts;
681}
682
683
/* Protocol exit hook: TCP has nothing to tear down. */
static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
	/* intentionally empty */
}
687
688
689struct ip_vs_protocol ip_vs_protocol_tcp = {
690 .name = "TCP",
691 .protocol = IPPROTO_TCP,
2ad17def 692 .num_states = IP_VS_TCP_S_LAST,
1da177e4
LT
693 .dont_defrag = 0,
694 .appcnt = ATOMIC_INIT(0),
ba602a81
DM
695 .init = ip_vs_tcp_init,
696 .exit = ip_vs_tcp_exit,
1da177e4
LT
697 .register_app = tcp_register_app,
698 .unregister_app = tcp_unregister_app,
699 .conn_schedule = tcp_conn_schedule,
5c0d2374
SH
700 .conn_in_get = ip_vs_conn_in_get_proto,
701 .conn_out_get = ip_vs_conn_out_get_proto,
1da177e4
LT
702 .snat_handler = tcp_snat_handler,
703 .dnat_handler = tcp_dnat_handler,
704 .csum_check = tcp_csum_check,
705 .state_name = tcp_state_name,
706 .state_transition = tcp_state_transition,
707 .app_conn_bind = tcp_app_conn_bind,
708 .debug_packet = ip_vs_tcpudp_debug_packet,
709 .timeout_change = tcp_timeout_change,
710 .set_state_timeout = tcp_set_state_timeout,
711};