]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/netfilter/ipvs/ip_vs_proto_udp.c
xps: Transmit Packet Steering
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_proto_udp.c
CommitLineData
1da177e4
LT
/*
 * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

15
9aada7ac
HE
16#define KMSG_COMPONENT "IPVS"
17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
14c85021
ACM
19#include <linux/in.h>
20#include <linux/ip.h>
1da177e4 21#include <linux/kernel.h>
af1e1cf0 22#include <linux/netfilter.h>
1da177e4 23#include <linux/netfilter_ipv4.h>
14c85021 24#include <linux/udp.h>
1da177e4
LT
25
26#include <net/ip_vs.h>
c9bdd4b5 27#include <net/ip.h>
63f2c046 28#include <net/ip6_checksum.h>
1da177e4 29
1da177e4 30static int
51ef348b 31udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
1da177e4
LT
32 int *verdict, struct ip_vs_conn **cpp)
33{
34 struct ip_vs_service *svc;
35 struct udphdr _udph, *uh;
3c2e0505 36 struct ip_vs_iphdr iph;
1da177e4 37
51ef348b 38 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
3c2e0505
JV
39
40 uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
1da177e4
LT
41 if (uh == NULL) {
42 *verdict = NF_DROP;
43 return 0;
44 }
45
51ef348b 46 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
3c2e0505
JV
47 &iph.daddr, uh->dest);
48 if (svc) {
190ecd27
JA
49 int ignored;
50
1da177e4
LT
51 if (ip_vs_todrop()) {
52 /*
53 * It seems that we are very loaded.
54 * We have to drop this packet :(
55 */
56 ip_vs_service_put(svc);
57 *verdict = NF_DROP;
58 return 0;
59 }
60
61 /*
62 * Let the virtual server select a real server for the
63 * incoming connection, and create a connection entry.
64 */
190ecd27
JA
65 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
66 if (!*cpp && !ignored) {
1da177e4
LT
67 *verdict = ip_vs_leave(svc, skb, pp);
68 return 0;
69 }
70 ip_vs_service_put(svc);
71 }
72 return 1;
73}
74
75
76static inline void
0bbdd42b
JV
77udp_fast_csum_update(int af, struct udphdr *uhdr,
78 const union nf_inet_addr *oldip,
79 const union nf_inet_addr *newip,
014d730d 80 __be16 oldport, __be16 newport)
1da177e4 81{
0bbdd42b
JV
82#ifdef CONFIG_IP_VS_IPV6
83 if (af == AF_INET6)
84 uhdr->check =
85 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
86 ip_vs_check_diff2(oldport, newport,
87 ~csum_unfold(uhdr->check))));
88 else
89#endif
90 uhdr->check =
91 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
92 ip_vs_check_diff2(oldport, newport,
93 ~csum_unfold(uhdr->check))));
1da177e4 94 if (!uhdr->check)
f6ab0288 95 uhdr->check = CSUM_MANGLED_0;
1da177e4
LT
96}
97
503e81f6
SH
98static inline void
99udp_partial_csum_update(int af, struct udphdr *uhdr,
100 const union nf_inet_addr *oldip,
101 const union nf_inet_addr *newip,
102 __be16 oldlen, __be16 newlen)
103{
104#ifdef CONFIG_IP_VS_IPV6
105 if (af == AF_INET6)
106 uhdr->check =
5bc9068e 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
503e81f6 108 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 109 csum_unfold(uhdr->check))));
503e81f6
SH
110 else
111#endif
112 uhdr->check =
5bc9068e 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
503e81f6 114 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 115 csum_unfold(uhdr->check))));
503e81f6
SH
116}
117
118
1da177e4 119static int
3db05fea 120udp_snat_handler(struct sk_buff *skb,
1da177e4
LT
121 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
122{
123 struct udphdr *udph;
0bbdd42b 124 unsigned int udphoff;
503e81f6 125 int oldlen;
8b27b10f 126 int payload_csum = 0;
0bbdd42b
JV
127
128#ifdef CONFIG_IP_VS_IPV6
129 if (cp->af == AF_INET6)
130 udphoff = sizeof(struct ipv6hdr);
131 else
132#endif
133 udphoff = ip_hdrlen(skb);
503e81f6 134 oldlen = skb->len - udphoff;
1da177e4
LT
135
136 /* csum_check requires unshared skb */
3db05fea 137 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
1da177e4
LT
138 return 0;
139
140 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
141 int ret;
142
1da177e4 143 /* Some checks before mangling */
0bbdd42b 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
145 return 0;
146
147 /*
148 * Call application helper if needed
149 */
8b27b10f 150 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
1da177e4 151 return 0;
8b27b10f
JA
152 /* ret=2: csum update is needed after payload mangling */
153 if (ret == 1)
154 oldlen = skb->len - udphoff;
155 else
156 payload_csum = 1;
1da177e4
LT
157 }
158
0bbdd42b 159 udph = (void *)skb_network_header(skb) + udphoff;
1da177e4
LT
160 udph->source = cp->vport;
161
162 /*
163 * Adjust UDP checksums
164 */
503e81f6
SH
165 if (skb->ip_summed == CHECKSUM_PARTIAL) {
166 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
ca62059b
HH
167 htons(oldlen),
168 htons(skb->len - udphoff));
8b27b10f 169 } else if (!payload_csum && (udph->check != 0)) {
1da177e4 170 /* Only port and addr are changed, do fast csum update */
0bbdd42b 171 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
1da177e4 172 cp->dport, cp->vport);
3db05fea 173 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
174 skb->ip_summed = (cp->app && pp->csum_check) ?
175 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
176 } else {
177 /* full checksum calculation */
178 udph->check = 0;
3db05fea 179 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
0bbdd42b
JV
180#ifdef CONFIG_IP_VS_IPV6
181 if (cp->af == AF_INET6)
182 udph->check = csum_ipv6_magic(&cp->vaddr.in6,
183 &cp->caddr.in6,
184 skb->len - udphoff,
185 cp->protocol, skb->csum);
186 else
187#endif
188 udph->check = csum_tcpudp_magic(cp->vaddr.ip,
189 cp->caddr.ip,
190 skb->len - udphoff,
191 cp->protocol,
192 skb->csum);
1da177e4 193 if (udph->check == 0)
f6ab0288 194 udph->check = CSUM_MANGLED_0;
8b27b10f 195 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4
LT
196 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
197 pp->name, udph->check,
198 (char*)&(udph->check) - (char*)udph);
199 }
200 return 1;
201}
202
203
204static int
3db05fea 205udp_dnat_handler(struct sk_buff *skb,
1da177e4
LT
206 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
207{
208 struct udphdr *udph;
0bbdd42b 209 unsigned int udphoff;
503e81f6 210 int oldlen;
8b27b10f 211 int payload_csum = 0;
0bbdd42b
JV
212
213#ifdef CONFIG_IP_VS_IPV6
214 if (cp->af == AF_INET6)
215 udphoff = sizeof(struct ipv6hdr);
216 else
217#endif
218 udphoff = ip_hdrlen(skb);
503e81f6 219 oldlen = skb->len - udphoff;
1da177e4
LT
220
221 /* csum_check requires unshared skb */
3db05fea 222 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
1da177e4
LT
223 return 0;
224
225 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
226 int ret;
227
1da177e4 228 /* Some checks before mangling */
0bbdd42b 229 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
230 return 0;
231
232 /*
233 * Attempt ip_vs_app call.
234 * It will fix ip_vs_conn
235 */
8b27b10f 236 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
1da177e4 237 return 0;
8b27b10f
JA
238 /* ret=2: csum update is needed after payload mangling */
239 if (ret == 1)
240 oldlen = skb->len - udphoff;
241 else
242 payload_csum = 1;
1da177e4
LT
243 }
244
0bbdd42b 245 udph = (void *)skb_network_header(skb) + udphoff;
1da177e4
LT
246 udph->dest = cp->dport;
247
248 /*
249 * Adjust UDP checksums
250 */
503e81f6 251 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5bc9068e 252 udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
ca62059b
HH
253 htons(oldlen),
254 htons(skb->len - udphoff));
8b27b10f 255 } else if (!payload_csum && (udph->check != 0)) {
1da177e4 256 /* Only port and addr are changed, do fast csum update */
0bbdd42b 257 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
1da177e4 258 cp->vport, cp->dport);
3db05fea 259 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
260 skb->ip_summed = (cp->app && pp->csum_check) ?
261 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
262 } else {
263 /* full checksum calculation */
264 udph->check = 0;
3db05fea 265 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
0bbdd42b
JV
266#ifdef CONFIG_IP_VS_IPV6
267 if (cp->af == AF_INET6)
268 udph->check = csum_ipv6_magic(&cp->caddr.in6,
269 &cp->daddr.in6,
270 skb->len - udphoff,
271 cp->protocol, skb->csum);
272 else
273#endif
274 udph->check = csum_tcpudp_magic(cp->caddr.ip,
275 cp->daddr.ip,
276 skb->len - udphoff,
277 cp->protocol,
278 skb->csum);
1da177e4 279 if (udph->check == 0)
f6ab0288 280 udph->check = CSUM_MANGLED_0;
3db05fea 281 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4
LT
282 }
283 return 1;
284}
285
286
287static int
51ef348b 288udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
1da177e4
LT
289{
290 struct udphdr _udph, *uh;
51ef348b
JV
291 unsigned int udphoff;
292
293#ifdef CONFIG_IP_VS_IPV6
294 if (af == AF_INET6)
295 udphoff = sizeof(struct ipv6hdr);
296 else
297#endif
298 udphoff = ip_hdrlen(skb);
1da177e4
LT
299
300 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
301 if (uh == NULL)
302 return 0;
303
304 if (uh->check != 0) {
305 switch (skb->ip_summed) {
306 case CHECKSUM_NONE:
307 skb->csum = skb_checksum(skb, udphoff,
308 skb->len - udphoff, 0);
84fa7933 309 case CHECKSUM_COMPLETE:
51ef348b
JV
310#ifdef CONFIG_IP_VS_IPV6
311 if (af == AF_INET6) {
312 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
313 &ipv6_hdr(skb)->daddr,
314 skb->len - udphoff,
315 ipv6_hdr(skb)->nexthdr,
316 skb->csum)) {
0d79641a 317 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
318 "Failed checksum for");
319 return 0;
320 }
321 } else
322#endif
323 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
324 ip_hdr(skb)->daddr,
325 skb->len - udphoff,
326 ip_hdr(skb)->protocol,
327 skb->csum)) {
0d79641a 328 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
329 "Failed checksum for");
330 return 0;
331 }
1da177e4
LT
332 break;
333 default:
84fa7933 334 /* No need to checksum. */
1da177e4
LT
335 break;
336 }
337 }
338 return 1;
339}
340
341
342/*
343 * Note: the caller guarantees that only one of register_app,
344 * unregister_app or app_conn_bind is called each time.
345 */
346
347#define UDP_APP_TAB_BITS 4
348#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
349#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
350
351static struct list_head udp_apps[UDP_APP_TAB_SIZE];
352static DEFINE_SPINLOCK(udp_app_lock);
353
75e7ce66 354static inline __u16 udp_app_hashkey(__be16 port)
1da177e4 355{
75e7ce66
AV
356 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
357 & UDP_APP_TAB_MASK;
1da177e4
LT
358}
359
360
361static int udp_register_app(struct ip_vs_app *inc)
362{
363 struct ip_vs_app *i;
75e7ce66
AV
364 __u16 hash;
365 __be16 port = inc->port;
1da177e4
LT
366 int ret = 0;
367
368 hash = udp_app_hashkey(port);
369
370
371 spin_lock_bh(&udp_app_lock);
372 list_for_each_entry(i, &udp_apps[hash], p_list) {
373 if (i->port == port) {
374 ret = -EEXIST;
375 goto out;
376 }
377 }
378 list_add(&inc->p_list, &udp_apps[hash]);
379 atomic_inc(&ip_vs_protocol_udp.appcnt);
380
381 out:
382 spin_unlock_bh(&udp_app_lock);
383 return ret;
384}
385
386
387static void
388udp_unregister_app(struct ip_vs_app *inc)
389{
390 spin_lock_bh(&udp_app_lock);
391 atomic_dec(&ip_vs_protocol_udp.appcnt);
392 list_del(&inc->p_list);
393 spin_unlock_bh(&udp_app_lock);
394}
395
396
397static int udp_app_conn_bind(struct ip_vs_conn *cp)
398{
399 int hash;
400 struct ip_vs_app *inc;
401 int result = 0;
402
403 /* Default binding: bind app only for NAT */
404 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
405 return 0;
406
407 /* Lookup application incarnations and bind the right one */
408 hash = udp_app_hashkey(cp->vport);
409
410 spin_lock(&udp_app_lock);
411 list_for_each_entry(inc, &udp_apps[hash], p_list) {
412 if (inc->port == cp->vport) {
413 if (unlikely(!ip_vs_app_inc_get(inc)))
414 break;
415 spin_unlock(&udp_app_lock);
416
1e3e238e 417 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
cfc78c5a
JV
418 "%s:%u to app %s on port %u\n",
419 __func__,
420 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
421 ntohs(cp->cport),
422 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
423 ntohs(cp->vport),
424 inc->name, ntohs(inc->port));
425
1da177e4
LT
426 cp->app = inc;
427 if (inc->init_conn)
428 result = inc->init_conn(inc, cp);
429 goto out;
430 }
431 }
432 spin_unlock(&udp_app_lock);
433
434 out:
435 return result;
436}
437
438
439static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
440 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
441 [IP_VS_UDP_S_LAST] = 2*HZ,
442};
443
36cbd3dc 444static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
1da177e4
LT
445 [IP_VS_UDP_S_NORMAL] = "UDP",
446 [IP_VS_UDP_S_LAST] = "BUG!",
447};
448
449
450static int
451udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
452{
453 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
454 udp_state_name_table, sname, to);
455}
456
457static const char * udp_state_name(int state)
458{
459 if (state >= IP_VS_UDP_S_LAST)
460 return "ERR!";
461 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
462}
463
464static int
465udp_state_transition(struct ip_vs_conn *cp, int direction,
466 const struct sk_buff *skb,
467 struct ip_vs_protocol *pp)
468{
469 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
470 return 1;
471}
472
473static void udp_init(struct ip_vs_protocol *pp)
474{
475 IP_VS_INIT_HASH_TABLE(udp_apps);
476 pp->timeout_table = udp_timeouts;
477}
478
/* Protocol exit hook: intentionally empty. */
static void udp_exit(struct ip_vs_protocol *pp)
{
	/* nothing to do */
}
482
483
484struct ip_vs_protocol ip_vs_protocol_udp = {
485 .name = "UDP",
486 .protocol = IPPROTO_UDP,
2ad17def 487 .num_states = IP_VS_UDP_S_LAST,
1da177e4
LT
488 .dont_defrag = 0,
489 .init = udp_init,
490 .exit = udp_exit,
491 .conn_schedule = udp_conn_schedule,
5c0d2374
SH
492 .conn_in_get = ip_vs_conn_in_get_proto,
493 .conn_out_get = ip_vs_conn_out_get_proto,
1da177e4
LT
494 .snat_handler = udp_snat_handler,
495 .dnat_handler = udp_dnat_handler,
496 .csum_check = udp_csum_check,
497 .state_transition = udp_state_transition,
498 .state_name = udp_state_name,
499 .register_app = udp_register_app,
500 .unregister_app = udp_unregister_app,
501 .app_conn_bind = udp_app_conn_bind,
502 .debug_packet = ip_vs_tcpudp_debug_packet,
503 .timeout_change = NULL,
504 .set_state_timeout = udp_set_state_timeout,
505};