]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/netfilter/ipvs/ip_vs_xmit.c
ipvs: optimize checksums for apps
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_xmit.c
CommitLineData
1da177e4
LT
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
1da177e4
LT
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
9aada7ac
HE
16#define KMSG_COMPONENT "IPVS"
17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
1da177e4 19#include <linux/kernel.h>
5a0e3ad6 20#include <linux/slab.h>
1da177e4 21#include <linux/tcp.h> /* for tcphdr */
c439cb2e 22#include <net/ip.h>
1da177e4
LT
23#include <net/tcp.h> /* for csum_tcpudp_magic */
24#include <net/udp.h>
25#include <net/icmp.h> /* for icmp_send */
26#include <net/route.h> /* for ip_route_output */
38cdcc9a
JV
27#include <net/ipv6.h>
28#include <net/ip6_route.h>
714f095f 29#include <net/addrconf.h>
38cdcc9a 30#include <linux/icmpv6.h>
1da177e4
LT
31#include <linux/netfilter.h>
32#include <linux/netfilter_ipv4.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Destination cache to speed up outgoing route lookup
39 */
40static inline void
714f095f
HS
41__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
42 u32 dst_cookie)
1da177e4
LT
43{
44 struct dst_entry *old_dst;
45
46 old_dst = dest->dst_cache;
47 dest->dst_cache = dst;
48 dest->dst_rtos = rtos;
714f095f 49 dest->dst_cookie = dst_cookie;
1da177e4
LT
50 dst_release(old_dst);
51}
52
53static inline struct dst_entry *
714f095f 54__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
1da177e4
LT
55{
56 struct dst_entry *dst = dest->dst_cache;
57
58 if (!dst)
59 return NULL;
714f095f
HS
60 if ((dst->obsolete || rtos != dest->dst_rtos) &&
61 dst->ops->check(dst, dest->dst_cookie) == NULL) {
1da177e4
LT
62 dest->dst_cache = NULL;
63 dst_release(dst);
64 return NULL;
65 }
66 dst_hold(dst);
67 return dst;
68}
69
ad1b30b1 70static struct rtable *
714f095f 71__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
1da177e4 72{
714f095f 73 struct net *net = dev_net(skb->dev);
1da177e4
LT
74 struct rtable *rt; /* Route to the other host */
75 struct ip_vs_dest *dest = cp->dest;
76
77 if (dest) {
78 spin_lock(&dest->dst_lock);
79 if (!(rt = (struct rtable *)
714f095f 80 __ip_vs_dst_check(dest, rtos))) {
1da177e4
LT
81 struct flowi fl = {
82 .oif = 0,
83 .nl_u = {
84 .ip4_u = {
e7ade46a 85 .daddr = dest->addr.ip,
1da177e4
LT
86 .saddr = 0,
87 .tos = rtos, } },
88 };
89
714f095f 90 if (ip_route_output_key(net, &rt, &fl)) {
1da177e4 91 spin_unlock(&dest->dst_lock);
14d5e834
HH
92 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
93 &dest->addr.ip);
1da177e4
LT
94 return NULL;
95 }
714f095f 96 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
14d5e834
HH
97 IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
98 &dest->addr.ip,
d8d1f30b 99 atomic_read(&rt->dst.__refcnt), rtos);
1da177e4
LT
100 }
101 spin_unlock(&dest->dst_lock);
102 } else {
103 struct flowi fl = {
104 .oif = 0,
105 .nl_u = {
106 .ip4_u = {
e7ade46a 107 .daddr = cp->daddr.ip,
1da177e4
LT
108 .saddr = 0,
109 .tos = rtos, } },
110 };
111
714f095f 112 if (ip_route_output_key(net, &rt, &fl)) {
14d5e834
HH
113 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
114 &cp->daddr.ip);
1da177e4
LT
115 return NULL;
116 }
117 }
118
119 return rt;
120}
121
38cdcc9a 122#ifdef CONFIG_IP_VS_IPV6
714f095f
HS
123
124static struct dst_entry *
125__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
126 struct in6_addr *ret_saddr, int do_xfrm)
127{
128 struct dst_entry *dst;
129 struct flowi fl = {
130 .oif = 0,
131 .nl_u = {
132 .ip6_u = {
133 .daddr = *daddr,
134 },
135 },
136 };
137
138 dst = ip6_route_output(net, NULL, &fl);
139 if (dst->error)
140 goto out_err;
141 if (!ret_saddr)
142 return dst;
143 if (ipv6_addr_any(&fl.fl6_src) &&
144 ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
145 &fl.fl6_dst, 0, &fl.fl6_src) < 0)
146 goto out_err;
147 if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
148 goto out_err;
149 ipv6_addr_copy(ret_saddr, &fl.fl6_src);
150 return dst;
151
152out_err:
153 dst_release(dst);
154 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
155 return NULL;
156}
157
38cdcc9a 158static struct rt6_info *
714f095f
HS
159__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
160 struct in6_addr *ret_saddr, int do_xfrm)
38cdcc9a 161{
714f095f 162 struct net *net = dev_net(skb->dev);
38cdcc9a
JV
163 struct rt6_info *rt; /* Route to the other host */
164 struct ip_vs_dest *dest = cp->dest;
714f095f 165 struct dst_entry *dst;
38cdcc9a
JV
166
167 if (dest) {
168 spin_lock(&dest->dst_lock);
714f095f 169 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
38cdcc9a 170 if (!rt) {
714f095f 171 u32 cookie;
38cdcc9a 172
714f095f
HS
173 dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
174 &dest->dst_saddr,
175 do_xfrm);
176 if (!dst) {
38cdcc9a 177 spin_unlock(&dest->dst_lock);
38cdcc9a
JV
178 return NULL;
179 }
714f095f
HS
180 rt = (struct rt6_info *) dst;
181 cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
182 __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
183 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
184 &dest->addr.in6, &dest->dst_saddr,
d8d1f30b 185 atomic_read(&rt->dst.__refcnt));
38cdcc9a 186 }
714f095f
HS
187 if (ret_saddr)
188 ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
38cdcc9a
JV
189 spin_unlock(&dest->dst_lock);
190 } else {
714f095f
HS
191 dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
192 do_xfrm);
193 if (!dst)
38cdcc9a 194 return NULL;
714f095f 195 rt = (struct rt6_info *) dst;
38cdcc9a
JV
196 }
197
198 return rt;
199}
200#endif
201
1da177e4
LT
202
203/*
204 * Release dest->dst_cache before a dest is removed
205 */
206void
207ip_vs_dst_reset(struct ip_vs_dest *dest)
208{
209 struct dst_entry *old_dst;
210
211 old_dst = dest->dst_cache;
212 dest->dst_cache = NULL;
213 dst_release(old_dst);
214}
215
f4bc17cd
JA
/*
 * Prepare an encapsulated skb for transmission and yield a netfilter
 * verdict.  For connections flagged IP_VS_CONN_F_NFCT the verdict comes
 * from ip_vs_confirm_conntrack(); otherwise it is NF_ACCEPT.  On
 * NF_ACCEPT the skb goes through nf_reset() and its checksum state is set
 * to CHECKSUM_NONE.  The caller acts on the resulting verdict.
 */
#define IP_VS_XMIT_TUNNEL(skb, cp)					\
({									\
	int __verdict;							\
									\
	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))			\
		__verdict = ip_vs_confirm_conntrack(skb, cp);		\
	else								\
		__verdict = NF_ACCEPT;					\
	if (__verdict == NF_ACCEPT) {					\
		nf_reset(skb);						\
		(skb)->ip_summed = CHECKSUM_NONE;			\
	}								\
	__verdict;							\
})
228
/*
 * Send a NAT-ed skb through the NF_INET_LOCAL_OUT hook of family @pf,
 * with dst_output as the continuation.  Connections without
 * IP_VS_CONN_F_NFCT get the skb marked as ipvs_property; for the others
 * ip_vs_update_conntrack() is called instead.
 */
#define IP_VS_XMIT_NAT(pf, skb, cp)					\
do {									\
	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) {		\
		(skb)->ipvs_property = 1;				\
	} else {							\
		ip_vs_update_conntrack(skb, cp, 1);			\
	}								\
	skb_forward_csum(skb);						\
	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,			\
		skb_dst(skb)->dev, dst_output);				\
} while (0)
239
/*
 * Send an skb through the NF_INET_LOCAL_OUT hook of family @pf, with
 * dst_output as the continuation.  The skb is marked as ipvs_property
 * unless the connection carries IP_VS_CONN_F_NFCT.
 */
#define IP_VS_XMIT(pf, skb, cp)						\
do {									\
	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) {		\
		(skb)->ipvs_property = 1;				\
	}								\
	skb_forward_csum(skb);						\
	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,			\
		skb_dst(skb)->dev, dst_output);				\
} while (0)
248
249
250/*
251 * NULL transmitter (do nothing except return NF_ACCEPT)
252 */
253int
254ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
255 struct ip_vs_protocol *pp)
256{
257 /* we do not touch skb and do not need pskb ptr */
258 return NF_ACCEPT;
259}
260
261
262/*
263 * Bypass transmitter
264 * Let packets bypass the destination when the destination is not
265 * available, it may be only used in transparent cache cluster.
266 */
267int
268ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
269 struct ip_vs_protocol *pp)
270{
714f095f 271 struct net *net = dev_net(skb->dev);
1da177e4 272 struct rtable *rt; /* Route to the other host */
eddc9ec5 273 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
274 u8 tos = iph->tos;
275 int mtu;
276 struct flowi fl = {
277 .oif = 0,
278 .nl_u = {
279 .ip4_u = {
280 .daddr = iph->daddr,
281 .saddr = 0,
282 .tos = RT_TOS(tos), } },
283 };
284
285 EnterFunction(10);
286
714f095f 287 if (ip_route_output_key(net, &rt, &fl)) {
1e3e238e
HE
288 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
289 __func__, &iph->daddr);
1da177e4
LT
290 goto tx_error_icmp;
291 }
292
293 /* MTU checking */
d8d1f30b 294 mtu = dst_mtu(&rt->dst);
4412ec49 295 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
1da177e4
LT
296 ip_rt_put(rt);
297 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
1e3e238e 298 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
299 goto tx_error;
300 }
301
302 /*
303 * Call ip_send_check because we are not sure it is called
304 * after ip_defrag. Is copy-on-write needed?
305 */
306 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
307 ip_rt_put(rt);
308 return NF_STOLEN;
309 }
eddc9ec5 310 ip_send_check(ip_hdr(skb));
1da177e4
LT
311
312 /* drop old route */
adf30907 313 skb_dst_drop(skb);
d8d1f30b 314 skb_dst_set(skb, &rt->dst);
1da177e4
LT
315
316 /* Another hack: avoid icmp_send in ip_fragment */
317 skb->local_df = 1;
318
f4bc17cd 319 IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
1da177e4
LT
320
321 LeaveFunction(10);
322 return NF_STOLEN;
323
324 tx_error_icmp:
325 dst_link_failure(skb);
326 tx_error:
327 kfree_skb(skb);
328 LeaveFunction(10);
329 return NF_STOLEN;
330}
331
b3cdd2a7
JV
332#ifdef CONFIG_IP_VS_IPV6
333int
334ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
335 struct ip_vs_protocol *pp)
336{
714f095f
HS
337 struct net *net = dev_net(skb->dev);
338 struct dst_entry *dst;
b3cdd2a7
JV
339 struct rt6_info *rt; /* Route to the other host */
340 struct ipv6hdr *iph = ipv6_hdr(skb);
341 int mtu;
b3cdd2a7
JV
342
343 EnterFunction(10);
344
714f095f
HS
345 dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
346 if (!dst)
b3cdd2a7 347 goto tx_error_icmp;
714f095f 348 rt = (struct rt6_info *) dst;
b3cdd2a7
JV
349
350 /* MTU checking */
d8d1f30b 351 mtu = dst_mtu(&rt->dst);
b3cdd2a7 352 if (skb->len > mtu) {
d8d1f30b 353 dst_release(&rt->dst);
3ffe533c 354 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1e3e238e 355 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
356 goto tx_error;
357 }
358
359 /*
360 * Call ip_send_check because we are not sure it is called
361 * after ip_defrag. Is copy-on-write needed?
362 */
363 skb = skb_share_check(skb, GFP_ATOMIC);
364 if (unlikely(skb == NULL)) {
d8d1f30b 365 dst_release(&rt->dst);
b3cdd2a7
JV
366 return NF_STOLEN;
367 }
368
369 /* drop old route */
adf30907 370 skb_dst_drop(skb);
d8d1f30b 371 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
372
373 /* Another hack: avoid icmp_send in ip_fragment */
374 skb->local_df = 1;
375
f4bc17cd 376 IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
b3cdd2a7
JV
377
378 LeaveFunction(10);
379 return NF_STOLEN;
380
381 tx_error_icmp:
382 dst_link_failure(skb);
383 tx_error:
384 kfree_skb(skb);
385 LeaveFunction(10);
386 return NF_STOLEN;
387}
388#endif
1da177e4
LT
389
390/*
391 * NAT transmitter (only for outside-to-inside nat forwarding)
392 * Not used for related ICMP
393 */
394int
395ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
396 struct ip_vs_protocol *pp)
397{
398 struct rtable *rt; /* Route to the other host */
399 int mtu;
eddc9ec5 400 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
401
402 EnterFunction(10);
403
404 /* check if it is a connection of no-client-port */
405 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
014d730d 406 __be16 _pt, *p;
1da177e4
LT
407 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
408 if (p == NULL)
409 goto tx_error;
410 ip_vs_conn_fill_cport(cp, *p);
411 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
412 }
413
714f095f 414 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
1da177e4
LT
415 goto tx_error_icmp;
416
417 /* MTU checking */
d8d1f30b 418 mtu = dst_mtu(&rt->dst);
4412ec49 419 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
1da177e4
LT
420 ip_rt_put(rt);
421 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
422 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
423 goto tx_error;
424 }
425
426 /* copy-on-write the packet before mangling it */
af1e1cf0 427 if (!skb_make_writable(skb, sizeof(struct iphdr)))
1da177e4
LT
428 goto tx_error_put;
429
d8d1f30b 430 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1da177e4
LT
431 goto tx_error_put;
432
433 /* drop old route */
adf30907 434 skb_dst_drop(skb);
d8d1f30b 435 skb_dst_set(skb, &rt->dst);
1da177e4
LT
436
437 /* mangle the packet */
3db05fea 438 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
1da177e4 439 goto tx_error;
e7ade46a 440 ip_hdr(skb)->daddr = cp->daddr.ip;
eddc9ec5 441 ip_send_check(ip_hdr(skb));
1da177e4
LT
442
443 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
444
445 /* FIXME: when application helper enlarges the packet and the length
446 is larger than the MTU of outgoing device, there will be still
447 MTU problem. */
448
449 /* Another hack: avoid icmp_send in ip_fragment */
450 skb->local_df = 1;
451
f4bc17cd 452 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
1da177e4
LT
453
454 LeaveFunction(10);
455 return NF_STOLEN;
456
457 tx_error_icmp:
458 dst_link_failure(skb);
459 tx_error:
1da177e4 460 kfree_skb(skb);
f4bc17cd 461 LeaveFunction(10);
1da177e4
LT
462 return NF_STOLEN;
463 tx_error_put:
464 ip_rt_put(rt);
465 goto tx_error;
466}
467
b3cdd2a7
JV
468#ifdef CONFIG_IP_VS_IPV6
469int
470ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
471 struct ip_vs_protocol *pp)
472{
473 struct rt6_info *rt; /* Route to the other host */
474 int mtu;
475
476 EnterFunction(10);
477
478 /* check if it is a connection of no-client-port */
479 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
480 __be16 _pt, *p;
481 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
482 sizeof(_pt), &_pt);
483 if (p == NULL)
484 goto tx_error;
485 ip_vs_conn_fill_cport(cp, *p);
486 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
487 }
488
714f095f 489 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
b3cdd2a7
JV
490 if (!rt)
491 goto tx_error_icmp;
492
493 /* MTU checking */
d8d1f30b 494 mtu = dst_mtu(&rt->dst);
b3cdd2a7 495 if (skb->len > mtu) {
d8d1f30b 496 dst_release(&rt->dst);
3ffe533c 497 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
b3cdd2a7
JV
498 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
499 "ip_vs_nat_xmit_v6(): frag needed for");
500 goto tx_error;
501 }
502
503 /* copy-on-write the packet before mangling it */
504 if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
505 goto tx_error_put;
506
d8d1f30b 507 if (skb_cow(skb, rt->dst.dev->hard_header_len))
b3cdd2a7
JV
508 goto tx_error_put;
509
510 /* drop old route */
adf30907 511 skb_dst_drop(skb);
d8d1f30b 512 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
513
514 /* mangle the packet */
515 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
516 goto tx_error;
517 ipv6_hdr(skb)->daddr = cp->daddr.in6;
518
519 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
520
521 /* FIXME: when application helper enlarges the packet and the length
522 is larger than the MTU of outgoing device, there will be still
523 MTU problem. */
524
525 /* Another hack: avoid icmp_send in ip_fragment */
526 skb->local_df = 1;
527
f4bc17cd 528 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
b3cdd2a7
JV
529
530 LeaveFunction(10);
531 return NF_STOLEN;
532
533tx_error_icmp:
534 dst_link_failure(skb);
535tx_error:
536 LeaveFunction(10);
537 kfree_skb(skb);
538 return NF_STOLEN;
539tx_error_put:
d8d1f30b 540 dst_release(&rt->dst);
b3cdd2a7
JV
541 goto tx_error;
542}
543#endif
544
1da177e4
LT
545
546/*
547 * IP Tunneling transmitter
548 *
549 * This function encapsulates the packet in a new IP packet, its
550 * destination will be set to cp->daddr. Most code of this function
551 * is taken from ipip.c.
552 *
553 * It is used in VS/TUN cluster. The load balancer selects a real
554 * server from a cluster based on a scheduling algorithm,
555 * encapsulates the request packet and forwards it to the selected
556 * server. For example, all real servers are configured with
557 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
558 * the encapsulated packet, it will decapsulate the packet, processe
559 * the request and return the response packets directly to the client
560 * without passing the load balancer. This can greatly increase the
561 * scalability of virtual server.
562 *
563 * Used for ANY protocol
564 */
565int
566ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
567 struct ip_vs_protocol *pp)
568{
569 struct rtable *rt; /* Route to the other host */
570 struct net_device *tdev; /* Device to other host */
eddc9ec5 571 struct iphdr *old_iph = ip_hdr(skb);
1da177e4 572 u8 tos = old_iph->tos;
76ab608d 573 __be16 df = old_iph->frag_off;
1da177e4 574 struct iphdr *iph; /* Our new IP header */
c2636b4d 575 unsigned int max_headroom; /* The extra header space needed */
1da177e4 576 int mtu;
f4bc17cd 577 int ret;
1da177e4
LT
578
579 EnterFunction(10);
580
4412ec49 581 if (skb->protocol != htons(ETH_P_IP)) {
1e3e238e 582 IP_VS_DBG_RL("%s(): protocol error, "
1da177e4 583 "ETH_P_IP: %d, skb protocol: %d\n",
1e3e238e 584 __func__, htons(ETH_P_IP), skb->protocol);
1da177e4
LT
585 goto tx_error;
586 }
587
714f095f 588 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
1da177e4
LT
589 goto tx_error_icmp;
590
d8d1f30b 591 tdev = rt->dst.dev;
1da177e4 592
d8d1f30b 593 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
1da177e4
LT
594 if (mtu < 68) {
595 ip_rt_put(rt);
1e3e238e 596 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
1da177e4
LT
597 goto tx_error;
598 }
adf30907
ED
599 if (skb_dst(skb))
600 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1da177e4 601
4412ec49 602 df |= (old_iph->frag_off & htons(IP_DF));
1da177e4 603
4412ec49 604 if ((old_iph->frag_off & htons(IP_DF))
1da177e4
LT
605 && mtu < ntohs(old_iph->tot_len)) {
606 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
607 ip_rt_put(rt);
1e3e238e 608 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
609 goto tx_error;
610 }
611
612 /*
613 * Okay, now see if we can stuff it in the buffer as-is.
614 */
615 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
616
617 if (skb_headroom(skb) < max_headroom
618 || skb_cloned(skb) || skb_shared(skb)) {
619 struct sk_buff *new_skb =
620 skb_realloc_headroom(skb, max_headroom);
621 if (!new_skb) {
622 ip_rt_put(rt);
623 kfree_skb(skb);
1e3e238e 624 IP_VS_ERR_RL("%s(): no memory\n", __func__);
1da177e4
LT
625 return NF_STOLEN;
626 }
627 kfree_skb(skb);
628 skb = new_skb;
eddc9ec5 629 old_iph = ip_hdr(skb);
1da177e4
LT
630 }
631
714f095f 632 skb->transport_header = skb->network_header;
1da177e4
LT
633
634 /* fix old IP header checksum */
635 ip_send_check(old_iph);
636
e2d1bca7
ACM
637 skb_push(skb, sizeof(struct iphdr));
638 skb_reset_network_header(skb);
1da177e4
LT
639 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
640
641 /* drop old route */
adf30907 642 skb_dst_drop(skb);
d8d1f30b 643 skb_dst_set(skb, &rt->dst);
1da177e4
LT
644
645 /*
646 * Push down and install the IPIP header.
647 */
eddc9ec5 648 iph = ip_hdr(skb);
1da177e4
LT
649 iph->version = 4;
650 iph->ihl = sizeof(struct iphdr)>>2;
651 iph->frag_off = df;
652 iph->protocol = IPPROTO_IPIP;
653 iph->tos = tos;
654 iph->daddr = rt->rt_dst;
655 iph->saddr = rt->rt_src;
656 iph->ttl = old_iph->ttl;
d8d1f30b 657 ip_select_ident(iph, &rt->dst, NULL);
1da177e4
LT
658
659 /* Another hack: avoid icmp_send in ip_fragment */
660 skb->local_df = 1;
661
f4bc17cd
JA
662 ret = IP_VS_XMIT_TUNNEL(skb, cp);
663 if (ret == NF_ACCEPT)
664 ip_local_out(skb);
665 else if (ret == NF_DROP)
666 kfree_skb(skb);
1da177e4
LT
667
668 LeaveFunction(10);
669
670 return NF_STOLEN;
671
672 tx_error_icmp:
673 dst_link_failure(skb);
674 tx_error:
675 kfree_skb(skb);
676 LeaveFunction(10);
677 return NF_STOLEN;
678}
679
b3cdd2a7
JV
680#ifdef CONFIG_IP_VS_IPV6
681int
682ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
683 struct ip_vs_protocol *pp)
684{
685 struct rt6_info *rt; /* Route to the other host */
714f095f 686 struct in6_addr saddr; /* Source for tunnel */
b3cdd2a7
JV
687 struct net_device *tdev; /* Device to other host */
688 struct ipv6hdr *old_iph = ipv6_hdr(skb);
b3cdd2a7
JV
689 struct ipv6hdr *iph; /* Our new IP header */
690 unsigned int max_headroom; /* The extra header space needed */
691 int mtu;
f4bc17cd 692 int ret;
b3cdd2a7
JV
693
694 EnterFunction(10);
695
696 if (skb->protocol != htons(ETH_P_IPV6)) {
1e3e238e 697 IP_VS_DBG_RL("%s(): protocol error, "
b3cdd2a7 698 "ETH_P_IPV6: %d, skb protocol: %d\n",
1e3e238e 699 __func__, htons(ETH_P_IPV6), skb->protocol);
b3cdd2a7
JV
700 goto tx_error;
701 }
702
714f095f 703 rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
b3cdd2a7
JV
704 if (!rt)
705 goto tx_error_icmp;
706
d8d1f30b 707 tdev = rt->dst.dev;
b3cdd2a7 708
d8d1f30b 709 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
714f095f 710 if (mtu < IPV6_MIN_MTU) {
d8d1f30b 711 dst_release(&rt->dst);
714f095f
HS
712 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
713 IPV6_MIN_MTU);
b3cdd2a7
JV
714 goto tx_error;
715 }
adf30907
ED
716 if (skb_dst(skb))
717 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
b3cdd2a7
JV
718
719 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
3ffe533c 720 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
d8d1f30b 721 dst_release(&rt->dst);
1e3e238e 722 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
723 goto tx_error;
724 }
725
726 /*
727 * Okay, now see if we can stuff it in the buffer as-is.
728 */
729 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
730
731 if (skb_headroom(skb) < max_headroom
732 || skb_cloned(skb) || skb_shared(skb)) {
733 struct sk_buff *new_skb =
734 skb_realloc_headroom(skb, max_headroom);
735 if (!new_skb) {
d8d1f30b 736 dst_release(&rt->dst);
b3cdd2a7 737 kfree_skb(skb);
1e3e238e 738 IP_VS_ERR_RL("%s(): no memory\n", __func__);
b3cdd2a7
JV
739 return NF_STOLEN;
740 }
741 kfree_skb(skb);
742 skb = new_skb;
743 old_iph = ipv6_hdr(skb);
744 }
745
714f095f 746 skb->transport_header = skb->network_header;
b3cdd2a7
JV
747
748 skb_push(skb, sizeof(struct ipv6hdr));
749 skb_reset_network_header(skb);
750 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
751
752 /* drop old route */
adf30907 753 skb_dst_drop(skb);
d8d1f30b 754 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
755
756 /*
757 * Push down and install the IPIP header.
758 */
759 iph = ipv6_hdr(skb);
760 iph->version = 6;
761 iph->nexthdr = IPPROTO_IPV6;
b7b45f47
HH
762 iph->payload_len = old_iph->payload_len;
763 be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
b3cdd2a7
JV
764 iph->priority = old_iph->priority;
765 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
714f095f
HS
766 ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
767 ipv6_addr_copy(&iph->saddr, &saddr);
b3cdd2a7
JV
768 iph->hop_limit = old_iph->hop_limit;
769
770 /* Another hack: avoid icmp_send in ip_fragment */
771 skb->local_df = 1;
772
f4bc17cd
JA
773 ret = IP_VS_XMIT_TUNNEL(skb, cp);
774 if (ret == NF_ACCEPT)
775 ip6_local_out(skb);
776 else if (ret == NF_DROP)
777 kfree_skb(skb);
b3cdd2a7
JV
778
779 LeaveFunction(10);
780
781 return NF_STOLEN;
782
783tx_error_icmp:
784 dst_link_failure(skb);
785tx_error:
786 kfree_skb(skb);
787 LeaveFunction(10);
788 return NF_STOLEN;
789}
790#endif
791
1da177e4
LT
792
793/*
794 * Direct Routing transmitter
795 * Used for ANY protocol
796 */
797int
798ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
799 struct ip_vs_protocol *pp)
800{
801 struct rtable *rt; /* Route to the other host */
eddc9ec5 802 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
803 int mtu;
804
805 EnterFunction(10);
806
714f095f 807 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
1da177e4
LT
808 goto tx_error_icmp;
809
810 /* MTU checking */
d8d1f30b 811 mtu = dst_mtu(&rt->dst);
4412ec49 812 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
1da177e4
LT
813 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
814 ip_rt_put(rt);
1e3e238e 815 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
816 goto tx_error;
817 }
818
819 /*
820 * Call ip_send_check because we are not sure it is called
821 * after ip_defrag. Is copy-on-write needed?
822 */
823 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
824 ip_rt_put(rt);
825 return NF_STOLEN;
826 }
eddc9ec5 827 ip_send_check(ip_hdr(skb));
1da177e4
LT
828
829 /* drop old route */
adf30907 830 skb_dst_drop(skb);
d8d1f30b 831 skb_dst_set(skb, &rt->dst);
1da177e4
LT
832
833 /* Another hack: avoid icmp_send in ip_fragment */
834 skb->local_df = 1;
835
f4bc17cd 836 IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
1da177e4
LT
837
838 LeaveFunction(10);
839 return NF_STOLEN;
840
841 tx_error_icmp:
842 dst_link_failure(skb);
843 tx_error:
844 kfree_skb(skb);
845 LeaveFunction(10);
846 return NF_STOLEN;
847}
848
b3cdd2a7
JV
849#ifdef CONFIG_IP_VS_IPV6
850int
851ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
852 struct ip_vs_protocol *pp)
853{
854 struct rt6_info *rt; /* Route to the other host */
855 int mtu;
856
857 EnterFunction(10);
858
714f095f 859 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
b3cdd2a7
JV
860 if (!rt)
861 goto tx_error_icmp;
862
863 /* MTU checking */
d8d1f30b 864 mtu = dst_mtu(&rt->dst);
b3cdd2a7 865 if (skb->len > mtu) {
3ffe533c 866 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
d8d1f30b 867 dst_release(&rt->dst);
1e3e238e 868 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
869 goto tx_error;
870 }
871
872 /*
873 * Call ip_send_check because we are not sure it is called
874 * after ip_defrag. Is copy-on-write needed?
875 */
876 skb = skb_share_check(skb, GFP_ATOMIC);
877 if (unlikely(skb == NULL)) {
d8d1f30b 878 dst_release(&rt->dst);
b3cdd2a7
JV
879 return NF_STOLEN;
880 }
881
882 /* drop old route */
adf30907 883 skb_dst_drop(skb);
d8d1f30b 884 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
885
886 /* Another hack: avoid icmp_send in ip_fragment */
887 skb->local_df = 1;
888
f4bc17cd 889 IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
b3cdd2a7
JV
890
891 LeaveFunction(10);
892 return NF_STOLEN;
893
894tx_error_icmp:
895 dst_link_failure(skb);
896tx_error:
897 kfree_skb(skb);
898 LeaveFunction(10);
899 return NF_STOLEN;
900}
901#endif
902
1da177e4
LT
903
904/*
905 * ICMP packet transmitter
906 * called by the ip_vs_in_icmp
907 */
908int
909ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
910 struct ip_vs_protocol *pp, int offset)
911{
912 struct rtable *rt; /* Route to the other host */
913 int mtu;
914 int rc;
915
916 EnterFunction(10);
917
918 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
919 forwarded directly here, because there is no need to
920 translate address/port back */
921 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
922 if (cp->packet_xmit)
923 rc = cp->packet_xmit(skb, cp, pp);
924 else
925 rc = NF_ACCEPT;
926 /* do not touch skb anymore */
927 atomic_inc(&cp->in_pkts);
1da177e4
LT
928 goto out;
929 }
930
931 /*
932 * mangle and send the packet here (only for VS/NAT)
933 */
934
714f095f 935 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
1da177e4
LT
936 goto tx_error_icmp;
937
938 /* MTU checking */
d8d1f30b 939 mtu = dst_mtu(&rt->dst);
eddc9ec5 940 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
1da177e4
LT
941 ip_rt_put(rt);
942 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1e3e238e 943 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
944 goto tx_error;
945 }
946
947 /* copy-on-write the packet before mangling it */
af1e1cf0 948 if (!skb_make_writable(skb, offset))
1da177e4
LT
949 goto tx_error_put;
950
d8d1f30b 951 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1da177e4
LT
952 goto tx_error_put;
953
954 /* drop the old route when skb is not shared */
adf30907 955 skb_dst_drop(skb);
d8d1f30b 956 skb_dst_set(skb, &rt->dst);
1da177e4
LT
957
958 ip_vs_nat_icmp(skb, pp, cp, 0);
959
960 /* Another hack: avoid icmp_send in ip_fragment */
961 skb->local_df = 1;
962
f4bc17cd 963 IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
1da177e4
LT
964
965 rc = NF_STOLEN;
966 goto out;
967
968 tx_error_icmp:
969 dst_link_failure(skb);
970 tx_error:
971 dev_kfree_skb(skb);
972 rc = NF_STOLEN;
973 out:
974 LeaveFunction(10);
975 return rc;
976 tx_error_put:
977 ip_rt_put(rt);
978 goto tx_error;
979}
b3cdd2a7
JV
980
981#ifdef CONFIG_IP_VS_IPV6
982int
983ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
984 struct ip_vs_protocol *pp, int offset)
985{
986 struct rt6_info *rt; /* Route to the other host */
987 int mtu;
988 int rc;
989
990 EnterFunction(10);
991
992 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
993 forwarded directly here, because there is no need to
994 translate address/port back */
995 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
996 if (cp->packet_xmit)
997 rc = cp->packet_xmit(skb, cp, pp);
998 else
999 rc = NF_ACCEPT;
1000 /* do not touch skb anymore */
1001 atomic_inc(&cp->in_pkts);
1002 goto out;
1003 }
1004
1005 /*
1006 * mangle and send the packet here (only for VS/NAT)
1007 */
1008
714f095f 1009 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
b3cdd2a7
JV
1010 if (!rt)
1011 goto tx_error_icmp;
1012
1013 /* MTU checking */
d8d1f30b 1014 mtu = dst_mtu(&rt->dst);
b3cdd2a7 1015 if (skb->len > mtu) {
d8d1f30b 1016 dst_release(&rt->dst);
3ffe533c 1017 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1e3e238e 1018 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
1019 goto tx_error;
1020 }
1021
1022 /* copy-on-write the packet before mangling it */
1023 if (!skb_make_writable(skb, offset))
1024 goto tx_error_put;
1025
d8d1f30b 1026 if (skb_cow(skb, rt->dst.dev->hard_header_len))
b3cdd2a7
JV
1027 goto tx_error_put;
1028
1029 /* drop the old route when skb is not shared */
adf30907 1030 skb_dst_drop(skb);
d8d1f30b 1031 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
1032
1033 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1034
1035 /* Another hack: avoid icmp_send in ip_fragment */
1036 skb->local_df = 1;
1037
f4bc17cd 1038 IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
b3cdd2a7
JV
1039
1040 rc = NF_STOLEN;
1041 goto out;
1042
1043tx_error_icmp:
1044 dst_link_failure(skb);
1045tx_error:
1046 dev_kfree_skb(skb);
1047 rc = NF_STOLEN;
1048out:
1049 LeaveFunction(10);
1050 return rc;
1051tx_error_put:
d8d1f30b 1052 dst_release(&rt->dst);
b3cdd2a7
JV
1053 goto tx_error;
1054}
1055#endif