]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/netfilter/ipvs/ip_vs_xmit.c
ipvs: extend connection flags to 32 bits
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_xmit.c
CommitLineData
1da177e4
LT
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
1da177e4
LT
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
9aada7ac
HE
16#define KMSG_COMPONENT "IPVS"
17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
1da177e4 19#include <linux/kernel.h>
5a0e3ad6 20#include <linux/slab.h>
1da177e4 21#include <linux/tcp.h> /* for tcphdr */
c439cb2e 22#include <net/ip.h>
1da177e4
LT
23#include <net/tcp.h> /* for csum_tcpudp_magic */
24#include <net/udp.h>
25#include <net/icmp.h> /* for icmp_send */
26#include <net/route.h> /* for ip_route_output */
38cdcc9a
JV
27#include <net/ipv6.h>
28#include <net/ip6_route.h>
29#include <linux/icmpv6.h>
1da177e4 30#include <linux/netfilter.h>
7b215ffc 31#include <net/netfilter/nf_conntrack.h>
1da177e4
LT
32#include <linux/netfilter_ipv4.h>
33
34#include <net/ip_vs.h>
35
36
37/*
38 * Destination cache to speed up outgoing route lookup
39 */
40static inline void
41__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
42{
43 struct dst_entry *old_dst;
44
45 old_dst = dest->dst_cache;
46 dest->dst_cache = dst;
47 dest->dst_rtos = rtos;
48 dst_release(old_dst);
49}
50
51static inline struct dst_entry *
52__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
53{
54 struct dst_entry *dst = dest->dst_cache;
55
56 if (!dst)
57 return NULL;
38cdcc9a
JV
58 if ((dst->obsolete
59 || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
1da177e4
LT
60 dst->ops->check(dst, cookie) == NULL) {
61 dest->dst_cache = NULL;
62 dst_release(dst);
63 return NULL;
64 }
65 dst_hold(dst);
66 return dst;
67}
68
ad1b30b1 69static struct rtable *
1da177e4
LT
70__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
71{
72 struct rtable *rt; /* Route to the other host */
73 struct ip_vs_dest *dest = cp->dest;
74
75 if (dest) {
76 spin_lock(&dest->dst_lock);
77 if (!(rt = (struct rtable *)
78 __ip_vs_dst_check(dest, rtos, 0))) {
79 struct flowi fl = {
80 .oif = 0,
81 .nl_u = {
82 .ip4_u = {
e7ade46a 83 .daddr = dest->addr.ip,
1da177e4
LT
84 .saddr = 0,
85 .tos = rtos, } },
86 };
87
f206351a 88 if (ip_route_output_key(&init_net, &rt, &fl)) {
1da177e4 89 spin_unlock(&dest->dst_lock);
14d5e834
HH
90 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
91 &dest->addr.ip);
1da177e4
LT
92 return NULL;
93 }
d8d1f30b 94 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
14d5e834
HH
95 IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
96 &dest->addr.ip,
d8d1f30b 97 atomic_read(&rt->dst.__refcnt), rtos);
1da177e4
LT
98 }
99 spin_unlock(&dest->dst_lock);
100 } else {
101 struct flowi fl = {
102 .oif = 0,
103 .nl_u = {
104 .ip4_u = {
e7ade46a 105 .daddr = cp->daddr.ip,
1da177e4
LT
106 .saddr = 0,
107 .tos = rtos, } },
108 };
109
f206351a 110 if (ip_route_output_key(&init_net, &rt, &fl)) {
14d5e834
HH
111 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
112 &cp->daddr.ip);
1da177e4
LT
113 return NULL;
114 }
115 }
116
117 return rt;
118}
119
38cdcc9a
JV
120#ifdef CONFIG_IP_VS_IPV6
121static struct rt6_info *
122__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
123{
124 struct rt6_info *rt; /* Route to the other host */
125 struct ip_vs_dest *dest = cp->dest;
126
127 if (dest) {
128 spin_lock(&dest->dst_lock);
129 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
130 if (!rt) {
131 struct flowi fl = {
132 .oif = 0,
133 .nl_u = {
134 .ip6_u = {
135 .daddr = dest->addr.in6,
136 .saddr = {
137 .s6_addr32 =
138 { 0, 0, 0, 0 },
139 },
140 },
141 },
142 };
143
144 rt = (struct rt6_info *)ip6_route_output(&init_net,
145 NULL, &fl);
146 if (!rt) {
147 spin_unlock(&dest->dst_lock);
5b095d98 148 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
38ff4fa4 149 &dest->addr.in6);
38cdcc9a
JV
150 return NULL;
151 }
d8d1f30b 152 __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
5b095d98 153 IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
38ff4fa4 154 &dest->addr.in6,
d8d1f30b 155 atomic_read(&rt->dst.__refcnt));
38cdcc9a
JV
156 }
157 spin_unlock(&dest->dst_lock);
158 } else {
159 struct flowi fl = {
160 .oif = 0,
161 .nl_u = {
162 .ip6_u = {
163 .daddr = cp->daddr.in6,
164 .saddr = {
165 .s6_addr32 = { 0, 0, 0, 0 },
166 },
167 },
168 },
169 };
170
171 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
172 if (!rt) {
5b095d98 173 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
38ff4fa4 174 &cp->daddr.in6);
38cdcc9a
JV
175 return NULL;
176 }
177 }
178
179 return rt;
180}
181#endif
182
1da177e4
LT
183
184/*
185 * Release dest->dst_cache before a dest is removed
186 */
187void
188ip_vs_dst_reset(struct ip_vs_dest *dest)
189{
190 struct dst_entry *old_dst;
191
192 old_dst = dest->dst_cache;
193 dest->dst_cache = NULL;
194 dst_release(old_dst);
195}
196
/*
 *	Common tail of the transmitters: set skb->ipvs_property, call
 *	skb_forward_csum() and pass the packet through the NF_INET_LOCAL_OUT
 *	hook of protocol family @pf, with the device of @rt as output device
 *	and dst_output as the final function.
 *
 *	@skb is bound to a local first so that the macro argument is
 *	evaluated exactly once.
 */
#define IP_VS_XMIT(pf, skb, rt)					\
do {								\
	struct sk_buff *_ipvs_skb = (skb);			\
								\
	_ipvs_skb->ipvs_property = 1;				\
	skb_forward_csum(_ipvs_skb);				\
	NF_HOOK(pf, NF_INET_LOCAL_OUT, _ipvs_skb, NULL,		\
		(rt)->dst.dev, dst_output);			\
} while (0)
204
205
206/*
207 * NULL transmitter (do nothing except return NF_ACCEPT)
208 */
209int
210ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
211 struct ip_vs_protocol *pp)
212{
213 /* we do not touch skb and do not need pskb ptr */
214 return NF_ACCEPT;
215}
216
217
218/*
219 * Bypass transmitter
220 * Let packets bypass the destination when the destination is not
221 * available, it may be only used in transparent cache cluster.
222 */
223int
224ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
225 struct ip_vs_protocol *pp)
226{
227 struct rtable *rt; /* Route to the other host */
eddc9ec5 228 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
229 u8 tos = iph->tos;
230 int mtu;
231 struct flowi fl = {
232 .oif = 0,
233 .nl_u = {
234 .ip4_u = {
235 .daddr = iph->daddr,
236 .saddr = 0,
237 .tos = RT_TOS(tos), } },
238 };
239
240 EnterFunction(10);
241
f206351a 242 if (ip_route_output_key(&init_net, &rt, &fl)) {
1e3e238e
HE
243 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
244 __func__, &iph->daddr);
1da177e4
LT
245 goto tx_error_icmp;
246 }
247
248 /* MTU checking */
d8d1f30b 249 mtu = dst_mtu(&rt->dst);
4412ec49 250 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
1da177e4
LT
251 ip_rt_put(rt);
252 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
1e3e238e 253 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
254 goto tx_error;
255 }
256
257 /*
258 * Call ip_send_check because we are not sure it is called
259 * after ip_defrag. Is copy-on-write needed?
260 */
261 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
262 ip_rt_put(rt);
263 return NF_STOLEN;
264 }
eddc9ec5 265 ip_send_check(ip_hdr(skb));
1da177e4
LT
266
267 /* drop old route */
adf30907 268 skb_dst_drop(skb);
d8d1f30b 269 skb_dst_set(skb, &rt->dst);
1da177e4
LT
270
271 /* Another hack: avoid icmp_send in ip_fragment */
272 skb->local_df = 1;
273
7911b5c7 274 IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
1da177e4
LT
275
276 LeaveFunction(10);
277 return NF_STOLEN;
278
279 tx_error_icmp:
280 dst_link_failure(skb);
281 tx_error:
282 kfree_skb(skb);
283 LeaveFunction(10);
284 return NF_STOLEN;
285}
286
b3cdd2a7
JV
287#ifdef CONFIG_IP_VS_IPV6
288int
289ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
290 struct ip_vs_protocol *pp)
291{
292 struct rt6_info *rt; /* Route to the other host */
293 struct ipv6hdr *iph = ipv6_hdr(skb);
294 int mtu;
295 struct flowi fl = {
296 .oif = 0,
297 .nl_u = {
298 .ip6_u = {
299 .daddr = iph->daddr,
300 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
301 };
302
303 EnterFunction(10);
304
305 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
306 if (!rt) {
1e3e238e
HE
307 IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
308 __func__, &iph->daddr);
b3cdd2a7
JV
309 goto tx_error_icmp;
310 }
311
312 /* MTU checking */
d8d1f30b 313 mtu = dst_mtu(&rt->dst);
b3cdd2a7 314 if (skb->len > mtu) {
d8d1f30b 315 dst_release(&rt->dst);
3ffe533c 316 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1e3e238e 317 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
318 goto tx_error;
319 }
320
321 /*
322 * Call ip_send_check because we are not sure it is called
323 * after ip_defrag. Is copy-on-write needed?
324 */
325 skb = skb_share_check(skb, GFP_ATOMIC);
326 if (unlikely(skb == NULL)) {
d8d1f30b 327 dst_release(&rt->dst);
b3cdd2a7
JV
328 return NF_STOLEN;
329 }
330
331 /* drop old route */
adf30907 332 skb_dst_drop(skb);
d8d1f30b 333 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
334
335 /* Another hack: avoid icmp_send in ip_fragment */
336 skb->local_df = 1;
337
7911b5c7 338 IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
b3cdd2a7
JV
339
340 LeaveFunction(10);
341 return NF_STOLEN;
342
343 tx_error_icmp:
344 dst_link_failure(skb);
345 tx_error:
346 kfree_skb(skb);
347 LeaveFunction(10);
348 return NF_STOLEN;
349}
350#endif
1da177e4 351
6523ce15
JA
352void
353ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
7b215ffc
HE
354{
355 struct nf_conn *ct = (struct nf_conn *)skb->nfct;
356 struct nf_conntrack_tuple new_tuple;
357
358 if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
359 return;
360
361 /*
362 * The connection is not yet in the hashtable, so we update it.
363 * CIP->VIP will remain the same, so leave the tuple in
364 * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
365 * real-server we will see RIP->DIP.
366 */
367 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
6523ce15
JA
368 if (outin)
369 new_tuple.src.u3 = cp->daddr;
370 else
371 new_tuple.dst.u3 = cp->vaddr;
7b215ffc
HE
372 /*
373 * This will also take care of UDP and other protocols.
374 */
6523ce15
JA
375 if (outin)
376 new_tuple.src.u.tcp.port = cp->dport;
377 else
378 new_tuple.dst.u.tcp.port = cp->vport;
7b215ffc
HE
379 nf_conntrack_alter_reply(ct, &new_tuple);
380}
381
1da177e4
LT
382/*
383 * NAT transmitter (only for outside-to-inside nat forwarding)
384 * Not used for related ICMP
385 */
386int
387ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
388 struct ip_vs_protocol *pp)
389{
390 struct rtable *rt; /* Route to the other host */
391 int mtu;
eddc9ec5 392 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
393
394 EnterFunction(10);
395
396 /* check if it is a connection of no-client-port */
397 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
014d730d 398 __be16 _pt, *p;
1da177e4
LT
399 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
400 if (p == NULL)
401 goto tx_error;
402 ip_vs_conn_fill_cport(cp, *p);
403 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
404 }
405
406 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
407 goto tx_error_icmp;
408
409 /* MTU checking */
d8d1f30b 410 mtu = dst_mtu(&rt->dst);
4412ec49 411 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
1da177e4
LT
412 ip_rt_put(rt);
413 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
414 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
415 goto tx_error;
416 }
417
418 /* copy-on-write the packet before mangling it */
af1e1cf0 419 if (!skb_make_writable(skb, sizeof(struct iphdr)))
1da177e4
LT
420 goto tx_error_put;
421
d8d1f30b 422 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1da177e4
LT
423 goto tx_error_put;
424
425 /* drop old route */
adf30907 426 skb_dst_drop(skb);
d8d1f30b 427 skb_dst_set(skb, &rt->dst);
1da177e4
LT
428
429 /* mangle the packet */
3db05fea 430 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
1da177e4 431 goto tx_error;
e7ade46a 432 ip_hdr(skb)->daddr = cp->daddr.ip;
eddc9ec5 433 ip_send_check(ip_hdr(skb));
1da177e4
LT
434
435 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
436
6523ce15 437 ip_vs_update_conntrack(skb, cp, 1);
7b215ffc 438
1da177e4
LT
439 /* FIXME: when application helper enlarges the packet and the length
440 is larger than the MTU of outgoing device, there will be still
441 MTU problem. */
442
443 /* Another hack: avoid icmp_send in ip_fragment */
444 skb->local_df = 1;
445
7911b5c7 446 IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
1da177e4
LT
447
448 LeaveFunction(10);
449 return NF_STOLEN;
450
451 tx_error_icmp:
452 dst_link_failure(skb);
453 tx_error:
454 LeaveFunction(10);
455 kfree_skb(skb);
456 return NF_STOLEN;
457 tx_error_put:
458 ip_rt_put(rt);
459 goto tx_error;
460}
461
b3cdd2a7
JV
462#ifdef CONFIG_IP_VS_IPV6
463int
464ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
465 struct ip_vs_protocol *pp)
466{
467 struct rt6_info *rt; /* Route to the other host */
468 int mtu;
469
470 EnterFunction(10);
471
472 /* check if it is a connection of no-client-port */
473 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
474 __be16 _pt, *p;
475 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
476 sizeof(_pt), &_pt);
477 if (p == NULL)
478 goto tx_error;
479 ip_vs_conn_fill_cport(cp, *p);
480 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
481 }
482
483 rt = __ip_vs_get_out_rt_v6(cp);
484 if (!rt)
485 goto tx_error_icmp;
486
487 /* MTU checking */
d8d1f30b 488 mtu = dst_mtu(&rt->dst);
b3cdd2a7 489 if (skb->len > mtu) {
d8d1f30b 490 dst_release(&rt->dst);
3ffe533c 491 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
b3cdd2a7
JV
492 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
493 "ip_vs_nat_xmit_v6(): frag needed for");
494 goto tx_error;
495 }
496
497 /* copy-on-write the packet before mangling it */
498 if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
499 goto tx_error_put;
500
d8d1f30b 501 if (skb_cow(skb, rt->dst.dev->hard_header_len))
b3cdd2a7
JV
502 goto tx_error_put;
503
504 /* drop old route */
adf30907 505 skb_dst_drop(skb);
d8d1f30b 506 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
507
508 /* mangle the packet */
509 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
510 goto tx_error;
511 ipv6_hdr(skb)->daddr = cp->daddr.in6;
512
513 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
514
6523ce15 515 ip_vs_update_conntrack(skb, cp, 1);
7b215ffc 516
b3cdd2a7
JV
517 /* FIXME: when application helper enlarges the packet and the length
518 is larger than the MTU of outgoing device, there will be still
519 MTU problem. */
520
521 /* Another hack: avoid icmp_send in ip_fragment */
522 skb->local_df = 1;
523
7911b5c7 524 IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
b3cdd2a7
JV
525
526 LeaveFunction(10);
527 return NF_STOLEN;
528
529tx_error_icmp:
530 dst_link_failure(skb);
531tx_error:
532 LeaveFunction(10);
533 kfree_skb(skb);
534 return NF_STOLEN;
535tx_error_put:
d8d1f30b 536 dst_release(&rt->dst);
b3cdd2a7
JV
537 goto tx_error;
538}
539#endif
540
1da177e4
LT
541
542/*
543 * IP Tunneling transmitter
544 *
545 * This function encapsulates the packet in a new IP packet, its
546 * destination will be set to cp->daddr. Most code of this function
547 * is taken from ipip.c.
548 *
549 * It is used in VS/TUN cluster. The load balancer selects a real
550 * server from a cluster based on a scheduling algorithm,
551 * encapsulates the request packet and forwards it to the selected
552 * server. For example, all real servers are configured with
553 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
554 * the encapsulated packet, it will decapsulate the packet, process
555 * the request and return the response packets directly to the client
556 * without passing the load balancer. This can greatly increase the
557 * scalability of virtual server.
558 *
559 * Used for ANY protocol
560 */
561int
562ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
563 struct ip_vs_protocol *pp)
564{
565 struct rtable *rt; /* Route to the other host */
566 struct net_device *tdev; /* Device to other host */
eddc9ec5 567 struct iphdr *old_iph = ip_hdr(skb);
1da177e4 568 u8 tos = old_iph->tos;
76ab608d 569 __be16 df = old_iph->frag_off;
2e07fa9c 570 sk_buff_data_t old_transport_header = skb->transport_header;
1da177e4 571 struct iphdr *iph; /* Our new IP header */
c2636b4d 572 unsigned int max_headroom; /* The extra header space needed */
1da177e4
LT
573 int mtu;
574
575 EnterFunction(10);
576
4412ec49 577 if (skb->protocol != htons(ETH_P_IP)) {
1e3e238e 578 IP_VS_DBG_RL("%s(): protocol error, "
1da177e4 579 "ETH_P_IP: %d, skb protocol: %d\n",
1e3e238e 580 __func__, htons(ETH_P_IP), skb->protocol);
1da177e4
LT
581 goto tx_error;
582 }
583
584 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
585 goto tx_error_icmp;
586
d8d1f30b 587 tdev = rt->dst.dev;
1da177e4 588
d8d1f30b 589 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
1da177e4
LT
590 if (mtu < 68) {
591 ip_rt_put(rt);
1e3e238e 592 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
1da177e4
LT
593 goto tx_error;
594 }
adf30907
ED
595 if (skb_dst(skb))
596 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1da177e4 597
4412ec49 598 df |= (old_iph->frag_off & htons(IP_DF));
1da177e4 599
4412ec49 600 if ((old_iph->frag_off & htons(IP_DF))
1da177e4
LT
601 && mtu < ntohs(old_iph->tot_len)) {
602 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
603 ip_rt_put(rt);
1e3e238e 604 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
605 goto tx_error;
606 }
607
608 /*
609 * Okay, now see if we can stuff it in the buffer as-is.
610 */
611 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
612
613 if (skb_headroom(skb) < max_headroom
614 || skb_cloned(skb) || skb_shared(skb)) {
615 struct sk_buff *new_skb =
616 skb_realloc_headroom(skb, max_headroom);
617 if (!new_skb) {
618 ip_rt_put(rt);
619 kfree_skb(skb);
1e3e238e 620 IP_VS_ERR_RL("%s(): no memory\n", __func__);
1da177e4
LT
621 return NF_STOLEN;
622 }
623 kfree_skb(skb);
624 skb = new_skb;
eddc9ec5 625 old_iph = ip_hdr(skb);
1da177e4
LT
626 }
627
b0e380b1 628 skb->transport_header = old_transport_header;
1da177e4
LT
629
630 /* fix old IP header checksum */
631 ip_send_check(old_iph);
632
e2d1bca7
ACM
633 skb_push(skb, sizeof(struct iphdr));
634 skb_reset_network_header(skb);
1da177e4
LT
635 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
636
637 /* drop old route */
adf30907 638 skb_dst_drop(skb);
d8d1f30b 639 skb_dst_set(skb, &rt->dst);
1da177e4
LT
640
641 /*
642 * Push down and install the IPIP header.
643 */
eddc9ec5 644 iph = ip_hdr(skb);
1da177e4
LT
645 iph->version = 4;
646 iph->ihl = sizeof(struct iphdr)>>2;
647 iph->frag_off = df;
648 iph->protocol = IPPROTO_IPIP;
649 iph->tos = tos;
650 iph->daddr = rt->rt_dst;
651 iph->saddr = rt->rt_src;
652 iph->ttl = old_iph->ttl;
d8d1f30b 653 ip_select_ident(iph, &rt->dst, NULL);
1da177e4
LT
654
655 /* Another hack: avoid icmp_send in ip_fragment */
656 skb->local_df = 1;
657
c439cb2e 658 ip_local_out(skb);
1da177e4
LT
659
660 LeaveFunction(10);
661
662 return NF_STOLEN;
663
664 tx_error_icmp:
665 dst_link_failure(skb);
666 tx_error:
667 kfree_skb(skb);
668 LeaveFunction(10);
669 return NF_STOLEN;
670}
671
b3cdd2a7
JV
672#ifdef CONFIG_IP_VS_IPV6
673int
674ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
675 struct ip_vs_protocol *pp)
676{
677 struct rt6_info *rt; /* Route to the other host */
678 struct net_device *tdev; /* Device to other host */
679 struct ipv6hdr *old_iph = ipv6_hdr(skb);
680 sk_buff_data_t old_transport_header = skb->transport_header;
681 struct ipv6hdr *iph; /* Our new IP header */
682 unsigned int max_headroom; /* The extra header space needed */
683 int mtu;
684
685 EnterFunction(10);
686
687 if (skb->protocol != htons(ETH_P_IPV6)) {
1e3e238e 688 IP_VS_DBG_RL("%s(): protocol error, "
b3cdd2a7 689 "ETH_P_IPV6: %d, skb protocol: %d\n",
1e3e238e 690 __func__, htons(ETH_P_IPV6), skb->protocol);
b3cdd2a7
JV
691 goto tx_error;
692 }
693
694 rt = __ip_vs_get_out_rt_v6(cp);
695 if (!rt)
696 goto tx_error_icmp;
697
d8d1f30b 698 tdev = rt->dst.dev;
b3cdd2a7 699
d8d1f30b 700 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
b3cdd2a7
JV
701 /* TODO IPv6: do we need this check in IPv6? */
702 if (mtu < 1280) {
d8d1f30b 703 dst_release(&rt->dst);
1e3e238e 704 IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
b3cdd2a7
JV
705 goto tx_error;
706 }
adf30907
ED
707 if (skb_dst(skb))
708 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
b3cdd2a7
JV
709
710 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
3ffe533c 711 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
d8d1f30b 712 dst_release(&rt->dst);
1e3e238e 713 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
714 goto tx_error;
715 }
716
717 /*
718 * Okay, now see if we can stuff it in the buffer as-is.
719 */
720 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
721
722 if (skb_headroom(skb) < max_headroom
723 || skb_cloned(skb) || skb_shared(skb)) {
724 struct sk_buff *new_skb =
725 skb_realloc_headroom(skb, max_headroom);
726 if (!new_skb) {
d8d1f30b 727 dst_release(&rt->dst);
b3cdd2a7 728 kfree_skb(skb);
1e3e238e 729 IP_VS_ERR_RL("%s(): no memory\n", __func__);
b3cdd2a7
JV
730 return NF_STOLEN;
731 }
732 kfree_skb(skb);
733 skb = new_skb;
734 old_iph = ipv6_hdr(skb);
735 }
736
737 skb->transport_header = old_transport_header;
738
739 skb_push(skb, sizeof(struct ipv6hdr));
740 skb_reset_network_header(skb);
741 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
742
743 /* drop old route */
adf30907 744 skb_dst_drop(skb);
d8d1f30b 745 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
746
747 /*
748 * Push down and install the IPIP header.
749 */
750 iph = ipv6_hdr(skb);
751 iph->version = 6;
752 iph->nexthdr = IPPROTO_IPV6;
b7b45f47
HH
753 iph->payload_len = old_iph->payload_len;
754 be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
b3cdd2a7
JV
755 iph->priority = old_iph->priority;
756 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
757 iph->daddr = rt->rt6i_dst.addr;
758 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
759 iph->hop_limit = old_iph->hop_limit;
760
761 /* Another hack: avoid icmp_send in ip_fragment */
762 skb->local_df = 1;
763
764 ip6_local_out(skb);
765
766 LeaveFunction(10);
767
768 return NF_STOLEN;
769
770tx_error_icmp:
771 dst_link_failure(skb);
772tx_error:
773 kfree_skb(skb);
774 LeaveFunction(10);
775 return NF_STOLEN;
776}
777#endif
778
1da177e4
LT
779
780/*
781 * Direct Routing transmitter
782 * Used for ANY protocol
783 */
784int
785ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
786 struct ip_vs_protocol *pp)
787{
788 struct rtable *rt; /* Route to the other host */
eddc9ec5 789 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
790 int mtu;
791
792 EnterFunction(10);
793
794 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
795 goto tx_error_icmp;
796
797 /* MTU checking */
d8d1f30b 798 mtu = dst_mtu(&rt->dst);
4412ec49 799 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
1da177e4
LT
800 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
801 ip_rt_put(rt);
1e3e238e 802 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
803 goto tx_error;
804 }
805
806 /*
807 * Call ip_send_check because we are not sure it is called
808 * after ip_defrag. Is copy-on-write needed?
809 */
810 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
811 ip_rt_put(rt);
812 return NF_STOLEN;
813 }
eddc9ec5 814 ip_send_check(ip_hdr(skb));
1da177e4
LT
815
816 /* drop old route */
adf30907 817 skb_dst_drop(skb);
d8d1f30b 818 skb_dst_set(skb, &rt->dst);
1da177e4
LT
819
820 /* Another hack: avoid icmp_send in ip_fragment */
821 skb->local_df = 1;
822
7911b5c7 823 IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
1da177e4
LT
824
825 LeaveFunction(10);
826 return NF_STOLEN;
827
828 tx_error_icmp:
829 dst_link_failure(skb);
830 tx_error:
831 kfree_skb(skb);
832 LeaveFunction(10);
833 return NF_STOLEN;
834}
835
b3cdd2a7
JV
836#ifdef CONFIG_IP_VS_IPV6
837int
838ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
839 struct ip_vs_protocol *pp)
840{
841 struct rt6_info *rt; /* Route to the other host */
842 int mtu;
843
844 EnterFunction(10);
845
846 rt = __ip_vs_get_out_rt_v6(cp);
847 if (!rt)
848 goto tx_error_icmp;
849
850 /* MTU checking */
d8d1f30b 851 mtu = dst_mtu(&rt->dst);
b3cdd2a7 852 if (skb->len > mtu) {
3ffe533c 853 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
d8d1f30b 854 dst_release(&rt->dst);
1e3e238e 855 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
856 goto tx_error;
857 }
858
859 /*
860 * Call ip_send_check because we are not sure it is called
861 * after ip_defrag. Is copy-on-write needed?
862 */
863 skb = skb_share_check(skb, GFP_ATOMIC);
864 if (unlikely(skb == NULL)) {
d8d1f30b 865 dst_release(&rt->dst);
b3cdd2a7
JV
866 return NF_STOLEN;
867 }
868
869 /* drop old route */
adf30907 870 skb_dst_drop(skb);
d8d1f30b 871 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
872
873 /* Another hack: avoid icmp_send in ip_fragment */
874 skb->local_df = 1;
875
7911b5c7 876 IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
b3cdd2a7
JV
877
878 LeaveFunction(10);
879 return NF_STOLEN;
880
881tx_error_icmp:
882 dst_link_failure(skb);
883tx_error:
884 kfree_skb(skb);
885 LeaveFunction(10);
886 return NF_STOLEN;
887}
888#endif
889
1da177e4
LT
890
891/*
892 * ICMP packet transmitter
893 * called by the ip_vs_in_icmp
894 */
895int
896ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
897 struct ip_vs_protocol *pp, int offset)
898{
899 struct rtable *rt; /* Route to the other host */
900 int mtu;
901 int rc;
902
903 EnterFunction(10);
904
905 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
906 forwarded directly here, because there is no need to
907 translate address/port back */
908 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
909 if (cp->packet_xmit)
910 rc = cp->packet_xmit(skb, cp, pp);
911 else
912 rc = NF_ACCEPT;
913 /* do not touch skb anymore */
914 atomic_inc(&cp->in_pkts);
1da177e4
LT
915 goto out;
916 }
917
918 /*
919 * mangle and send the packet here (only for VS/NAT)
920 */
921
eddc9ec5 922 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
1da177e4
LT
923 goto tx_error_icmp;
924
925 /* MTU checking */
d8d1f30b 926 mtu = dst_mtu(&rt->dst);
eddc9ec5 927 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
1da177e4
LT
928 ip_rt_put(rt);
929 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1e3e238e 930 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1da177e4
LT
931 goto tx_error;
932 }
933
934 /* copy-on-write the packet before mangling it */
af1e1cf0 935 if (!skb_make_writable(skb, offset))
1da177e4
LT
936 goto tx_error_put;
937
d8d1f30b 938 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1da177e4
LT
939 goto tx_error_put;
940
941 /* drop the old route when skb is not shared */
adf30907 942 skb_dst_drop(skb);
d8d1f30b 943 skb_dst_set(skb, &rt->dst);
1da177e4
LT
944
945 ip_vs_nat_icmp(skb, pp, cp, 0);
946
947 /* Another hack: avoid icmp_send in ip_fragment */
948 skb->local_df = 1;
949
7911b5c7 950 IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
1da177e4
LT
951
952 rc = NF_STOLEN;
953 goto out;
954
955 tx_error_icmp:
956 dst_link_failure(skb);
957 tx_error:
958 dev_kfree_skb(skb);
959 rc = NF_STOLEN;
960 out:
961 LeaveFunction(10);
962 return rc;
963 tx_error_put:
964 ip_rt_put(rt);
965 goto tx_error;
966}
b3cdd2a7
JV
967
968#ifdef CONFIG_IP_VS_IPV6
969int
970ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
971 struct ip_vs_protocol *pp, int offset)
972{
973 struct rt6_info *rt; /* Route to the other host */
974 int mtu;
975 int rc;
976
977 EnterFunction(10);
978
979 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
980 forwarded directly here, because there is no need to
981 translate address/port back */
982 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
983 if (cp->packet_xmit)
984 rc = cp->packet_xmit(skb, cp, pp);
985 else
986 rc = NF_ACCEPT;
987 /* do not touch skb anymore */
988 atomic_inc(&cp->in_pkts);
989 goto out;
990 }
991
992 /*
993 * mangle and send the packet here (only for VS/NAT)
994 */
995
996 rt = __ip_vs_get_out_rt_v6(cp);
997 if (!rt)
998 goto tx_error_icmp;
999
1000 /* MTU checking */
d8d1f30b 1001 mtu = dst_mtu(&rt->dst);
b3cdd2a7 1002 if (skb->len > mtu) {
d8d1f30b 1003 dst_release(&rt->dst);
3ffe533c 1004 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1e3e238e 1005 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
b3cdd2a7
JV
1006 goto tx_error;
1007 }
1008
1009 /* copy-on-write the packet before mangling it */
1010 if (!skb_make_writable(skb, offset))
1011 goto tx_error_put;
1012
d8d1f30b 1013 if (skb_cow(skb, rt->dst.dev->hard_header_len))
b3cdd2a7
JV
1014 goto tx_error_put;
1015
1016 /* drop the old route when skb is not shared */
adf30907 1017 skb_dst_drop(skb);
d8d1f30b 1018 skb_dst_set(skb, &rt->dst);
b3cdd2a7
JV
1019
1020 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1021
1022 /* Another hack: avoid icmp_send in ip_fragment */
1023 skb->local_df = 1;
1024
7911b5c7 1025 IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
b3cdd2a7
JV
1026
1027 rc = NF_STOLEN;
1028 goto out;
1029
1030tx_error_icmp:
1031 dst_link_failure(skb);
1032tx_error:
1033 dev_kfree_skb(skb);
1034 rc = NF_STOLEN;
1035out:
1036 LeaveFunction(10);
1037 return rc;
1038tx_error_put:
d8d1f30b 1039 dst_release(&rt->dst);
b3cdd2a7
JV
1040 goto tx_error;
1041}
1042#endif