/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>


/*
 *      Destination cache to speed up outgoing route lookup
 */
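/*
 * __ip_vs_dst_set() installs a new cached route and drops the reference
 * on the old one; __ip_vs_dst_check() revalidates the cached entry (and,
 * for IPv4, its TOS) and returns it with an extra reference, or NULL when
 * the cache is empty or stale.
 */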
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        old_dst = dest->dst_cache;
        dest->dst_cache = dst;
        dest->dst_rtos = rtos;
        dst_release(old_dst);
}

static inline struct dst_entry *
__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
{
        struct dst_entry *dst = dest->dst_cache;

        if (!dst)
                return NULL;
        if ((dst->obsolete
             || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
            dst->ops->check(dst, cookie) == NULL) {
                dest->dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }
        dst_hold(dst);
        return dst;
}

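/*
 * Look up the route to the real server.  When the connection has a
 * destination, the route is cached in dest->dst_cache under
 * dest->dst_lock and reused for subsequent packets; otherwise (e.g.
 * bypass towards cp->daddr) a fresh route is resolved for each packet.
 */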
static struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
        struct rtable *rt;                      /* Route to the other host */
        struct ip_vs_dest *dest = cp->dest;

        if (dest) {
                spin_lock(&dest->dst_lock);
                if (!(rt = (struct rtable *)
                      __ip_vs_dst_check(dest, rtos, 0))) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
                                        .ip4_u = {
                                                .daddr = dest->addr.ip,
                                                .saddr = 0,
                                                .tos = rtos, } },
                        };

                        if (ip_route_output_key(&init_net, &rt, &fl)) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
                                             &dest->addr.ip);
                                return NULL;
                        }
                        __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
                        IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
                                  &dest->addr.ip,
                                  atomic_read(&rt->dst.__refcnt), rtos);
                }
                spin_unlock(&dest->dst_lock);
        } else {
                struct flowi fl = {
                        .oif = 0,
                        .nl_u = {
                                .ip4_u = {
                                        .daddr = cp->daddr.ip,
                                        .saddr = 0,
                                        .tos = rtos, } },
                };

                if (ip_route_output_key(&init_net, &rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
                                     &cp->daddr.ip);
                        return NULL;
                }
        }

        return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static struct rt6_info *
__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
{
        struct rt6_info *rt;                    /* Route to the other host */
        struct ip_vs_dest *dest = cp->dest;

        if (dest) {
                spin_lock(&dest->dst_lock);
                rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
                if (!rt) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
                                        .ip6_u = {
                                                .daddr = dest->addr.in6,
                                                .saddr = {
                                                        .s6_addr32 =
                                                                { 0, 0, 0, 0 },
                                                },
                                        },
                                },
                        };

                        rt = (struct rt6_info *)ip6_route_output(&init_net,
                                                                 NULL, &fl);
                        if (!rt) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
                                             &dest->addr.in6);
                                return NULL;
                        }
                        __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
                        IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
                                  &dest->addr.in6,
                                  atomic_read(&rt->dst.__refcnt));
                }
                spin_unlock(&dest->dst_lock);
        } else {
                struct flowi fl = {
                        .oif = 0,
                        .nl_u = {
                                .ip6_u = {
                                        .daddr = cp->daddr.in6,
                                        .saddr = {
                                                .s6_addr32 = { 0, 0, 0, 0 },
                                        },
                                },
                        },
                };

                rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
                if (!rt) {
                        IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
                                     &cp->daddr.in6);
                        return NULL;
                }
        }

        return rt;
}
#endif


/*
 *      Release dest->dst_cache before a dest is removed
 */
void
ip_vs_dst_reset(struct ip_vs_dest *dest)
{
        struct dst_entry *old_dst;

        old_dst = dest->dst_cache;
        dest->dst_cache = NULL;
        dst_release(old_dst);
}

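/*
 * Re-inject a packet towards its new route: mark it as IPVS-owned
 * (skb->ipvs_property), let skb_forward_csum() fix up the checksum
 * state for forwarding, and hand it to dst_output() through the
 * NF_INET_LOCAL_OUT hook.
 */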
#define IP_VS_XMIT(pf, skb, rt)                         \
do {                                                    \
        (skb)->ipvs_property = 1;                       \
        skb_forward_csum(skb);                          \
        NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
                (rt)->dst.dev, dst_output);             \
} while (0)


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp)
{
        /* we do not touch skb and do not need pskb ptr */
        return NF_ACCEPT;
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it may be used only in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
        u8     tos = iph->tos;
        int    mtu;
        struct flowi fl = {
                .oif = 0,
                .nl_u = {
                        .ip4_u = {
                                .daddr = iph->daddr,
                                .saddr = 0,
                                .tos = RT_TOS(tos), } },
        };

        EnterFunction(10);

        if (ip_route_output_key(&init_net, &rt, &fl)) {
                IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
                             __func__, &iph->daddr);
                goto tx_error_icmp;
        }

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(ip_hdr(skb));

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV4, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

 tx_error_icmp:
        dst_link_failure(skb);
 tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                     struct ip_vs_protocol *pp)
{
        struct rt6_info *rt;                    /* Route to the other host */
        struct ipv6hdr  *iph = ipv6_hdr(skb);
        int    mtu;
        struct flowi fl = {
                .oif = 0,
                .nl_u = {
                        .ip6_u = {
                                .daddr = iph->daddr,
                                .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
        };

        EnterFunction(10);

        rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
        if (!rt) {
                IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
                             __func__, &iph->daddr);
                goto tx_error_icmp;
        }

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
                dst_release(&rt->dst);
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(skb == NULL)) {
                dst_release(&rt->dst);
                return NF_STOLEN;
        }

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV6, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

 tx_error_icmp:
        dst_link_failure(skb);
 tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
#endif

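/*
 * Keep the conntrack entry in sync with the IPVS NAT mapping by
 * rewriting the expected reply tuple before the connection is
 * confirmed; outin selects which direction is being translated.
 */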
void
ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
{
        struct nf_conn *ct = (struct nf_conn *)skb->nfct;
        struct nf_conntrack_tuple new_tuple;

        if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
                return;

        /*
         * The connection is not yet in the hashtable, so we update it.
         * CIP->VIP will remain the same, so leave the tuple in
         * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
         * real-server we will see RIP->DIP.
         */
        new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
        if (outin)
                new_tuple.src.u3 = cp->daddr;
        else
                new_tuple.dst.u3 = cp->vaddr;
        /*
         * This will also take care of UDP and other protocols.
         */
        if (outin)
                new_tuple.src.u.tcp.port = cp->dport;
        else
                new_tuple.dst.u.tcp.port = cp->vport;
        nf_conntrack_alter_reply(ct, &new_tuple);
}

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
               struct ip_vs_protocol *pp)
{
        struct rtable *rt;              /* Route to the other host */
        int mtu;
        struct iphdr *iph = ip_hdr(skb);

        EnterFunction(10);

        /* check if it is a connection of no-client-port */
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
                __be16 _pt, *p;
                p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
                if (p == NULL)
                        goto tx_error;
                ip_vs_conn_fill_cport(cp, *p);
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!skb_make_writable(skb, sizeof(struct iphdr)))
                goto tx_error_put;

        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
                goto tx_error;
        ip_hdr(skb)->daddr = cp->daddr.ip;
        ip_send_check(ip_hdr(skb));

        IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

        ip_vs_update_conntrack(skb, cp, 1);

        /* FIXME: when an application helper enlarges the packet and its
           length becomes larger than the MTU of the outgoing device, there
           will still be an MTU problem. */

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV4, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        LeaveFunction(10);
        kfree_skb(skb);
        return NF_STOLEN;
  tx_error_put:
        ip_rt_put(rt);
        goto tx_error;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rt6_info *rt;            /* Route to the other host */
        int mtu;

        EnterFunction(10);

        /* check if it is a connection of no-client-port */
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
                __be16 _pt, *p;
                p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
                                       sizeof(_pt), &_pt);
                if (p == NULL)
                        goto tx_error;
                ip_vs_conn_fill_cport(cp, *p);
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }

        rt = __ip_vs_get_out_rt_v6(cp);
        if (!rt)
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
                dst_release(&rt->dst);
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL_PKT(0, pp, skb, 0,
                                 "ip_vs_nat_xmit_v6(): frag needed for");
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
                goto tx_error_put;

        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
                goto tx_error;
        ipv6_hdr(skb)->daddr = cp->daddr.in6;

        IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

        ip_vs_update_conntrack(skb, cp, 1);

        /* FIXME: when an application helper enlarges the packet and its
           length becomes larger than the MTU of the outgoing device, there
           will still be an MTU problem. */

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV6, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

tx_error_icmp:
        dst_link_failure(skb);
tx_error:
        LeaveFunction(10);
        kfree_skb(skb);
        return NF_STOLEN;
tx_error_put:
        dst_release(&rt->dst);
        goto tx_error;
}
#endif


/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, whose
 *   destination is set to cp->daddr. Most of the code in this function
 *   is taken from ipip.c.
 *
 *   It is used in a VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
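/*
 * The outer IPIP header costs sizeof(struct iphdr) bytes, so the MTU
 * check below is made against the reduced MTU, and an ICMP
 * "fragmentation needed" error is returned when DF is set on an
 * oversized packet.
 */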
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *old_iph = ip_hdr(skb);
        u8     tos = old_iph->tos;
        __be16 df = old_iph->frag_off;
        sk_buff_data_t old_transport_header = skb->transport_header;
        struct iphdr  *iph;                     /* Our new IP header */
        unsigned int max_headroom;              /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        if (skb->protocol != htons(ETH_P_IP)) {
                IP_VS_DBG_RL("%s(): protocol error, "
                             "ETH_P_IP: %d, skb protocol: %d\n",
                             __func__, htons(ETH_P_IP), skb->protocol);
                goto tx_error;
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->dst.dev;

        mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
        if (mtu < 68) {
                ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
                goto tx_error;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

        df |= (old_iph->frag_off & htons(IP_DF));

        if ((old_iph->frag_off & htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        kfree_skb(skb);
                        IP_VS_ERR_RL("%s(): no memory\n", __func__);
                        return NF_STOLEN;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = ip_hdr(skb);
        }

        skb->transport_header = old_transport_header;

        /* fix old IP header checksum */
        ip_send_check(old_iph);

        skb_push(skb, sizeof(struct iphdr));
        skb_reset_network_header(skb);
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       ip_hdr(skb);
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_IPIP;
        iph->tos                =       tos;
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;
        iph->ttl                =       old_iph->ttl;
        ip_select_ident(iph, &rt->dst, NULL);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        ip_local_out(skb);

        LeaveFunction(10);

        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                     struct ip_vs_protocol *pp)
{
        struct rt6_info *rt;            /* Route to the other host */
        struct net_device *tdev;        /* Device to other host */
        struct ipv6hdr  *old_iph = ipv6_hdr(skb);
        sk_buff_data_t old_transport_header = skb->transport_header;
        struct ipv6hdr  *iph;           /* Our new IP header */
        unsigned int max_headroom;      /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        if (skb->protocol != htons(ETH_P_IPV6)) {
                IP_VS_DBG_RL("%s(): protocol error, "
                             "ETH_P_IPV6: %d, skb protocol: %d\n",
                             __func__, htons(ETH_P_IPV6), skb->protocol);
                goto tx_error;
        }

        rt = __ip_vs_get_out_rt_v6(cp);
        if (!rt)
                goto tx_error_icmp;

        tdev = rt->dst.dev;

        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
        /* TODO IPv6: do we need this check in IPv6? */
        if (mtu < 1280) {
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
                goto tx_error;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

        if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        dst_release(&rt->dst);
                        kfree_skb(skb);
                        IP_VS_ERR_RL("%s(): no memory\n", __func__);
                        return NF_STOLEN;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = ipv6_hdr(skb);
        }

        skb->transport_header = old_transport_header;

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       ipv6_hdr(skb);
        iph->version            =       6;
        iph->nexthdr            =       IPPROTO_IPV6;
        iph->payload_len        =       old_iph->payload_len;
        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
        iph->priority           =       old_iph->priority;
        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
        iph->daddr              =       rt->rt6i_dst.addr;
        iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
        iph->hop_limit          =       old_iph->hop_limit;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        ip6_local_out(skb);

        LeaveFunction(10);

        return NF_STOLEN;

tx_error_icmp:
        dst_link_failure(skb);
tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
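/*
 * The packet is forwarded unmodified; only its route is replaced so it
 * leaves on the interface towards the chosen real server, which is
 * expected to accept the VIP locally.
 */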
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
              struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
        int    mtu;

        EnterFunction(10);

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(ip_hdr(skb));

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV4, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                 struct ip_vs_protocol *pp)
{
        struct rt6_info *rt;                    /* Route to the other host */
        int    mtu;

        EnterFunction(10);

        rt = __ip_vs_get_out_rt_v6(cp);
        if (!rt)
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(skb == NULL)) {
                dst_release(&rt->dst);
                return NF_STOLEN;
        }

        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV6, skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

tx_error_icmp:
        dst_link_failure(skb);
tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
#endif


/*
 *      ICMP packet transmitter
 *      called by ip_vs_in_icmp
 */
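/*
 * Only VS/NAT connections need the ICMP payload mangled (via
 * ip_vs_nat_icmp()); all other forwarding methods pass the error
 * straight to the connection's packet_xmit handler.
 */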
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp, int offset)
{
        struct rtable   *rt;    /* Route to the other host */
        int mtu;
        int rc;

        EnterFunction(10);

        /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
           forwarded directly here, because there is no need to
           translate address/port back */
        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
                if (cp->packet_xmit)
                        rc = cp->packet_xmit(skb, cp, pp);
                else
                        rc = NF_ACCEPT;
                /* do not touch skb anymore */
                atomic_inc(&cp->in_pkts);
                goto out;
        }

        /*
         * mangle and send the packet here (only for VS/NAT)
         */

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!skb_make_writable(skb, offset))
                goto tx_error_put;

        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop the old route when skb is not shared */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        ip_vs_nat_icmp(skb, pp, cp, 0);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV4, skb, rt);

        rc = NF_STOLEN;
        goto out;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        dev_kfree_skb(skb);
        rc = NF_STOLEN;
  out:
        LeaveFunction(10);
        return rc;
  tx_error_put:
        ip_rt_put(rt);
        goto tx_error;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp, int offset)
{
        struct rt6_info *rt;    /* Route to the other host */
        int mtu;
        int rc;

        EnterFunction(10);

        /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
           forwarded directly here, because there is no need to
           translate address/port back */
        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
                if (cp->packet_xmit)
                        rc = cp->packet_xmit(skb, cp, pp);
                else
                        rc = NF_ACCEPT;
                /* do not touch skb anymore */
                atomic_inc(&cp->in_pkts);
                goto out;
        }

        /*
         * mangle and send the packet here (only for VS/NAT)
         */

        rt = __ip_vs_get_out_rt_v6(cp);
        if (!rt)
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
                dst_release(&rt->dst);
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!skb_make_writable(skb, offset))
                goto tx_error_put;

        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop the old route when skb is not shared */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);

        ip_vs_nat_icmp_v6(skb, pp, cp, 0);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(NFPROTO_IPV6, skb, rt);

        rc = NF_STOLEN;
        goto out;

tx_error_icmp:
        dst_link_failure(skb);
tx_error:
        dev_kfree_skb(skb);
        rc = NF_STOLEN;
out:
        LeaveFunction(10);
        return rc;
tx_error_put:
        dst_release(&rt->dst);
        goto tx_error;
}
#endif