]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/netfilter/ipvs/ip_vs_xmit.c
Merge branch 'message-callback' into kbuild/kconfig
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:
13  *
14  */
15
16 #define KMSG_COMPONENT "IPVS"
17 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
19 #include <linux/kernel.h>
20 #include <linux/slab.h>
21 #include <linux/tcp.h>                  /* for tcphdr */
22 #include <net/ip.h>
23 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
24 #include <net/udp.h>
25 #include <net/icmp.h>                   /* for icmp_send */
26 #include <net/route.h>                  /* for ip_route_output */
27 #include <net/ipv6.h>
28 #include <net/ip6_route.h>
29 #include <linux/icmpv6.h>
30 #include <linux/netfilter.h>
31 #include <net/netfilter/nf_conntrack.h>
32 #include <linux/netfilter_ipv4.h>
33
34 #include <net/ip_vs.h>
35
36
37 /*
38  *      Destination cache to speed up outgoing route lookup
39  */
40 static inline void
41 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
42 {
43         struct dst_entry *old_dst;
44
45         old_dst = dest->dst_cache;
46         dest->dst_cache = dst;
47         dest->dst_rtos = rtos;
48         dst_release(old_dst);
49 }
50
51 static inline struct dst_entry *
52 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
53 {
54         struct dst_entry *dst = dest->dst_cache;
55
56         if (!dst)
57                 return NULL;
58         if ((dst->obsolete
59              || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
60             dst->ops->check(dst, cookie) == NULL) {
61                 dest->dst_cache = NULL;
62                 dst_release(dst);
63                 return NULL;
64         }
65         dst_hold(dst);
66         return dst;
67 }
68
69 static struct rtable *
70 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
71 {
72         struct rtable *rt;                      /* Route to the other host */
73         struct ip_vs_dest *dest = cp->dest;
74
75         if (dest) {
76                 spin_lock(&dest->dst_lock);
77                 if (!(rt = (struct rtable *)
78                       __ip_vs_dst_check(dest, rtos, 0))) {
79                         struct flowi fl = {
80                                 .oif = 0,
81                                 .nl_u = {
82                                         .ip4_u = {
83                                                 .daddr = dest->addr.ip,
84                                                 .saddr = 0,
85                                                 .tos = rtos, } },
86                         };
87
88                         if (ip_route_output_key(&init_net, &rt, &fl)) {
89                                 spin_unlock(&dest->dst_lock);
90                                 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
91                                              &dest->addr.ip);
92                                 return NULL;
93                         }
94                         __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
95                         IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
96                                   &dest->addr.ip,
97                                   atomic_read(&rt->dst.__refcnt), rtos);
98                 }
99                 spin_unlock(&dest->dst_lock);
100         } else {
101                 struct flowi fl = {
102                         .oif = 0,
103                         .nl_u = {
104                                 .ip4_u = {
105                                         .daddr = cp->daddr.ip,
106                                         .saddr = 0,
107                                         .tos = rtos, } },
108                 };
109
110                 if (ip_route_output_key(&init_net, &rt, &fl)) {
111                         IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
112                                      &cp->daddr.ip);
113                         return NULL;
114                 }
115         }
116
117         return rt;
118 }
119
120 #ifdef CONFIG_IP_VS_IPV6
121 static struct rt6_info *
122 __ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
123 {
124         struct rt6_info *rt;                    /* Route to the other host */
125         struct ip_vs_dest *dest = cp->dest;
126
127         if (dest) {
128                 spin_lock(&dest->dst_lock);
129                 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
130                 if (!rt) {
131                         struct flowi fl = {
132                                 .oif = 0,
133                                 .nl_u = {
134                                         .ip6_u = {
135                                                 .daddr = dest->addr.in6,
136                                                 .saddr = {
137                                                         .s6_addr32 =
138                                                                 { 0, 0, 0, 0 },
139                                                 },
140                                         },
141                                 },
142                         };
143
144                         rt = (struct rt6_info *)ip6_route_output(&init_net,
145                                                                  NULL, &fl);
146                         if (!rt) {
147                                 spin_unlock(&dest->dst_lock);
148                                 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
149                                              &dest->addr.in6);
150                                 return NULL;
151                         }
152                         __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
153                         IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
154                                   &dest->addr.in6,
155                                   atomic_read(&rt->dst.__refcnt));
156                 }
157                 spin_unlock(&dest->dst_lock);
158         } else {
159                 struct flowi fl = {
160                         .oif = 0,
161                         .nl_u = {
162                                 .ip6_u = {
163                                         .daddr = cp->daddr.in6,
164                                         .saddr = {
165                                                 .s6_addr32 = { 0, 0, 0, 0 },
166                                         },
167                                 },
168                         },
169                 };
170
171                 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
172                 if (!rt) {
173                         IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
174                                      &cp->daddr.in6);
175                         return NULL;
176                 }
177         }
178
179         return rt;
180 }
181 #endif
182
183
184 /*
185  *      Release dest->dst_cache before a dest is removed
186  */
187 void
188 ip_vs_dst_reset(struct ip_vs_dest *dest)
189 {
190         struct dst_entry *old_dst;
191
192         old_dst = dest->dst_cache;
193         dest->dst_cache = NULL;
194         dst_release(old_dst);
195 }
196
/*
 *	Common tail of the transmitters: flag the skb as IPVS property,
 *	call skb_forward_csum() on it and pass it through the
 *	NF_INET_LOCAL_OUT hook of protocol family @family, with the route's
 *	device as output device and dst_output as the continuation.
 *	Note that @pkt is expanded more than once.
 */
#define IP_VS_XMIT(family, pkt, route)				\
do {								\
	(pkt)->ipvs_property = 1;				\
	skb_forward_csum(pkt);					\
	NF_HOOK(family, NF_INET_LOCAL_OUT, (pkt), NULL,		\
		(route)->dst.dev, dst_output);			\
} while (0)
204
205
206 /*
207  *      NULL transmitter (do nothing except return NF_ACCEPT)
208  */
209 int
210 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
211                 struct ip_vs_protocol *pp)
212 {
213         /* we do not touch skb and do not need pskb ptr */
214         return NF_ACCEPT;
215 }
216
217
218 /*
219  *      Bypass transmitter
220  *      Let packets bypass the destination when the destination is not
221  *      available, it may be only used in transparent cache cluster.
222  */
223 int
224 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
225                   struct ip_vs_protocol *pp)
226 {
227         struct rtable *rt;                      /* Route to the other host */
228         struct iphdr  *iph = ip_hdr(skb);
229         u8     tos = iph->tos;
230         int    mtu;
231         struct flowi fl = {
232                 .oif = 0,
233                 .nl_u = {
234                         .ip4_u = {
235                                 .daddr = iph->daddr,
236                                 .saddr = 0,
237                                 .tos = RT_TOS(tos), } },
238         };
239
240         EnterFunction(10);
241
242         if (ip_route_output_key(&init_net, &rt, &fl)) {
243                 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
244                              __func__, &iph->daddr);
245                 goto tx_error_icmp;
246         }
247
248         /* MTU checking */
249         mtu = dst_mtu(&rt->dst);
250         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
251                 ip_rt_put(rt);
252                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
253                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
254                 goto tx_error;
255         }
256
257         /*
258          * Call ip_send_check because we are not sure it is called
259          * after ip_defrag. Is copy-on-write needed?
260          */
261         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
262                 ip_rt_put(rt);
263                 return NF_STOLEN;
264         }
265         ip_send_check(ip_hdr(skb));
266
267         /* drop old route */
268         skb_dst_drop(skb);
269         skb_dst_set(skb, &rt->dst);
270
271         /* Another hack: avoid icmp_send in ip_fragment */
272         skb->local_df = 1;
273
274         IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
275
276         LeaveFunction(10);
277         return NF_STOLEN;
278
279  tx_error_icmp:
280         dst_link_failure(skb);
281  tx_error:
282         kfree_skb(skb);
283         LeaveFunction(10);
284         return NF_STOLEN;
285 }
286
287 #ifdef CONFIG_IP_VS_IPV6
288 int
289 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
290                      struct ip_vs_protocol *pp)
291 {
292         struct rt6_info *rt;                    /* Route to the other host */
293         struct ipv6hdr  *iph = ipv6_hdr(skb);
294         int    mtu;
295         struct flowi fl = {
296                 .oif = 0,
297                 .nl_u = {
298                         .ip6_u = {
299                                 .daddr = iph->daddr,
300                                 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
301         };
302
303         EnterFunction(10);
304
305         rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
306         if (!rt) {
307                 IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
308                              __func__, &iph->daddr);
309                 goto tx_error_icmp;
310         }
311
312         /* MTU checking */
313         mtu = dst_mtu(&rt->dst);
314         if (skb->len > mtu) {
315                 dst_release(&rt->dst);
316                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
317                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
318                 goto tx_error;
319         }
320
321         /*
322          * Call ip_send_check because we are not sure it is called
323          * after ip_defrag. Is copy-on-write needed?
324          */
325         skb = skb_share_check(skb, GFP_ATOMIC);
326         if (unlikely(skb == NULL)) {
327                 dst_release(&rt->dst);
328                 return NF_STOLEN;
329         }
330
331         /* drop old route */
332         skb_dst_drop(skb);
333         skb_dst_set(skb, &rt->dst);
334
335         /* Another hack: avoid icmp_send in ip_fragment */
336         skb->local_df = 1;
337
338         IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
339
340         LeaveFunction(10);
341         return NF_STOLEN;
342
343  tx_error_icmp:
344         dst_link_failure(skb);
345  tx_error:
346         kfree_skb(skb);
347         LeaveFunction(10);
348         return NF_STOLEN;
349 }
350 #endif
351
352 static void
353 ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
354 {
355         struct nf_conn *ct = (struct nf_conn *)skb->nfct;
356         struct nf_conntrack_tuple new_tuple;
357
358         if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
359                 return;
360
361         /*
362          * The connection is not yet in the hashtable, so we update it.
363          * CIP->VIP will remain the same, so leave the tuple in
364          * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
365          * real-server we will see RIP->DIP.
366          */
367         new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
368         new_tuple.src.u3 = cp->daddr;
369         /*
370          * This will also take care of UDP and other protocols.
371          */
372         new_tuple.src.u.tcp.port = cp->dport;
373         nf_conntrack_alter_reply(ct, &new_tuple);
374 }
375
376 /*
377  *      NAT transmitter (only for outside-to-inside nat forwarding)
378  *      Not used for related ICMP
379  */
380 int
381 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
382                struct ip_vs_protocol *pp)
383 {
384         struct rtable *rt;              /* Route to the other host */
385         int mtu;
386         struct iphdr *iph = ip_hdr(skb);
387
388         EnterFunction(10);
389
390         /* check if it is a connection of no-client-port */
391         if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
392                 __be16 _pt, *p;
393                 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
394                 if (p == NULL)
395                         goto tx_error;
396                 ip_vs_conn_fill_cport(cp, *p);
397                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
398         }
399
400         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
401                 goto tx_error_icmp;
402
403         /* MTU checking */
404         mtu = dst_mtu(&rt->dst);
405         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
406                 ip_rt_put(rt);
407                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
408                 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
409                 goto tx_error;
410         }
411
412         /* copy-on-write the packet before mangling it */
413         if (!skb_make_writable(skb, sizeof(struct iphdr)))
414                 goto tx_error_put;
415
416         if (skb_cow(skb, rt->dst.dev->hard_header_len))
417                 goto tx_error_put;
418
419         /* drop old route */
420         skb_dst_drop(skb);
421         skb_dst_set(skb, &rt->dst);
422
423         /* mangle the packet */
424         if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
425                 goto tx_error;
426         ip_hdr(skb)->daddr = cp->daddr.ip;
427         ip_send_check(ip_hdr(skb));
428
429         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
430
431         ip_vs_update_conntrack(skb, cp);
432
433         /* FIXME: when application helper enlarges the packet and the length
434            is larger than the MTU of outgoing device, there will be still
435            MTU problem. */
436
437         /* Another hack: avoid icmp_send in ip_fragment */
438         skb->local_df = 1;
439
440         IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
441
442         LeaveFunction(10);
443         return NF_STOLEN;
444
445   tx_error_icmp:
446         dst_link_failure(skb);
447   tx_error:
448         LeaveFunction(10);
449         kfree_skb(skb);
450         return NF_STOLEN;
451   tx_error_put:
452         ip_rt_put(rt);
453         goto tx_error;
454 }
455
456 #ifdef CONFIG_IP_VS_IPV6
457 int
458 ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
459                   struct ip_vs_protocol *pp)
460 {
461         struct rt6_info *rt;            /* Route to the other host */
462         int mtu;
463
464         EnterFunction(10);
465
466         /* check if it is a connection of no-client-port */
467         if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
468                 __be16 _pt, *p;
469                 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
470                                        sizeof(_pt), &_pt);
471                 if (p == NULL)
472                         goto tx_error;
473                 ip_vs_conn_fill_cport(cp, *p);
474                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
475         }
476
477         rt = __ip_vs_get_out_rt_v6(cp);
478         if (!rt)
479                 goto tx_error_icmp;
480
481         /* MTU checking */
482         mtu = dst_mtu(&rt->dst);
483         if (skb->len > mtu) {
484                 dst_release(&rt->dst);
485                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
486                 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
487                                  "ip_vs_nat_xmit_v6(): frag needed for");
488                 goto tx_error;
489         }
490
491         /* copy-on-write the packet before mangling it */
492         if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
493                 goto tx_error_put;
494
495         if (skb_cow(skb, rt->dst.dev->hard_header_len))
496                 goto tx_error_put;
497
498         /* drop old route */
499         skb_dst_drop(skb);
500         skb_dst_set(skb, &rt->dst);
501
502         /* mangle the packet */
503         if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
504                 goto tx_error;
505         ipv6_hdr(skb)->daddr = cp->daddr.in6;
506
507         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
508
509         ip_vs_update_conntrack(skb, cp);
510
511         /* FIXME: when application helper enlarges the packet and the length
512            is larger than the MTU of outgoing device, there will be still
513            MTU problem. */
514
515         /* Another hack: avoid icmp_send in ip_fragment */
516         skb->local_df = 1;
517
518         IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
519
520         LeaveFunction(10);
521         return NF_STOLEN;
522
523 tx_error_icmp:
524         dst_link_failure(skb);
525 tx_error:
526         LeaveFunction(10);
527         kfree_skb(skb);
528         return NF_STOLEN;
529 tx_error_put:
530         dst_release(&rt->dst);
531         goto tx_error;
532 }
533 #endif
534
535
536 /*
537  *   IP Tunneling transmitter
538  *
539  *   This function encapsulates the packet in a new IP packet, its
540  *   destination will be set to cp->daddr. Most code of this function
541  *   is taken from ipip.c.
542  *
543  *   It is used in VS/TUN cluster. The load balancer selects a real
544  *   server from a cluster based on a scheduling algorithm,
545  *   encapsulates the request packet and forwards it to the selected
546  *   server. For example, all real servers are configured with
547  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
548  *   the encapsulated packet, it will decapsulate the packet, processe
549  *   the request and return the response packets directly to the client
550  *   without passing the load balancer. This can greatly increase the
551  *   scalability of virtual server.
552  *
553  *   Used for ANY protocol
554  */
555 int
556 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
557                   struct ip_vs_protocol *pp)
558 {
559         struct rtable *rt;                      /* Route to the other host */
560         struct net_device *tdev;                /* Device to other host */
561         struct iphdr  *old_iph = ip_hdr(skb);
562         u8     tos = old_iph->tos;
563         __be16 df = old_iph->frag_off;
564         sk_buff_data_t old_transport_header = skb->transport_header;
565         struct iphdr  *iph;                     /* Our new IP header */
566         unsigned int max_headroom;              /* The extra header space needed */
567         int    mtu;
568
569         EnterFunction(10);
570
571         if (skb->protocol != htons(ETH_P_IP)) {
572                 IP_VS_DBG_RL("%s(): protocol error, "
573                              "ETH_P_IP: %d, skb protocol: %d\n",
574                              __func__, htons(ETH_P_IP), skb->protocol);
575                 goto tx_error;
576         }
577
578         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
579                 goto tx_error_icmp;
580
581         tdev = rt->dst.dev;
582
583         mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
584         if (mtu < 68) {
585                 ip_rt_put(rt);
586                 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
587                 goto tx_error;
588         }
589         if (skb_dst(skb))
590                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
591
592         df |= (old_iph->frag_off & htons(IP_DF));
593
594         if ((old_iph->frag_off & htons(IP_DF))
595             && mtu < ntohs(old_iph->tot_len)) {
596                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
597                 ip_rt_put(rt);
598                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
599                 goto tx_error;
600         }
601
602         /*
603          * Okay, now see if we can stuff it in the buffer as-is.
604          */
605         max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
606
607         if (skb_headroom(skb) < max_headroom
608             || skb_cloned(skb) || skb_shared(skb)) {
609                 struct sk_buff *new_skb =
610                         skb_realloc_headroom(skb, max_headroom);
611                 if (!new_skb) {
612                         ip_rt_put(rt);
613                         kfree_skb(skb);
614                         IP_VS_ERR_RL("%s(): no memory\n", __func__);
615                         return NF_STOLEN;
616                 }
617                 kfree_skb(skb);
618                 skb = new_skb;
619                 old_iph = ip_hdr(skb);
620         }
621
622         skb->transport_header = old_transport_header;
623
624         /* fix old IP header checksum */
625         ip_send_check(old_iph);
626
627         skb_push(skb, sizeof(struct iphdr));
628         skb_reset_network_header(skb);
629         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
630
631         /* drop old route */
632         skb_dst_drop(skb);
633         skb_dst_set(skb, &rt->dst);
634
635         /*
636          *      Push down and install the IPIP header.
637          */
638         iph                     =       ip_hdr(skb);
639         iph->version            =       4;
640         iph->ihl                =       sizeof(struct iphdr)>>2;
641         iph->frag_off           =       df;
642         iph->protocol           =       IPPROTO_IPIP;
643         iph->tos                =       tos;
644         iph->daddr              =       rt->rt_dst;
645         iph->saddr              =       rt->rt_src;
646         iph->ttl                =       old_iph->ttl;
647         ip_select_ident(iph, &rt->dst, NULL);
648
649         /* Another hack: avoid icmp_send in ip_fragment */
650         skb->local_df = 1;
651
652         ip_local_out(skb);
653
654         LeaveFunction(10);
655
656         return NF_STOLEN;
657
658   tx_error_icmp:
659         dst_link_failure(skb);
660   tx_error:
661         kfree_skb(skb);
662         LeaveFunction(10);
663         return NF_STOLEN;
664 }
665
666 #ifdef CONFIG_IP_VS_IPV6
667 int
668 ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
669                      struct ip_vs_protocol *pp)
670 {
671         struct rt6_info *rt;            /* Route to the other host */
672         struct net_device *tdev;        /* Device to other host */
673         struct ipv6hdr  *old_iph = ipv6_hdr(skb);
674         sk_buff_data_t old_transport_header = skb->transport_header;
675         struct ipv6hdr  *iph;           /* Our new IP header */
676         unsigned int max_headroom;      /* The extra header space needed */
677         int    mtu;
678
679         EnterFunction(10);
680
681         if (skb->protocol != htons(ETH_P_IPV6)) {
682                 IP_VS_DBG_RL("%s(): protocol error, "
683                              "ETH_P_IPV6: %d, skb protocol: %d\n",
684                              __func__, htons(ETH_P_IPV6), skb->protocol);
685                 goto tx_error;
686         }
687
688         rt = __ip_vs_get_out_rt_v6(cp);
689         if (!rt)
690                 goto tx_error_icmp;
691
692         tdev = rt->dst.dev;
693
694         mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
695         /* TODO IPv6: do we need this check in IPv6? */
696         if (mtu < 1280) {
697                 dst_release(&rt->dst);
698                 IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
699                 goto tx_error;
700         }
701         if (skb_dst(skb))
702                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
703
704         if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
705                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
706                 dst_release(&rt->dst);
707                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
708                 goto tx_error;
709         }
710
711         /*
712          * Okay, now see if we can stuff it in the buffer as-is.
713          */
714         max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
715
716         if (skb_headroom(skb) < max_headroom
717             || skb_cloned(skb) || skb_shared(skb)) {
718                 struct sk_buff *new_skb =
719                         skb_realloc_headroom(skb, max_headroom);
720                 if (!new_skb) {
721                         dst_release(&rt->dst);
722                         kfree_skb(skb);
723                         IP_VS_ERR_RL("%s(): no memory\n", __func__);
724                         return NF_STOLEN;
725                 }
726                 kfree_skb(skb);
727                 skb = new_skb;
728                 old_iph = ipv6_hdr(skb);
729         }
730
731         skb->transport_header = old_transport_header;
732
733         skb_push(skb, sizeof(struct ipv6hdr));
734         skb_reset_network_header(skb);
735         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
736
737         /* drop old route */
738         skb_dst_drop(skb);
739         skb_dst_set(skb, &rt->dst);
740
741         /*
742          *      Push down and install the IPIP header.
743          */
744         iph                     =       ipv6_hdr(skb);
745         iph->version            =       6;
746         iph->nexthdr            =       IPPROTO_IPV6;
747         iph->payload_len        =       old_iph->payload_len;
748         be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
749         iph->priority           =       old_iph->priority;
750         memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
751         iph->daddr              =       rt->rt6i_dst.addr;
752         iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
753         iph->hop_limit          =       old_iph->hop_limit;
754
755         /* Another hack: avoid icmp_send in ip_fragment */
756         skb->local_df = 1;
757
758         ip6_local_out(skb);
759
760         LeaveFunction(10);
761
762         return NF_STOLEN;
763
764 tx_error_icmp:
765         dst_link_failure(skb);
766 tx_error:
767         kfree_skb(skb);
768         LeaveFunction(10);
769         return NF_STOLEN;
770 }
771 #endif
772
773
774 /*
775  *      Direct Routing transmitter
776  *      Used for ANY protocol
777  */
778 int
779 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
780               struct ip_vs_protocol *pp)
781 {
782         struct rtable *rt;                      /* Route to the other host */
783         struct iphdr  *iph = ip_hdr(skb);
784         int    mtu;
785
786         EnterFunction(10);
787
788         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
789                 goto tx_error_icmp;
790
791         /* MTU checking */
792         mtu = dst_mtu(&rt->dst);
793         if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
794                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
795                 ip_rt_put(rt);
796                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
797                 goto tx_error;
798         }
799
800         /*
801          * Call ip_send_check because we are not sure it is called
802          * after ip_defrag. Is copy-on-write needed?
803          */
804         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
805                 ip_rt_put(rt);
806                 return NF_STOLEN;
807         }
808         ip_send_check(ip_hdr(skb));
809
810         /* drop old route */
811         skb_dst_drop(skb);
812         skb_dst_set(skb, &rt->dst);
813
814         /* Another hack: avoid icmp_send in ip_fragment */
815         skb->local_df = 1;
816
817         IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
818
819         LeaveFunction(10);
820         return NF_STOLEN;
821
822   tx_error_icmp:
823         dst_link_failure(skb);
824   tx_error:
825         kfree_skb(skb);
826         LeaveFunction(10);
827         return NF_STOLEN;
828 }
829
830 #ifdef CONFIG_IP_VS_IPV6
831 int
832 ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
833                  struct ip_vs_protocol *pp)
834 {
835         struct rt6_info *rt;                    /* Route to the other host */
836         int    mtu;
837
838         EnterFunction(10);
839
840         rt = __ip_vs_get_out_rt_v6(cp);
841         if (!rt)
842                 goto tx_error_icmp;
843
844         /* MTU checking */
845         mtu = dst_mtu(&rt->dst);
846         if (skb->len > mtu) {
847                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
848                 dst_release(&rt->dst);
849                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
850                 goto tx_error;
851         }
852
853         /*
854          * Call ip_send_check because we are not sure it is called
855          * after ip_defrag. Is copy-on-write needed?
856          */
857         skb = skb_share_check(skb, GFP_ATOMIC);
858         if (unlikely(skb == NULL)) {
859                 dst_release(&rt->dst);
860                 return NF_STOLEN;
861         }
862
863         /* drop old route */
864         skb_dst_drop(skb);
865         skb_dst_set(skb, &rt->dst);
866
867         /* Another hack: avoid icmp_send in ip_fragment */
868         skb->local_df = 1;
869
870         IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
871
872         LeaveFunction(10);
873         return NF_STOLEN;
874
875 tx_error_icmp:
876         dst_link_failure(skb);
877 tx_error:
878         kfree_skb(skb);
879         LeaveFunction(10);
880         return NF_STOLEN;
881 }
882 #endif
883
884
885 /*
886  *      ICMP packet transmitter
887  *      called by the ip_vs_in_icmp
888  */
889 int
890 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
891                 struct ip_vs_protocol *pp, int offset)
892 {
893         struct rtable   *rt;    /* Route to the other host */
894         int mtu;
895         int rc;
896
897         EnterFunction(10);
898
899         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
900            forwarded directly here, because there is no need to
901            translate address/port back */
902         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
903                 if (cp->packet_xmit)
904                         rc = cp->packet_xmit(skb, cp, pp);
905                 else
906                         rc = NF_ACCEPT;
907                 /* do not touch skb anymore */
908                 atomic_inc(&cp->in_pkts);
909                 goto out;
910         }
911
912         /*
913          * mangle and send the packet here (only for VS/NAT)
914          */
915
916         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
917                 goto tx_error_icmp;
918
919         /* MTU checking */
920         mtu = dst_mtu(&rt->dst);
921         if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
922                 ip_rt_put(rt);
923                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
924                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
925                 goto tx_error;
926         }
927
928         /* copy-on-write the packet before mangling it */
929         if (!skb_make_writable(skb, offset))
930                 goto tx_error_put;
931
932         if (skb_cow(skb, rt->dst.dev->hard_header_len))
933                 goto tx_error_put;
934
935         /* drop the old route when skb is not shared */
936         skb_dst_drop(skb);
937         skb_dst_set(skb, &rt->dst);
938
939         ip_vs_nat_icmp(skb, pp, cp, 0);
940
941         /* Another hack: avoid icmp_send in ip_fragment */
942         skb->local_df = 1;
943
944         IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
945
946         rc = NF_STOLEN;
947         goto out;
948
949   tx_error_icmp:
950         dst_link_failure(skb);
951   tx_error:
952         dev_kfree_skb(skb);
953         rc = NF_STOLEN;
954   out:
955         LeaveFunction(10);
956         return rc;
957   tx_error_put:
958         ip_rt_put(rt);
959         goto tx_error;
960 }
961
962 #ifdef CONFIG_IP_VS_IPV6
963 int
964 ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
965                 struct ip_vs_protocol *pp, int offset)
966 {
967         struct rt6_info *rt;    /* Route to the other host */
968         int mtu;
969         int rc;
970
971         EnterFunction(10);
972
973         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
974            forwarded directly here, because there is no need to
975            translate address/port back */
976         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
977                 if (cp->packet_xmit)
978                         rc = cp->packet_xmit(skb, cp, pp);
979                 else
980                         rc = NF_ACCEPT;
981                 /* do not touch skb anymore */
982                 atomic_inc(&cp->in_pkts);
983                 goto out;
984         }
985
986         /*
987          * mangle and send the packet here (only for VS/NAT)
988          */
989
990         rt = __ip_vs_get_out_rt_v6(cp);
991         if (!rt)
992                 goto tx_error_icmp;
993
994         /* MTU checking */
995         mtu = dst_mtu(&rt->dst);
996         if (skb->len > mtu) {
997                 dst_release(&rt->dst);
998                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
999                 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1000                 goto tx_error;
1001         }
1002
1003         /* copy-on-write the packet before mangling it */
1004         if (!skb_make_writable(skb, offset))
1005                 goto tx_error_put;
1006
1007         if (skb_cow(skb, rt->dst.dev->hard_header_len))
1008                 goto tx_error_put;
1009
1010         /* drop the old route when skb is not shared */
1011         skb_dst_drop(skb);
1012         skb_dst_set(skb, &rt->dst);
1013
1014         ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1015
1016         /* Another hack: avoid icmp_send in ip_fragment */
1017         skb->local_df = 1;
1018
1019         IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
1020
1021         rc = NF_STOLEN;
1022         goto out;
1023
1024 tx_error_icmp:
1025         dst_link_failure(skb);
1026 tx_error:
1027         dev_kfree_skb(skb);
1028         rc = NF_STOLEN;
1029 out:
1030         LeaveFunction(10);
1031         return rc;
1032 tx_error_put:
1033         dst_release(&rt->dst);
1034         goto tx_error;
1035 }
1036 #endif