]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/netfilter/ipvs/ip_vs_core.c
Merge branch 'fix/asoc' into for-linus
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
53
54
55 EXPORT_SYMBOL(register_ip_vs_scheduler);
56 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
57 EXPORT_SYMBOL(ip_vs_proto_name);
58 EXPORT_SYMBOL(ip_vs_conn_new);
59 EXPORT_SYMBOL(ip_vs_conn_in_get);
60 EXPORT_SYMBOL(ip_vs_conn_out_get);
61 #ifdef CONFIG_IP_VS_PROTO_TCP
62 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
63 #endif
64 EXPORT_SYMBOL(ip_vs_conn_put);
65 #ifdef CONFIG_IP_VS_DEBUG
66 EXPORT_SYMBOL(ip_vs_get_debug_level);
67 #endif
68
69
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP or
 * ICMPv6 header.  The macro argument is parenthesized in both variants
 * so that any pointer expression (e.g. "&hdr" or "p + 1") expands
 * correctly.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
73
/*
 * Return a printable name for an IP protocol number.
 *
 * Known protocols map to string literals.  Any other value is formatted
 * as "IP_<number>" into a function-local static buffer, so that result
 * is overwritten by the next call that takes the default branch and must
 * not be kept across calls.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned, so use %u; bound the write to buf */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
98
99 void ip_vs_init_hash_table(struct list_head *table, int rows)
100 {
101         while (--rows >= 0)
102                 INIT_LIST_HEAD(&table[rows]);
103 }
104
105 static inline void
106 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
107 {
108         struct ip_vs_dest *dest = cp->dest;
109         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
110                 spin_lock(&dest->stats.lock);
111                 dest->stats.ustats.inpkts++;
112                 dest->stats.ustats.inbytes += skb->len;
113                 spin_unlock(&dest->stats.lock);
114
115                 spin_lock(&dest->svc->stats.lock);
116                 dest->svc->stats.ustats.inpkts++;
117                 dest->svc->stats.ustats.inbytes += skb->len;
118                 spin_unlock(&dest->svc->stats.lock);
119
120                 spin_lock(&ip_vs_stats.lock);
121                 ip_vs_stats.ustats.inpkts++;
122                 ip_vs_stats.ustats.inbytes += skb->len;
123                 spin_unlock(&ip_vs_stats.lock);
124         }
125 }
126
127
128 static inline void
129 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
130 {
131         struct ip_vs_dest *dest = cp->dest;
132         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
133                 spin_lock(&dest->stats.lock);
134                 dest->stats.ustats.outpkts++;
135                 dest->stats.ustats.outbytes += skb->len;
136                 spin_unlock(&dest->stats.lock);
137
138                 spin_lock(&dest->svc->stats.lock);
139                 dest->svc->stats.ustats.outpkts++;
140                 dest->svc->stats.ustats.outbytes += skb->len;
141                 spin_unlock(&dest->svc->stats.lock);
142
143                 spin_lock(&ip_vs_stats.lock);
144                 ip_vs_stats.ustats.outpkts++;
145                 ip_vs_stats.ustats.outbytes += skb->len;
146                 spin_unlock(&ip_vs_stats.lock);
147         }
148 }
149
150
151 static inline void
152 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
153 {
154         spin_lock(&cp->dest->stats.lock);
155         cp->dest->stats.ustats.conns++;
156         spin_unlock(&cp->dest->stats.lock);
157
158         spin_lock(&svc->stats.lock);
159         svc->stats.ustats.conns++;
160         spin_unlock(&svc->stats.lock);
161
162         spin_lock(&ip_vs_stats.lock);
163         ip_vs_stats.ustats.conns++;
164         spin_unlock(&ip_vs_stats.lock);
165 }
166
167
168 static inline int
169 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
170                 const struct sk_buff *skb,
171                 struct ip_vs_protocol *pp)
172 {
173         if (unlikely(!pp->state_transition))
174                 return 0;
175         return pp->state_transition(cp, direction, skb, pp);
176 }
177
178
179 /*
180  *  IPVS persistent scheduling function
181  *  It creates a connection entry according to its template if exists,
182  *  or selects a server and creates a connection entry plus a template.
183  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
184  *  Protocols supported: TCP, UDP
185  */
186 static struct ip_vs_conn *
187 ip_vs_sched_persist(struct ip_vs_service *svc,
188                     const struct sk_buff *skb,
189                     __be16 ports[2])
190 {
191         struct ip_vs_conn *cp = NULL;
192         struct ip_vs_iphdr iph;
193         struct ip_vs_dest *dest;
194         struct ip_vs_conn *ct;
195         __be16  dport;                  /* destination port to forward */
196         __be16  flags;
197         union nf_inet_addr snet;        /* source network of the client,
198                                            after masking */
199
200         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
201
202         /* Mask saddr with the netmask to adjust template granularity */
203 #ifdef CONFIG_IP_VS_IPV6
204         if (svc->af == AF_INET6)
205                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
206         else
207 #endif
208                 snet.ip = iph.saddr.ip & svc->netmask;
209
210         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
211                       "mnet %s\n",
212                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
213                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
214                       IP_VS_DBG_ADDR(svc->af, &snet));
215
216         /*
217          * As far as we know, FTP is a very complicated network protocol, and
218          * it uses control connection and data connections. For active FTP,
219          * FTP server initialize data connection to the client, its source port
220          * is often 20. For passive FTP, FTP server tells the clients the port
221          * that it passively listens to,  and the client issues the data
222          * connection. In the tunneling or direct routing mode, the load
223          * balancer is on the client-to-server half of connection, the port
224          * number is unknown to the load balancer. So, a conn template like
225          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
226          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
227          * is created for other persistent services.
228          */
229         if (ports[1] == svc->port) {
230                 /* Check if a template already exists */
231                 if (svc->port != FTPPORT)
232                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
233                                              &iph.daddr, ports[1]);
234                 else
235                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
236                                              &iph.daddr, 0);
237
238                 if (!ct || !ip_vs_check_template(ct)) {
239                         /*
240                          * No template found or the dest of the connection
241                          * template is not available.
242                          */
243                         dest = svc->scheduler->schedule(svc, skb);
244                         if (dest == NULL) {
245                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
246                                 return NULL;
247                         }
248
249                         /*
250                          * Create a template like <protocol,caddr,0,
251                          * vaddr,vport,daddr,dport> for non-ftp service,
252                          * and <protocol,caddr,0,vaddr,0,daddr,0>
253                          * for ftp service.
254                          */
255                         if (svc->port != FTPPORT)
256                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
257                                                     &snet, 0,
258                                                     &iph.daddr,
259                                                     ports[1],
260                                                     &dest->addr, dest->port,
261                                                     IP_VS_CONN_F_TEMPLATE,
262                                                     dest);
263                         else
264                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
265                                                     &snet, 0,
266                                                     &iph.daddr, 0,
267                                                     &dest->addr, 0,
268                                                     IP_VS_CONN_F_TEMPLATE,
269                                                     dest);
270                         if (ct == NULL)
271                                 return NULL;
272
273                         ct->timeout = svc->timeout;
274                 } else {
275                         /* set destination with the found template */
276                         dest = ct->dest;
277                 }
278                 dport = dest->port;
279         } else {
280                 /*
281                  * Note: persistent fwmark-based services and persistent
282                  * port zero service are handled here.
283                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
284                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
285                  */
286                 if (svc->fwmark) {
287                         union nf_inet_addr fwmark = {
288                                 .ip = htonl(svc->fwmark)
289                         };
290
291                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
292                                              &fwmark, 0);
293                 } else
294                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
295                                              &iph.daddr, 0);
296
297                 if (!ct || !ip_vs_check_template(ct)) {
298                         /*
299                          * If it is not persistent port zero, return NULL,
300                          * otherwise create a connection template.
301                          */
302                         if (svc->port)
303                                 return NULL;
304
305                         dest = svc->scheduler->schedule(svc, skb);
306                         if (dest == NULL) {
307                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
308                                 return NULL;
309                         }
310
311                         /*
312                          * Create a template according to the service
313                          */
314                         if (svc->fwmark) {
315                                 union nf_inet_addr fwmark = {
316                                         .ip = htonl(svc->fwmark)
317                                 };
318
319                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
320                                                     &snet, 0,
321                                                     &fwmark, 0,
322                                                     &dest->addr, 0,
323                                                     IP_VS_CONN_F_TEMPLATE,
324                                                     dest);
325                         } else
326                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
327                                                     &snet, 0,
328                                                     &iph.daddr, 0,
329                                                     &dest->addr, 0,
330                                                     IP_VS_CONN_F_TEMPLATE,
331                                                     dest);
332                         if (ct == NULL)
333                                 return NULL;
334
335                         ct->timeout = svc->timeout;
336                 } else {
337                         /* set destination with the found template */
338                         dest = ct->dest;
339                 }
340                 dport = ports[1];
341         }
342
343         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
344                  && iph.protocol == IPPROTO_UDP)?
345                 IP_VS_CONN_F_ONE_PACKET : 0;
346
347         /*
348          *    Create a new connection according to the template
349          */
350         cp = ip_vs_conn_new(svc->af, iph.protocol,
351                             &iph.saddr, ports[0],
352                             &iph.daddr, ports[1],
353                             &dest->addr, dport,
354                             flags,
355                             dest);
356         if (cp == NULL) {
357                 ip_vs_conn_put(ct);
358                 return NULL;
359         }
360
361         /*
362          *    Add its control
363          */
364         ip_vs_control_add(cp, ct);
365         ip_vs_conn_put(ct);
366
367         ip_vs_conn_stats(cp, svc);
368         return cp;
369 }
370
371
372 /*
373  *  IPVS main scheduling function
374  *  It selects a server according to the virtual service, and
375  *  creates a connection entry.
376  *  Protocols supported: TCP, UDP
377  */
378 struct ip_vs_conn *
379 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
380 {
381         struct ip_vs_conn *cp = NULL;
382         struct ip_vs_iphdr iph;
383         struct ip_vs_dest *dest;
384         __be16 _ports[2], *pptr, flags;
385
386         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
387         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
388         if (pptr == NULL)
389                 return NULL;
390
391         /*
392          *    Persistent service
393          */
394         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
395                 return ip_vs_sched_persist(svc, skb, pptr);
396
397         /*
398          *    Non-persistent service
399          */
400         if (!svc->fwmark && pptr[1] != svc->port) {
401                 if (!svc->port)
402                         pr_err("Schedule: port zero only supported "
403                                "in persistent services, "
404                                "check your ipvs configuration\n");
405                 return NULL;
406         }
407
408         dest = svc->scheduler->schedule(svc, skb);
409         if (dest == NULL) {
410                 IP_VS_DBG(1, "Schedule: no dest found.\n");
411                 return NULL;
412         }
413
414         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
415                  && iph.protocol == IPPROTO_UDP)?
416                 IP_VS_CONN_F_ONE_PACKET : 0;
417
418         /*
419          *    Create a connection entry.
420          */
421         cp = ip_vs_conn_new(svc->af, iph.protocol,
422                             &iph.saddr, pptr[0],
423                             &iph.daddr, pptr[1],
424                             &dest->addr, dest->port ? dest->port : pptr[1],
425                             flags,
426                             dest);
427         if (cp == NULL)
428                 return NULL;
429
430         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
431                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
432                       ip_vs_fwd_tag(cp),
433                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
434                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
435                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
436                       cp->flags, atomic_read(&cp->refcnt));
437
438         ip_vs_conn_stats(cp, svc);
439         return cp;
440 }
441
442
443 /*
444  *  Pass or drop the packet.
445  *  Called by ip_vs_in, when the virtual service is available but
446  *  no destination is available for a new connection.
447  */
448 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
449                 struct ip_vs_protocol *pp)
450 {
451         __be16 _ports[2], *pptr;
452         struct ip_vs_iphdr iph;
453         int unicast;
454         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
455
456         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
457         if (pptr == NULL) {
458                 ip_vs_service_put(svc);
459                 return NF_DROP;
460         }
461
462 #ifdef CONFIG_IP_VS_IPV6
463         if (svc->af == AF_INET6)
464                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
465         else
466 #endif
467                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
468
469         /* if it is fwmark-based service, the cache_bypass sysctl is up
470            and the destination is a non-local unicast, then create
471            a cache_bypass connection entry */
472         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
473                 int ret, cs;
474                 struct ip_vs_conn *cp;
475                 __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
476                                 iph.protocol == IPPROTO_UDP)?
477                                 IP_VS_CONN_F_ONE_PACKET : 0;
478                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
479
480                 ip_vs_service_put(svc);
481
482                 /* create a new connection entry */
483                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
484                 cp = ip_vs_conn_new(svc->af, iph.protocol,
485                                     &iph.saddr, pptr[0],
486                                     &iph.daddr, pptr[1],
487                                     &daddr, 0,
488                                     IP_VS_CONN_F_BYPASS | flags,
489                                     NULL);
490                 if (cp == NULL)
491                         return NF_DROP;
492
493                 /* statistics */
494                 ip_vs_in_stats(cp, skb);
495
496                 /* set state */
497                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
498
499                 /* transmit the first SYN packet */
500                 ret = cp->packet_xmit(skb, cp, pp);
501                 /* do not touch skb anymore */
502
503                 atomic_inc(&cp->in_pkts);
504                 ip_vs_conn_put(cp);
505                 return ret;
506         }
507
508         /*
509          * When the virtual ftp service is presented, packets destined
510          * for other services on the VIP may get here (except services
511          * listed in the ipvs table), pass the packets, because it is
512          * not ipvs job to decide to drop the packets.
513          */
514         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
515                 ip_vs_service_put(svc);
516                 return NF_ACCEPT;
517         }
518
519         ip_vs_service_put(svc);
520
521         /*
522          * Notify the client that the destination is unreachable, and
523          * release the socket buffer.
524          * Since it is in IP layer, the TCP socket is not actually
525          * created, the TCP RST packet cannot be sent, instead that
526          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
527          */
528 #ifdef CONFIG_IP_VS_IPV6
529         if (svc->af == AF_INET6)
530                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
531         else
532 #endif
533                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
534
535         return NF_DROP;
536 }
537
538 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
539 {
540         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
541 }
542
543 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
544 {
545         int err = ip_defrag(skb, user);
546
547         if (!err)
548                 ip_send_check(ip_hdr(skb));
549
550         return err;
551 }
552
553 #ifdef CONFIG_IP_VS_IPV6
554 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
555 {
556         /* TODO IPv6: Find out what to do here for IPv6 */
557         return 0;
558 }
559 #endif
560
561 /*
562  * Packet has been made sufficiently writable in caller
563  * - inout: 1=in->out, 0=out->in
564  */
565 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
566                     struct ip_vs_conn *cp, int inout)
567 {
568         struct iphdr *iph        = ip_hdr(skb);
569         unsigned int icmp_offset = iph->ihl*4;
570         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
571                                                       icmp_offset);
572         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
573
574         if (inout) {
575                 iph->saddr = cp->vaddr.ip;
576                 ip_send_check(iph);
577                 ciph->daddr = cp->vaddr.ip;
578                 ip_send_check(ciph);
579         } else {
580                 iph->daddr = cp->daddr.ip;
581                 ip_send_check(iph);
582                 ciph->saddr = cp->daddr.ip;
583                 ip_send_check(ciph);
584         }
585
586         /* the TCP/UDP/SCTP port */
587         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
588             IPPROTO_SCTP == ciph->protocol) {
589                 __be16 *ports = (void *)ciph + ciph->ihl*4;
590
591                 if (inout)
592                         ports[1] = cp->vport;
593                 else
594                         ports[0] = cp->dport;
595         }
596
597         /* And finally the ICMP checksum */
598         icmph->checksum = 0;
599         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
600         skb->ip_summed = CHECKSUM_UNNECESSARY;
601
602         if (inout)
603                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
604                         "Forwarding altered outgoing ICMP");
605         else
606                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
607                         "Forwarding altered incoming ICMP");
608 }
609
610 #ifdef CONFIG_IP_VS_IPV6
611 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
612                     struct ip_vs_conn *cp, int inout)
613 {
614         struct ipv6hdr *iph      = ipv6_hdr(skb);
615         unsigned int icmp_offset = sizeof(struct ipv6hdr);
616         struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
617                                                       icmp_offset);
618         struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
619
620         if (inout) {
621                 iph->saddr = cp->vaddr.in6;
622                 ciph->daddr = cp->vaddr.in6;
623         } else {
624                 iph->daddr = cp->daddr.in6;
625                 ciph->saddr = cp->daddr.in6;
626         }
627
628         /* the TCP/UDP/SCTP port */
629         if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
630             IPPROTO_SCTP == ciph->nexthdr) {
631                 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
632
633                 if (inout)
634                         ports[1] = cp->vport;
635                 else
636                         ports[0] = cp->dport;
637         }
638
639         /* And finally the ICMP checksum */
640         icmph->icmp6_cksum = 0;
641         /* TODO IPv6: is this correct for ICMPv6? */
642         ip_vs_checksum_complete(skb, icmp_offset);
643         skb->ip_summed = CHECKSUM_UNNECESSARY;
644
645         if (inout)
646                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
647                         "Forwarding altered outgoing ICMPv6");
648         else
649                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
650                         "Forwarding altered incoming ICMPv6");
651 }
652 #endif
653
654 /* Handle relevant response ICMP messages - forward to the right
655  * destination host. Used for NAT and local client.
656  */
657 static int handle_response_icmp(int af, struct sk_buff *skb,
658                                 union nf_inet_addr *snet,
659                                 __u8 protocol, struct ip_vs_conn *cp,
660                                 struct ip_vs_protocol *pp,
661                                 unsigned int offset, unsigned int ihl)
662 {
663         unsigned int verdict = NF_DROP;
664
665         if (IP_VS_FWD_METHOD(cp) != 0) {
666                 pr_err("shouldn't reach here, because the box is on the "
667                        "half connection in the tun/dr module.\n");
668         }
669
670         /* Ensure the checksum is correct */
671         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
672                 /* Failed checksum! */
673                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
674                               IP_VS_DBG_ADDR(af, snet));
675                 goto out;
676         }
677
678         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
679             IPPROTO_SCTP == protocol)
680                 offset += 2 * sizeof(__u16);
681         if (!skb_make_writable(skb, offset))
682                 goto out;
683
684 #ifdef CONFIG_IP_VS_IPV6
685         if (af == AF_INET6)
686                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687         else
688 #endif
689                 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691         /* do the statistics and put it back */
692         ip_vs_out_stats(cp, skb);
693
694         skb->ipvs_property = 1;
695         verdict = NF_ACCEPT;
696
697 out:
698         __ip_vs_conn_put(cp);
699
700         return verdict;
701 }
702
703 /*
704  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
705  *      Find any that might be relevant, check against existing connections.
706  *      Currently handles error types - unreachable, quench, ttl exceeded.
707  */
708 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709 {
710         struct iphdr *iph;
711         struct icmphdr  _icmph, *ic;
712         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
713         struct ip_vs_iphdr ciph;
714         struct ip_vs_conn *cp;
715         struct ip_vs_protocol *pp;
716         unsigned int offset, ihl;
717         union nf_inet_addr snet;
718
719         *related = 1;
720
721         /* reassemble IP fragments */
722         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724                         return NF_STOLEN;
725         }
726
727         iph = ip_hdr(skb);
728         offset = ihl = iph->ihl * 4;
729         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730         if (ic == NULL)
731                 return NF_DROP;
732
733         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
734                   ic->type, ntohs(icmp_id(ic)),
735                   &iph->saddr, &iph->daddr);
736
737         /*
738          * Work through seeing if this is for us.
739          * These checks are supposed to be in an order that means easy
740          * things are checked first to speed up processing.... however
741          * this means that some packets will manage to get a long way
742          * down this stack and then be rejected, but that's life.
743          */
744         if ((ic->type != ICMP_DEST_UNREACH) &&
745             (ic->type != ICMP_SOURCE_QUENCH) &&
746             (ic->type != ICMP_TIME_EXCEEDED)) {
747                 *related = 0;
748                 return NF_ACCEPT;
749         }
750
751         /* Now find the contained IP header */
752         offset += sizeof(_icmph);
753         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754         if (cih == NULL)
755                 return NF_ACCEPT; /* The packet looks wrong, ignore */
756
757         pp = ip_vs_proto_get(cih->protocol);
758         if (!pp)
759                 return NF_ACCEPT;
760
761         /* Is the embedded protocol header present? */
762         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763                      pp->dont_defrag))
764                 return NF_ACCEPT;
765
766         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767
768         offset += cih->ihl * 4;
769
770         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771         /* The embedded headers contain source and dest in reverse order */
772         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773         if (!cp)
774                 return NF_ACCEPT;
775
776         snet.ip = iph->saddr;
777         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778                                     pp, offset, ihl);
779 }
780
781 #ifdef CONFIG_IP_VS_IPV6
782 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783 {
784         struct ipv6hdr *iph;
785         struct icmp6hdr _icmph, *ic;
786         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
787                                            within the ICMP */
788         struct ip_vs_iphdr ciph;
789         struct ip_vs_conn *cp;
790         struct ip_vs_protocol *pp;
791         unsigned int offset;
792         union nf_inet_addr snet;
793
794         *related = 1;
795
796         /* reassemble IP fragments */
797         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799                         return NF_STOLEN;
800         }
801
802         iph = ipv6_hdr(skb);
803         offset = sizeof(struct ipv6hdr);
804         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805         if (ic == NULL)
806                 return NF_DROP;
807
808         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
809                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
810                   &iph->saddr, &iph->daddr);
811
812         /*
813          * Work through seeing if this is for us.
814          * These checks are supposed to be in an order that means easy
815          * things are checked first to speed up processing.... however
816          * this means that some packets will manage to get a long way
817          * down this stack and then be rejected, but that's life.
818          */
819         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822                 *related = 0;
823                 return NF_ACCEPT;
824         }
825
826         /* Now find the contained IP header */
827         offset += sizeof(_icmph);
828         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829         if (cih == NULL)
830                 return NF_ACCEPT; /* The packet looks wrong, ignore */
831
832         pp = ip_vs_proto_get(cih->nexthdr);
833         if (!pp)
834                 return NF_ACCEPT;
835
836         /* Is the embedded protocol header present? */
837         /* TODO: we don't support fragmentation at the moment anyways */
838         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839                 return NF_ACCEPT;
840
841         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842
843         offset += sizeof(struct ipv6hdr);
844
845         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846         /* The embedded headers contain source and dest in reverse order */
847         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848         if (!cp)
849                 return NF_ACCEPT;
850
851         ipv6_addr_copy(&snet.in6, &iph->saddr);
852         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853                                     pp, offset, sizeof(struct ipv6hdr));
854 }
855 #endif
856
857 /*
858  * Check if sctp chunc is ABORT chunk
859  */
860 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
861 {
862         sctp_chunkhdr_t *sch, schunk;
863         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
864                         sizeof(schunk), &schunk);
865         if (sch == NULL)
866                 return 0;
867         if (sch->type == SCTP_CID_ABORT)
868                 return 1;
869         return 0;
870 }
871
872 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
873 {
874         struct tcphdr _tcph, *th;
875
876         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
877         if (th == NULL)
878                 return 0;
879         return th->rst;
880 }
881
882 /* Handle response packets: rewrite addresses and send away...
883  * Used for NAT and local client.
884  */
885 static unsigned int
886 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
887                 struct ip_vs_conn *cp, int ihl)
888 {
889         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
890
891         if (!skb_make_writable(skb, ihl))
892                 goto drop;
893
894         /* mangle the packet */
895         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
896                 goto drop;
897
898 #ifdef CONFIG_IP_VS_IPV6
899         if (af == AF_INET6)
900                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
901         else
902 #endif
903         {
904                 ip_hdr(skb)->saddr = cp->vaddr.ip;
905                 ip_send_check(ip_hdr(skb));
906         }
907
908         /* For policy routing, packets originating from this
909          * machine itself may be routed differently to packets
910          * passing through.  We want this packet to be routed as
911          * if it came from this machine itself.  So re-compute
912          * the routing information.
913          */
914 #ifdef CONFIG_IP_VS_IPV6
915         if (af == AF_INET6) {
916                 if (ip6_route_me_harder(skb) != 0)
917                         goto drop;
918         } else
919 #endif
920                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
921                         goto drop;
922
923         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
924
925         ip_vs_out_stats(cp, skb);
926         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
927         ip_vs_update_conntrack(skb, cp, 0);
928         ip_vs_conn_put(cp);
929
930         skb->ipvs_property = 1;
931
932         LeaveFunction(11);
933         return NF_ACCEPT;
934
935 drop:
936         ip_vs_conn_put(cp);
937         kfree_skb(skb);
938         return NF_STOLEN;
939 }
940
941 /*
942  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
943  *      Check if outgoing packet belongs to the established ip_vs_conn.
944  */
945 static unsigned int
946 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
947           const struct net_device *in, const struct net_device *out,
948           int (*okfn)(struct sk_buff *))
949 {
950         struct ip_vs_iphdr iph;
951         struct ip_vs_protocol *pp;
952         struct ip_vs_conn *cp;
953         int af;
954
955         EnterFunction(11);
956
957         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
958
959         if (skb->ipvs_property)
960                 return NF_ACCEPT;
961
962         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
963 #ifdef CONFIG_IP_VS_IPV6
964         if (af == AF_INET6) {
965                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
966                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
967
968                         if (related)
969                                 return verdict;
970                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
971                 }
972         } else
973 #endif
974                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
975                         int related, verdict = ip_vs_out_icmp(skb, &related);
976
977                         if (related)
978                                 return verdict;
979                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980                 }
981
982         pp = ip_vs_proto_get(iph.protocol);
983         if (unlikely(!pp))
984                 return NF_ACCEPT;
985
986         /* reassemble IP fragments */
987 #ifdef CONFIG_IP_VS_IPV6
988         if (af == AF_INET6) {
989                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
990                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
991
992                         if (related)
993                                 return verdict;
994
995                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
996                 }
997         } else
998 #endif
999                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1000                              !pp->dont_defrag)) {
1001                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1002                                 return NF_STOLEN;
1003
1004                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1005                 }
1006
1007         /*
1008          * Check if the packet belongs to an existing entry
1009          */
1010         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1011
1012         if (unlikely(!cp)) {
1013                 if (sysctl_ip_vs_nat_icmp_send &&
1014                     (pp->protocol == IPPROTO_TCP ||
1015                      pp->protocol == IPPROTO_UDP ||
1016                      pp->protocol == IPPROTO_SCTP)) {
1017                         __be16 _ports[2], *pptr;
1018
1019                         pptr = skb_header_pointer(skb, iph.len,
1020                                                   sizeof(_ports), _ports);
1021                         if (pptr == NULL)
1022                                 return NF_ACCEPT;       /* Not for me */
1023                         if (ip_vs_lookup_real_service(af, iph.protocol,
1024                                                       &iph.saddr,
1025                                                       pptr[0])) {
1026                                 /*
1027                                  * Notify the real server: there is no
1028                                  * existing entry if it is not RST
1029                                  * packet or not TCP packet.
1030                                  */
1031                                 if ((iph.protocol != IPPROTO_TCP &&
1032                                      iph.protocol != IPPROTO_SCTP)
1033                                      || ((iph.protocol == IPPROTO_TCP
1034                                           && !is_tcp_reset(skb, iph.len))
1035                                          || (iph.protocol == IPPROTO_SCTP
1036                                                 && !is_sctp_abort(skb,
1037                                                         iph.len)))) {
1038 #ifdef CONFIG_IP_VS_IPV6
1039                                         if (af == AF_INET6)
1040                                                 icmpv6_send(skb,
1041                                                             ICMPV6_DEST_UNREACH,
1042                                                             ICMPV6_PORT_UNREACH,
1043                                                             0);
1044                                         else
1045 #endif
1046                                                 icmp_send(skb,
1047                                                           ICMP_DEST_UNREACH,
1048                                                           ICMP_PORT_UNREACH, 0);
1049                                         return NF_DROP;
1050                                 }
1051                         }
1052                 }
1053                 IP_VS_DBG_PKT(12, pp, skb, 0,
1054                               "packet continues traversal as normal");
1055                 return NF_ACCEPT;
1056         }
1057
1058         return handle_response(af, skb, pp, cp, iph.len);
1059 }
1060
1061
1062 /*
1063  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1064  *      Find any that might be relevant, check against existing connections,
1065  *      forward to the right destination host if relevant.
1066  *      Currently handles error types - unreachable, quench, ttl exceeded.
1067  */
1068 static int
1069 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1070 {
1071         struct iphdr *iph;
1072         struct icmphdr  _icmph, *ic;
1073         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1074         struct ip_vs_iphdr ciph;
1075         struct ip_vs_conn *cp;
1076         struct ip_vs_protocol *pp;
1077         unsigned int offset, ihl, verdict;
1078         union nf_inet_addr snet;
1079
1080         *related = 1;
1081
1082         /* reassemble IP fragments */
1083         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1084                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1085                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1086                         return NF_STOLEN;
1087         }
1088
1089         iph = ip_hdr(skb);
1090         offset = ihl = iph->ihl * 4;
1091         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1092         if (ic == NULL)
1093                 return NF_DROP;
1094
1095         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1096                   ic->type, ntohs(icmp_id(ic)),
1097                   &iph->saddr, &iph->daddr);
1098
1099         /*
1100          * Work through seeing if this is for us.
1101          * These checks are supposed to be in an order that means easy
1102          * things are checked first to speed up processing.... however
1103          * this means that some packets will manage to get a long way
1104          * down this stack and then be rejected, but that's life.
1105          */
1106         if ((ic->type != ICMP_DEST_UNREACH) &&
1107             (ic->type != ICMP_SOURCE_QUENCH) &&
1108             (ic->type != ICMP_TIME_EXCEEDED)) {
1109                 *related = 0;
1110                 return NF_ACCEPT;
1111         }
1112
1113         /* Now find the contained IP header */
1114         offset += sizeof(_icmph);
1115         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1116         if (cih == NULL)
1117                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1118
1119         pp = ip_vs_proto_get(cih->protocol);
1120         if (!pp)
1121                 return NF_ACCEPT;
1122
1123         /* Is the embedded protocol header present? */
1124         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1125                      pp->dont_defrag))
1126                 return NF_ACCEPT;
1127
1128         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1129
1130         offset += cih->ihl * 4;
1131
1132         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1133         /* The embedded headers contain source and dest in reverse order */
1134         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1135         if (!cp) {
1136                 /* The packet could also belong to a local client */
1137                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1138                 if (cp) {
1139                         snet.ip = iph->saddr;
1140                         return handle_response_icmp(AF_INET, skb, &snet,
1141                                                     cih->protocol, cp, pp,
1142                                                     offset, ihl);
1143                 }
1144                 return NF_ACCEPT;
1145         }
1146
1147         verdict = NF_DROP;
1148
1149         /* Ensure the checksum is correct */
1150         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1151                 /* Failed checksum! */
1152                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1153                           &iph->saddr);
1154                 goto out;
1155         }
1156
1157         /* do the statistics and put it back */
1158         ip_vs_in_stats(cp, skb);
1159         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1160                 offset += 2 * sizeof(__u16);
1161         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1162         /* do not touch skb anymore */
1163
1164   out:
1165         __ip_vs_conn_put(cp);
1166
1167         return verdict;
1168 }
1169
1170 #ifdef CONFIG_IP_VS_IPV6
1171 static int
1172 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1173 {
1174         struct ipv6hdr *iph;
1175         struct icmp6hdr _icmph, *ic;
1176         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1177                                            within the ICMP */
1178         struct ip_vs_iphdr ciph;
1179         struct ip_vs_conn *cp;
1180         struct ip_vs_protocol *pp;
1181         unsigned int offset, verdict;
1182         union nf_inet_addr snet;
1183
1184         *related = 1;
1185
1186         /* reassemble IP fragments */
1187         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1188                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1189                                                IP_DEFRAG_VS_IN :
1190                                                IP_DEFRAG_VS_FWD))
1191                         return NF_STOLEN;
1192         }
1193
1194         iph = ipv6_hdr(skb);
1195         offset = sizeof(struct ipv6hdr);
1196         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1197         if (ic == NULL)
1198                 return NF_DROP;
1199
1200         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1201                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1202                   &iph->saddr, &iph->daddr);
1203
1204         /*
1205          * Work through seeing if this is for us.
1206          * These checks are supposed to be in an order that means easy
1207          * things are checked first to speed up processing.... however
1208          * this means that some packets will manage to get a long way
1209          * down this stack and then be rejected, but that's life.
1210          */
1211         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1212             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1213             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1214                 *related = 0;
1215                 return NF_ACCEPT;
1216         }
1217
1218         /* Now find the contained IP header */
1219         offset += sizeof(_icmph);
1220         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1221         if (cih == NULL)
1222                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1223
1224         pp = ip_vs_proto_get(cih->nexthdr);
1225         if (!pp)
1226                 return NF_ACCEPT;
1227
1228         /* Is the embedded protocol header present? */
1229         /* TODO: we don't support fragmentation at the moment anyways */
1230         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1231                 return NF_ACCEPT;
1232
1233         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1234
1235         offset += sizeof(struct ipv6hdr);
1236
1237         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1238         /* The embedded headers contain source and dest in reverse order */
1239         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1240         if (!cp) {
1241                 /* The packet could also belong to a local client */
1242                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1243                 if (cp) {
1244                         ipv6_addr_copy(&snet.in6, &iph->saddr);
1245                         return handle_response_icmp(AF_INET6, skb, &snet,
1246                                                     cih->nexthdr,
1247                                                     cp, pp, offset,
1248                                                     sizeof(struct ipv6hdr));
1249                 }
1250                 return NF_ACCEPT;
1251         }
1252
1253         verdict = NF_DROP;
1254
1255         /* do the statistics and put it back */
1256         ip_vs_in_stats(cp, skb);
1257         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1258             IPPROTO_SCTP == cih->nexthdr)
1259                 offset += 2 * sizeof(__u16);
1260         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1261         /* do not touch skb anymore */
1262
1263         __ip_vs_conn_put(cp);
1264
1265         return verdict;
1266 }
1267 #endif
1268
1269
1270 /*
1271  *      Check if it's for virtual services, look it up,
1272  *      and send it on its way...
1273  */
1274 static unsigned int
1275 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1276          const struct net_device *in, const struct net_device *out,
1277          int (*okfn)(struct sk_buff *))
1278 {
1279         struct ip_vs_iphdr iph;
1280         struct ip_vs_protocol *pp;
1281         struct ip_vs_conn *cp;
1282         int ret, restart, af, pkts;
1283
1284         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1285
1286         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1287
1288         /*
1289          *      Big tappo: only PACKET_HOST, including loopback for local client
1290          *      Don't handle local packets on IPv6 for now
1291          */
1292         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1293                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1294                               skb->pkt_type,
1295                               iph.protocol,
1296                               IP_VS_DBG_ADDR(af, &iph.daddr));
1297                 return NF_ACCEPT;
1298         }
1299
1300 #ifdef CONFIG_IP_VS_IPV6
1301         if (af == AF_INET6) {
1302                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1303                         int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1304
1305                         if (related)
1306                                 return verdict;
1307                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1308                 }
1309         } else
1310 #endif
1311                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1312                         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1313
1314                         if (related)
1315                                 return verdict;
1316                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1317                 }
1318
1319         /* Protocol supported? */
1320         pp = ip_vs_proto_get(iph.protocol);
1321         if (unlikely(!pp))
1322                 return NF_ACCEPT;
1323
1324         /*
1325          * Check if the packet belongs to an existing connection entry
1326          */
1327         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1328
1329         if (unlikely(!cp)) {
1330                 int v;
1331
1332                 /* For local client packets, it could be a response */
1333                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1334                 if (cp)
1335                         return handle_response(af, skb, pp, cp, iph.len);
1336
1337                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1338                         return v;
1339         }
1340
1341         if (unlikely(!cp)) {
1342                 /* sorry, all this trouble for a no-hit :) */
1343                 IP_VS_DBG_PKT(12, pp, skb, 0,
1344                               "packet continues traversal as normal");
1345                 return NF_ACCEPT;
1346         }
1347
1348         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1349
1350         /* Check the server status */
1351         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1352                 /* the destination server is not available */
1353
1354                 if (sysctl_ip_vs_expire_nodest_conn) {
1355                         /* try to expire the connection immediately */
1356                         ip_vs_conn_expire_now(cp);
1357                 }
1358                 /* don't restart its timer, and silently
1359                    drop the packet. */
1360                 __ip_vs_conn_put(cp);
1361                 return NF_DROP;
1362         }
1363
1364         ip_vs_in_stats(cp, skb);
1365         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1366         if (cp->packet_xmit)
1367                 ret = cp->packet_xmit(skb, cp, pp);
1368                 /* do not touch skb anymore */
1369         else {
1370                 IP_VS_DBG_RL("warning: packet_xmit is null");
1371                 ret = NF_ACCEPT;
1372         }
1373
1374         /* Increase its packet counter and check if it is needed
1375          * to be synchronized
1376          *
1377          * Sync connection if it is about to close to
1378          * encorage the standby servers to update the connections timeout
1379          */
1380         pkts = atomic_add_return(1, &cp->in_pkts);
1381         if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1382             cp->protocol == IPPROTO_SCTP) {
1383                 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1384                         (atomic_read(&cp->in_pkts) %
1385                          sysctl_ip_vs_sync_threshold[1]
1386                          == sysctl_ip_vs_sync_threshold[0])) ||
1387                                 (cp->old_state != cp->state &&
1388                                  ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1389                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1390                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1391                         ip_vs_sync_conn(cp);
1392                         goto out;
1393                 }
1394         }
1395
1396         if (af == AF_INET &&
1397             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1398             (((cp->protocol != IPPROTO_TCP ||
1399                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1400               (pkts % sysctl_ip_vs_sync_threshold[1]
1401                == sysctl_ip_vs_sync_threshold[0])) ||
1402              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1403               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1404                (cp->state == IP_VS_TCP_S_CLOSE) ||
1405                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1406                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1407                 ip_vs_sync_conn(cp);
1408 out:
1409         cp->old_state = cp->state;
1410
1411         ip_vs_conn_put(cp);
1412         return ret;
1413 }
1414
1415
1416 /*
1417  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1418  *      related packets destined for 0.0.0.0/0.
1419  *      When fwmark-based virtual service is used, such as transparent
1420  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1421  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1422  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1423  *      and send them to ip_vs_in_icmp.
1424  */
1425 static unsigned int
1426 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1427                    const struct net_device *in, const struct net_device *out,
1428                    int (*okfn)(struct sk_buff *))
1429 {
1430         int r;
1431
1432         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1433                 return NF_ACCEPT;
1434
1435         return ip_vs_in_icmp(skb, &r, hooknum);
1436 }
1437
1438 #ifdef CONFIG_IP_VS_IPV6
1439 static unsigned int
1440 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1441                       const struct net_device *in, const struct net_device *out,
1442                       int (*okfn)(struct sk_buff *))
1443 {
1444         int r;
1445
1446         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1447                 return NF_ACCEPT;
1448
1449         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1450 }
1451 #endif
1452
1453
1454 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1455         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1456          * or VS/NAT(change destination), so that filtering rules can be
1457          * applied to IPVS. */
1458         {
1459                 .hook           = ip_vs_in,
1460                 .owner          = THIS_MODULE,
1461                 .pf             = PF_INET,
1462                 .hooknum        = NF_INET_LOCAL_IN,
1463                 .priority       = 100,
1464         },
1465         /* After packet filtering, change source only for VS/NAT */
1466         {
1467                 .hook           = ip_vs_out,
1468                 .owner          = THIS_MODULE,
1469                 .pf             = PF_INET,
1470                 .hooknum        = NF_INET_FORWARD,
1471                 .priority       = 100,
1472         },
1473         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1474          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1475         {
1476                 .hook           = ip_vs_forward_icmp,
1477                 .owner          = THIS_MODULE,
1478                 .pf             = PF_INET,
1479                 .hooknum        = NF_INET_FORWARD,
1480                 .priority       = 99,
1481         },
1482 #ifdef CONFIG_IP_VS_IPV6
1483         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1484          * or VS/NAT(change destination), so that filtering rules can be
1485          * applied to IPVS. */
1486         {
1487                 .hook           = ip_vs_in,
1488                 .owner          = THIS_MODULE,
1489                 .pf             = PF_INET6,
1490                 .hooknum        = NF_INET_LOCAL_IN,
1491                 .priority       = 100,
1492         },
1493         /* After packet filtering, change source only for VS/NAT */
1494         {
1495                 .hook           = ip_vs_out,
1496                 .owner          = THIS_MODULE,
1497                 .pf             = PF_INET6,
1498                 .hooknum        = NF_INET_FORWARD,
1499                 .priority       = 100,
1500         },
1501         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1502          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1503         {
1504                 .hook           = ip_vs_forward_icmp_v6,
1505                 .owner          = THIS_MODULE,
1506                 .pf             = PF_INET6,
1507                 .hooknum        = NF_INET_FORWARD,
1508                 .priority       = 99,
1509         },
1510 #endif
1511 };
1512
1513
1514 /*
1515  *      Initialize IP Virtual Server
1516  */
1517 static int __init ip_vs_init(void)
1518 {
1519         int ret;
1520
1521         ip_vs_estimator_init();
1522
1523         ret = ip_vs_control_init();
1524         if (ret < 0) {
1525                 pr_err("can't setup control.\n");
1526                 goto cleanup_estimator;
1527         }
1528
1529         ip_vs_protocol_init();
1530
1531         ret = ip_vs_app_init();
1532         if (ret < 0) {
1533                 pr_err("can't setup application helper.\n");
1534                 goto cleanup_protocol;
1535         }
1536
1537         ret = ip_vs_conn_init();
1538         if (ret < 0) {
1539                 pr_err("can't setup connection table.\n");
1540                 goto cleanup_app;
1541         }
1542
1543         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1544         if (ret < 0) {
1545                 pr_err("can't register hooks.\n");
1546                 goto cleanup_conn;
1547         }
1548
1549         pr_info("ipvs loaded.\n");
1550         return ret;
1551
1552   cleanup_conn:
1553         ip_vs_conn_cleanup();
1554   cleanup_app:
1555         ip_vs_app_cleanup();
1556   cleanup_protocol:
1557         ip_vs_protocol_cleanup();
1558         ip_vs_control_cleanup();
1559   cleanup_estimator:
1560         ip_vs_estimator_cleanup();
1561         return ret;
1562 }
1563
1564 static void __exit ip_vs_cleanup(void)
1565 {
1566         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1567         ip_vs_conn_cleanup();
1568         ip_vs_app_cleanup();
1569         ip_vs_protocol_cleanup();
1570         ip_vs_control_cleanup();
1571         ip_vs_estimator_cleanup();
1572         pr_info("ipvs unloaded.\n");
1573 }
1574
1575 module_init(ip_vs_init);
1576 module_exit(ip_vs_cleanup);
1577 MODULE_LICENSE("GPL");