]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/netfilter/ipvs/ip_vs_core.c
Merge branch 'msm-core' of git://codeaurora.org/quic/kernel/dwalker/linux-msm
[net-next-2.6.git] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
53
54
55 EXPORT_SYMBOL(register_ip_vs_scheduler);
56 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
57 EXPORT_SYMBOL(ip_vs_proto_name);
58 EXPORT_SYMBOL(ip_vs_conn_new);
59 EXPORT_SYMBOL(ip_vs_conn_in_get);
60 EXPORT_SYMBOL(ip_vs_conn_out_get);
61 #ifdef CONFIG_IP_VS_PROTO_TCP
62 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
63 #endif
64 EXPORT_SYMBOL(ip_vs_conn_put);
65 #ifdef CONFIG_IP_VS_DEBUG
66 EXPORT_SYMBOL(ip_vs_get_debug_level);
67 #endif
68
69
/*
 * ID used in ICMP lookups.
 * Both macros take a pointer to the ICMP/ICMPv6 header; the argument is
 * parenthesized so that pointer expressions such as (hdr + 1) expand
 * correctly.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
73
/*
 * Return a printable name for an IP protocol number.
 *
 * Well-known protocols map to string literals.  Any other value is
 * formatted as "IP_<number>" into a single static buffer, so the result
 * for an unknown protocol is overwritten by the next such call and must
 * be consumed (or copied) before then.
 */
const char *ip_vs_proto_name(unsigned proto)
{
        static char buf[20];

        switch (proto) {
        case IPPROTO_IP:
                return "IP";
        case IPPROTO_UDP:
                return "UDP";
        case IPPROTO_TCP:
                return "TCP";
        case IPPROTO_SCTP:
                return "SCTP";
        case IPPROTO_ICMP:
                return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
        case IPPROTO_ICMPV6:
                return "ICMPv6";
#endif
        default:
                /*
                 * proto is unsigned: print it with %u (not %d) so large
                 * values are not shown as negative numbers, and bound the
                 * write to the buffer size.
                 */
                snprintf(buf, sizeof(buf), "IP_%u", proto);
                return buf;
        }
}
98
99 void ip_vs_init_hash_table(struct list_head *table, int rows)
100 {
101         while (--rows >= 0)
102                 INIT_LIST_HEAD(&table[rows]);
103 }
104
105 static inline void
106 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
107 {
108         struct ip_vs_dest *dest = cp->dest;
109         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
110                 spin_lock(&dest->stats.lock);
111                 dest->stats.ustats.inpkts++;
112                 dest->stats.ustats.inbytes += skb->len;
113                 spin_unlock(&dest->stats.lock);
114
115                 spin_lock(&dest->svc->stats.lock);
116                 dest->svc->stats.ustats.inpkts++;
117                 dest->svc->stats.ustats.inbytes += skb->len;
118                 spin_unlock(&dest->svc->stats.lock);
119
120                 spin_lock(&ip_vs_stats.lock);
121                 ip_vs_stats.ustats.inpkts++;
122                 ip_vs_stats.ustats.inbytes += skb->len;
123                 spin_unlock(&ip_vs_stats.lock);
124         }
125 }
126
127
128 static inline void
129 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
130 {
131         struct ip_vs_dest *dest = cp->dest;
132         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
133                 spin_lock(&dest->stats.lock);
134                 dest->stats.ustats.outpkts++;
135                 dest->stats.ustats.outbytes += skb->len;
136                 spin_unlock(&dest->stats.lock);
137
138                 spin_lock(&dest->svc->stats.lock);
139                 dest->svc->stats.ustats.outpkts++;
140                 dest->svc->stats.ustats.outbytes += skb->len;
141                 spin_unlock(&dest->svc->stats.lock);
142
143                 spin_lock(&ip_vs_stats.lock);
144                 ip_vs_stats.ustats.outpkts++;
145                 ip_vs_stats.ustats.outbytes += skb->len;
146                 spin_unlock(&ip_vs_stats.lock);
147         }
148 }
149
150
151 static inline void
152 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
153 {
154         spin_lock(&cp->dest->stats.lock);
155         cp->dest->stats.ustats.conns++;
156         spin_unlock(&cp->dest->stats.lock);
157
158         spin_lock(&svc->stats.lock);
159         svc->stats.ustats.conns++;
160         spin_unlock(&svc->stats.lock);
161
162         spin_lock(&ip_vs_stats.lock);
163         ip_vs_stats.ustats.conns++;
164         spin_unlock(&ip_vs_stats.lock);
165 }
166
167
168 static inline int
169 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
170                 const struct sk_buff *skb,
171                 struct ip_vs_protocol *pp)
172 {
173         if (unlikely(!pp->state_transition))
174                 return 0;
175         return pp->state_transition(cp, direction, skb, pp);
176 }
177
178
179 /*
180  *  IPVS persistent scheduling function
181  *  It creates a connection entry according to its template if exists,
182  *  or selects a server and creates a connection entry plus a template.
183  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
184  *  Protocols supported: TCP, UDP
185  */
186 static struct ip_vs_conn *
187 ip_vs_sched_persist(struct ip_vs_service *svc,
188                     const struct sk_buff *skb,
189                     __be16 ports[2])
190 {
191         struct ip_vs_conn *cp = NULL;
192         struct ip_vs_iphdr iph;
193         struct ip_vs_dest *dest;
194         struct ip_vs_conn *ct;
195         __be16  dport;                  /* destination port to forward */
196         __be16  flags;
197         union nf_inet_addr snet;        /* source network of the client,
198                                            after masking */
199
200         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
201
202         /* Mask saddr with the netmask to adjust template granularity */
203 #ifdef CONFIG_IP_VS_IPV6
204         if (svc->af == AF_INET6)
205                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
206         else
207 #endif
208                 snet.ip = iph.saddr.ip & svc->netmask;
209
210         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
211                       "mnet %s\n",
212                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
213                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
214                       IP_VS_DBG_ADDR(svc->af, &snet));
215
216         /*
217          * As far as we know, FTP is a very complicated network protocol, and
218          * it uses control connection and data connections. For active FTP,
219          * FTP server initialize data connection to the client, its source port
220          * is often 20. For passive FTP, FTP server tells the clients the port
221          * that it passively listens to,  and the client issues the data
222          * connection. In the tunneling or direct routing mode, the load
223          * balancer is on the client-to-server half of connection, the port
224          * number is unknown to the load balancer. So, a conn template like
225          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
226          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
227          * is created for other persistent services.
228          */
229         if (ports[1] == svc->port) {
230                 /* Check if a template already exists */
231                 if (svc->port != FTPPORT)
232                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
233                                              &iph.daddr, ports[1]);
234                 else
235                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
236                                              &iph.daddr, 0);
237
238                 if (!ct || !ip_vs_check_template(ct)) {
239                         /*
240                          * No template found or the dest of the connection
241                          * template is not available.
242                          */
243                         dest = svc->scheduler->schedule(svc, skb);
244                         if (dest == NULL) {
245                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
246                                 return NULL;
247                         }
248
249                         /*
250                          * Create a template like <protocol,caddr,0,
251                          * vaddr,vport,daddr,dport> for non-ftp service,
252                          * and <protocol,caddr,0,vaddr,0,daddr,0>
253                          * for ftp service.
254                          */
255                         if (svc->port != FTPPORT)
256                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
257                                                     &snet, 0,
258                                                     &iph.daddr,
259                                                     ports[1],
260                                                     &dest->addr, dest->port,
261                                                     IP_VS_CONN_F_TEMPLATE,
262                                                     dest);
263                         else
264                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
265                                                     &snet, 0,
266                                                     &iph.daddr, 0,
267                                                     &dest->addr, 0,
268                                                     IP_VS_CONN_F_TEMPLATE,
269                                                     dest);
270                         if (ct == NULL)
271                                 return NULL;
272
273                         ct->timeout = svc->timeout;
274                 } else {
275                         /* set destination with the found template */
276                         dest = ct->dest;
277                 }
278                 dport = dest->port;
279         } else {
280                 /*
281                  * Note: persistent fwmark-based services and persistent
282                  * port zero service are handled here.
283                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
284                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
285                  */
286                 if (svc->fwmark) {
287                         union nf_inet_addr fwmark = {
288                                 .ip = htonl(svc->fwmark)
289                         };
290
291                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
292                                              &fwmark, 0);
293                 } else
294                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
295                                              &iph.daddr, 0);
296
297                 if (!ct || !ip_vs_check_template(ct)) {
298                         /*
299                          * If it is not persistent port zero, return NULL,
300                          * otherwise create a connection template.
301                          */
302                         if (svc->port)
303                                 return NULL;
304
305                         dest = svc->scheduler->schedule(svc, skb);
306                         if (dest == NULL) {
307                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
308                                 return NULL;
309                         }
310
311                         /*
312                          * Create a template according to the service
313                          */
314                         if (svc->fwmark) {
315                                 union nf_inet_addr fwmark = {
316                                         .ip = htonl(svc->fwmark)
317                                 };
318
319                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
320                                                     &snet, 0,
321                                                     &fwmark, 0,
322                                                     &dest->addr, 0,
323                                                     IP_VS_CONN_F_TEMPLATE,
324                                                     dest);
325                         } else
326                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
327                                                     &snet, 0,
328                                                     &iph.daddr, 0,
329                                                     &dest->addr, 0,
330                                                     IP_VS_CONN_F_TEMPLATE,
331                                                     dest);
332                         if (ct == NULL)
333                                 return NULL;
334
335                         ct->timeout = svc->timeout;
336                 } else {
337                         /* set destination with the found template */
338                         dest = ct->dest;
339                 }
340                 dport = ports[1];
341         }
342
343         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
344                  && iph.protocol == IPPROTO_UDP)?
345                 IP_VS_CONN_F_ONE_PACKET : 0;
346
347         /*
348          *    Create a new connection according to the template
349          */
350         cp = ip_vs_conn_new(svc->af, iph.protocol,
351                             &iph.saddr, ports[0],
352                             &iph.daddr, ports[1],
353                             &dest->addr, dport,
354                             flags,
355                             dest);
356         if (cp == NULL) {
357                 ip_vs_conn_put(ct);
358                 return NULL;
359         }
360
361         /*
362          *    Add its control
363          */
364         ip_vs_control_add(cp, ct);
365         ip_vs_conn_put(ct);
366
367         ip_vs_conn_stats(cp, svc);
368         return cp;
369 }
370
371
372 /*
373  *  IPVS main scheduling function
374  *  It selects a server according to the virtual service, and
375  *  creates a connection entry.
376  *  Protocols supported: TCP, UDP
377  */
378 struct ip_vs_conn *
379 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
380 {
381         struct ip_vs_conn *cp = NULL;
382         struct ip_vs_iphdr iph;
383         struct ip_vs_dest *dest;
384         __be16 _ports[2], *pptr, flags;
385
386         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
387         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
388         if (pptr == NULL)
389                 return NULL;
390
391         /*
392          *    Persistent service
393          */
394         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
395                 return ip_vs_sched_persist(svc, skb, pptr);
396
397         /*
398          *    Non-persistent service
399          */
400         if (!svc->fwmark && pptr[1] != svc->port) {
401                 if (!svc->port)
402                         pr_err("Schedule: port zero only supported "
403                                "in persistent services, "
404                                "check your ipvs configuration\n");
405                 return NULL;
406         }
407
408         dest = svc->scheduler->schedule(svc, skb);
409         if (dest == NULL) {
410                 IP_VS_DBG(1, "Schedule: no dest found.\n");
411                 return NULL;
412         }
413
414         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
415                  && iph.protocol == IPPROTO_UDP)?
416                 IP_VS_CONN_F_ONE_PACKET : 0;
417
418         /*
419          *    Create a connection entry.
420          */
421         cp = ip_vs_conn_new(svc->af, iph.protocol,
422                             &iph.saddr, pptr[0],
423                             &iph.daddr, pptr[1],
424                             &dest->addr, dest->port ? dest->port : pptr[1],
425                             flags,
426                             dest);
427         if (cp == NULL)
428                 return NULL;
429
430         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
431                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
432                       ip_vs_fwd_tag(cp),
433                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
434                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
435                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
436                       cp->flags, atomic_read(&cp->refcnt));
437
438         ip_vs_conn_stats(cp, svc);
439         return cp;
440 }
441
442
443 /*
444  *  Pass or drop the packet.
445  *  Called by ip_vs_in, when the virtual service is available but
446  *  no destination is available for a new connection.
447  */
448 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
449                 struct ip_vs_protocol *pp)
450 {
451         __be16 _ports[2], *pptr;
452         struct ip_vs_iphdr iph;
453         int unicast;
454         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
455
456         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
457         if (pptr == NULL) {
458                 ip_vs_service_put(svc);
459                 return NF_DROP;
460         }
461
462 #ifdef CONFIG_IP_VS_IPV6
463         if (svc->af == AF_INET6)
464                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
465         else
466 #endif
467                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
468
469         /* if it is fwmark-based service, the cache_bypass sysctl is up
470            and the destination is a non-local unicast, then create
471            a cache_bypass connection entry */
472         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
473                 int ret, cs;
474                 struct ip_vs_conn *cp;
475                 __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
476                                 iph.protocol == IPPROTO_UDP)?
477                                 IP_VS_CONN_F_ONE_PACKET : 0;
478                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
479
480                 ip_vs_service_put(svc);
481
482                 /* create a new connection entry */
483                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
484                 cp = ip_vs_conn_new(svc->af, iph.protocol,
485                                     &iph.saddr, pptr[0],
486                                     &iph.daddr, pptr[1],
487                                     &daddr, 0,
488                                     IP_VS_CONN_F_BYPASS | flags,
489                                     NULL);
490                 if (cp == NULL)
491                         return NF_DROP;
492
493                 /* statistics */
494                 ip_vs_in_stats(cp, skb);
495
496                 /* set state */
497                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
498
499                 /* transmit the first SYN packet */
500                 ret = cp->packet_xmit(skb, cp, pp);
501                 /* do not touch skb anymore */
502
503                 atomic_inc(&cp->in_pkts);
504                 ip_vs_conn_put(cp);
505                 return ret;
506         }
507
508         /*
509          * When the virtual ftp service is presented, packets destined
510          * for other services on the VIP may get here (except services
511          * listed in the ipvs table), pass the packets, because it is
512          * not ipvs job to decide to drop the packets.
513          */
514         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
515                 ip_vs_service_put(svc);
516                 return NF_ACCEPT;
517         }
518
519         ip_vs_service_put(svc);
520
521         /*
522          * Notify the client that the destination is unreachable, and
523          * release the socket buffer.
524          * Since it is in IP layer, the TCP socket is not actually
525          * created, the TCP RST packet cannot be sent, instead that
526          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
527          */
528 #ifdef CONFIG_IP_VS_IPV6
529         if (svc->af == AF_INET6)
530                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
531         else
532 #endif
533                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
534
535         return NF_DROP;
536 }
537
538 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
539 {
540         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
541 }
542
543 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
544 {
545         int err = ip_defrag(skb, user);
546
547         if (!err)
548                 ip_send_check(ip_hdr(skb));
549
550         return err;
551 }
552
553 #ifdef CONFIG_IP_VS_IPV6
554 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
555 {
556         /* TODO IPv6: Find out what to do here for IPv6 */
557         return 0;
558 }
559 #endif
560
561 /*
562  * Packet has been made sufficiently writable in caller
563  * - inout: 1=in->out, 0=out->in
564  */
565 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
566                     struct ip_vs_conn *cp, int inout)
567 {
568         struct iphdr *iph        = ip_hdr(skb);
569         unsigned int icmp_offset = iph->ihl*4;
570         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
571                                                       icmp_offset);
572         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
573
574         if (inout) {
575                 iph->saddr = cp->vaddr.ip;
576                 ip_send_check(iph);
577                 ciph->daddr = cp->vaddr.ip;
578                 ip_send_check(ciph);
579         } else {
580                 iph->daddr = cp->daddr.ip;
581                 ip_send_check(iph);
582                 ciph->saddr = cp->daddr.ip;
583                 ip_send_check(ciph);
584         }
585
586         /* the TCP/UDP/SCTP port */
587         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
588             IPPROTO_SCTP == ciph->protocol) {
589                 __be16 *ports = (void *)ciph + ciph->ihl*4;
590
591                 if (inout)
592                         ports[1] = cp->vport;
593                 else
594                         ports[0] = cp->dport;
595         }
596
597         /* And finally the ICMP checksum */
598         icmph->checksum = 0;
599         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
600         skb->ip_summed = CHECKSUM_UNNECESSARY;
601
602         if (inout)
603                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
604                         "Forwarding altered outgoing ICMP");
605         else
606                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
607                         "Forwarding altered incoming ICMP");
608 }
609
610 #ifdef CONFIG_IP_VS_IPV6
611 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
612                     struct ip_vs_conn *cp, int inout)
613 {
614         struct ipv6hdr *iph      = ipv6_hdr(skb);
615         unsigned int icmp_offset = sizeof(struct ipv6hdr);
616         struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
617                                                       icmp_offset);
618         struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
619
620         if (inout) {
621                 iph->saddr = cp->vaddr.in6;
622                 ciph->daddr = cp->vaddr.in6;
623         } else {
624                 iph->daddr = cp->daddr.in6;
625                 ciph->saddr = cp->daddr.in6;
626         }
627
628         /* the TCP/UDP/SCTP port */
629         if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
630             IPPROTO_SCTP == ciph->nexthdr) {
631                 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
632
633                 if (inout)
634                         ports[1] = cp->vport;
635                 else
636                         ports[0] = cp->dport;
637         }
638
639         /* And finally the ICMP checksum */
640         icmph->icmp6_cksum = 0;
641         /* TODO IPv6: is this correct for ICMPv6? */
642         ip_vs_checksum_complete(skb, icmp_offset);
643         skb->ip_summed = CHECKSUM_UNNECESSARY;
644
645         if (inout)
646                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
647                         "Forwarding altered outgoing ICMPv6");
648         else
649                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
650                         "Forwarding altered incoming ICMPv6");
651 }
652 #endif
653
654 /* Handle relevant response ICMP messages - forward to the right
655  * destination host. Used for NAT and local client.
656  */
657 static int handle_response_icmp(int af, struct sk_buff *skb,
658                                 union nf_inet_addr *snet,
659                                 __u8 protocol, struct ip_vs_conn *cp,
660                                 struct ip_vs_protocol *pp,
661                                 unsigned int offset, unsigned int ihl)
662 {
663         unsigned int verdict = NF_DROP;
664
665         if (IP_VS_FWD_METHOD(cp) != 0) {
666                 pr_err("shouldn't reach here, because the box is on the "
667                        "half connection in the tun/dr module.\n");
668         }
669
670         /* Ensure the checksum is correct */
671         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
672                 /* Failed checksum! */
673                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
674                               IP_VS_DBG_ADDR(af, snet));
675                 goto out;
676         }
677
678         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
679             IPPROTO_SCTP == protocol)
680                 offset += 2 * sizeof(__u16);
681         if (!skb_make_writable(skb, offset))
682                 goto out;
683
684 #ifdef CONFIG_IP_VS_IPV6
685         if (af == AF_INET6)
686                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687         else
688 #endif
689                 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691         /* do the statistics and put it back */
692         ip_vs_out_stats(cp, skb);
693
694         skb->ipvs_property = 1;
695         verdict = NF_ACCEPT;
696
697 out:
698         __ip_vs_conn_put(cp);
699
700         return verdict;
701 }
702
703 /*
704  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
705  *      Find any that might be relevant, check against existing connections.
706  *      Currently handles error types - unreachable, quench, ttl exceeded.
707  */
708 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709 {
710         struct iphdr *iph;
711         struct icmphdr  _icmph, *ic;
712         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
713         struct ip_vs_iphdr ciph;
714         struct ip_vs_conn *cp;
715         struct ip_vs_protocol *pp;
716         unsigned int offset, ihl;
717         union nf_inet_addr snet;
718
719         *related = 1;
720
721         /* reassemble IP fragments */
722         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724                         return NF_STOLEN;
725         }
726
727         iph = ip_hdr(skb);
728         offset = ihl = iph->ihl * 4;
729         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730         if (ic == NULL)
731                 return NF_DROP;
732
733         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
734                   ic->type, ntohs(icmp_id(ic)),
735                   &iph->saddr, &iph->daddr);
736
737         /*
738          * Work through seeing if this is for us.
739          * These checks are supposed to be in an order that means easy
740          * things are checked first to speed up processing.... however
741          * this means that some packets will manage to get a long way
742          * down this stack and then be rejected, but that's life.
743          */
744         if ((ic->type != ICMP_DEST_UNREACH) &&
745             (ic->type != ICMP_SOURCE_QUENCH) &&
746             (ic->type != ICMP_TIME_EXCEEDED)) {
747                 *related = 0;
748                 return NF_ACCEPT;
749         }
750
751         /* Now find the contained IP header */
752         offset += sizeof(_icmph);
753         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754         if (cih == NULL)
755                 return NF_ACCEPT; /* The packet looks wrong, ignore */
756
757         pp = ip_vs_proto_get(cih->protocol);
758         if (!pp)
759                 return NF_ACCEPT;
760
761         /* Is the embedded protocol header present? */
762         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763                      pp->dont_defrag))
764                 return NF_ACCEPT;
765
766         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767
768         offset += cih->ihl * 4;
769
770         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771         /* The embedded headers contain source and dest in reverse order */
772         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773         if (!cp)
774                 return NF_ACCEPT;
775
776         snet.ip = iph->saddr;
777         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778                                     pp, offset, ihl);
779 }
780
781 #ifdef CONFIG_IP_VS_IPV6
782 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783 {
784         struct ipv6hdr *iph;
785         struct icmp6hdr _icmph, *ic;
786         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
787                                            within the ICMP */
788         struct ip_vs_iphdr ciph;
789         struct ip_vs_conn *cp;
790         struct ip_vs_protocol *pp;
791         unsigned int offset;
792         union nf_inet_addr snet;
793
794         *related = 1;
795
796         /* reassemble IP fragments */
797         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799                         return NF_STOLEN;
800         }
801
802         iph = ipv6_hdr(skb);
803         offset = sizeof(struct ipv6hdr);
804         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805         if (ic == NULL)
806                 return NF_DROP;
807
808         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
809                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
810                   &iph->saddr, &iph->daddr);
811
812         /*
813          * Work through seeing if this is for us.
814          * These checks are supposed to be in an order that means easy
815          * things are checked first to speed up processing.... however
816          * this means that some packets will manage to get a long way
817          * down this stack and then be rejected, but that's life.
818          */
819         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822                 *related = 0;
823                 return NF_ACCEPT;
824         }
825
826         /* Now find the contained IP header */
827         offset += sizeof(_icmph);
828         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829         if (cih == NULL)
830                 return NF_ACCEPT; /* The packet looks wrong, ignore */
831
832         pp = ip_vs_proto_get(cih->nexthdr);
833         if (!pp)
834                 return NF_ACCEPT;
835
836         /* Is the embedded protocol header present? */
837         /* TODO: we don't support fragmentation at the moment anyways */
838         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839                 return NF_ACCEPT;
840
841         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842
843         offset += sizeof(struct ipv6hdr);
844
845         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846         /* The embedded headers contain source and dest in reverse order */
847         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848         if (!cp)
849                 return NF_ACCEPT;
850
851         ipv6_addr_copy(&snet.in6, &iph->saddr);
852         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853                                     pp, offset, sizeof(struct ipv6hdr));
854 }
855 #endif
856
857 /*
858  * Check if sctp chunc is ABORT chunk
859  */
860 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
861 {
862         sctp_chunkhdr_t *sch, schunk;
863         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
864                         sizeof(schunk), &schunk);
865         if (sch == NULL)
866                 return 0;
867         if (sch->type == SCTP_CID_ABORT)
868                 return 1;
869         return 0;
870 }
871
872 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
873 {
874         struct tcphdr _tcph, *th;
875
876         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
877         if (th == NULL)
878                 return 0;
879         return th->rst;
880 }
881
882 /* Handle response packets: rewrite addresses and send away...
883  * Used for NAT and local client.
884  */
885 static unsigned int
886 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
887                 struct ip_vs_conn *cp, int ihl)
888 {
889         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
890
891         if (!skb_make_writable(skb, ihl))
892                 goto drop;
893
894         /* mangle the packet */
895         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
896                 goto drop;
897
898 #ifdef CONFIG_IP_VS_IPV6
899         if (af == AF_INET6)
900                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
901         else
902 #endif
903         {
904                 ip_hdr(skb)->saddr = cp->vaddr.ip;
905                 ip_send_check(ip_hdr(skb));
906         }
907
908         /* For policy routing, packets originating from this
909          * machine itself may be routed differently to packets
910          * passing through.  We want this packet to be routed as
911          * if it came from this machine itself.  So re-compute
912          * the routing information.
913          */
914 #ifdef CONFIG_IP_VS_IPV6
915         if (af == AF_INET6) {
916                 if (ip6_route_me_harder(skb) != 0)
917                         goto drop;
918         } else
919 #endif
920                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
921                         goto drop;
922
923         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
924
925         ip_vs_out_stats(cp, skb);
926         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
927         ip_vs_conn_put(cp);
928
929         skb->ipvs_property = 1;
930
931         LeaveFunction(11);
932         return NF_ACCEPT;
933
934 drop:
935         ip_vs_conn_put(cp);
936         kfree_skb(skb);
937         return NF_STOLEN;
938 }
939
940 /*
941  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
942  *      Check if outgoing packet belongs to the established ip_vs_conn.
943  */
944 static unsigned int
945 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
946           const struct net_device *in, const struct net_device *out,
947           int (*okfn)(struct sk_buff *))
948 {
949         struct ip_vs_iphdr iph;
950         struct ip_vs_protocol *pp;
951         struct ip_vs_conn *cp;
952         int af;
953
954         EnterFunction(11);
955
956         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
957
958         if (skb->ipvs_property)
959                 return NF_ACCEPT;
960
961         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
962 #ifdef CONFIG_IP_VS_IPV6
963         if (af == AF_INET6) {
964                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
965                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
966
967                         if (related)
968                                 return verdict;
969                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
970                 }
971         } else
972 #endif
973                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
974                         int related, verdict = ip_vs_out_icmp(skb, &related);
975
976                         if (related)
977                                 return verdict;
978                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
979                 }
980
981         pp = ip_vs_proto_get(iph.protocol);
982         if (unlikely(!pp))
983                 return NF_ACCEPT;
984
985         /* reassemble IP fragments */
986 #ifdef CONFIG_IP_VS_IPV6
987         if (af == AF_INET6) {
988                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
989                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
990
991                         if (related)
992                                 return verdict;
993
994                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
995                 }
996         } else
997 #endif
998                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
999                              !pp->dont_defrag)) {
1000                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1001                                 return NF_STOLEN;
1002
1003                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1004                 }
1005
1006         /*
1007          * Check if the packet belongs to an existing entry
1008          */
1009         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1010
1011         if (unlikely(!cp)) {
1012                 if (sysctl_ip_vs_nat_icmp_send &&
1013                     (pp->protocol == IPPROTO_TCP ||
1014                      pp->protocol == IPPROTO_UDP ||
1015                      pp->protocol == IPPROTO_SCTP)) {
1016                         __be16 _ports[2], *pptr;
1017
1018                         pptr = skb_header_pointer(skb, iph.len,
1019                                                   sizeof(_ports), _ports);
1020                         if (pptr == NULL)
1021                                 return NF_ACCEPT;       /* Not for me */
1022                         if (ip_vs_lookup_real_service(af, iph.protocol,
1023                                                       &iph.saddr,
1024                                                       pptr[0])) {
1025                                 /*
1026                                  * Notify the real server: there is no
1027                                  * existing entry if it is not RST
1028                                  * packet or not TCP packet.
1029                                  */
1030                                 if ((iph.protocol != IPPROTO_TCP &&
1031                                      iph.protocol != IPPROTO_SCTP)
1032                                      || ((iph.protocol == IPPROTO_TCP
1033                                           && !is_tcp_reset(skb, iph.len))
1034                                          || (iph.protocol == IPPROTO_SCTP
1035                                                 && !is_sctp_abort(skb,
1036                                                         iph.len)))) {
1037 #ifdef CONFIG_IP_VS_IPV6
1038                                         if (af == AF_INET6)
1039                                                 icmpv6_send(skb,
1040                                                             ICMPV6_DEST_UNREACH,
1041                                                             ICMPV6_PORT_UNREACH,
1042                                                             0);
1043                                         else
1044 #endif
1045                                                 icmp_send(skb,
1046                                                           ICMP_DEST_UNREACH,
1047                                                           ICMP_PORT_UNREACH, 0);
1048                                         return NF_DROP;
1049                                 }
1050                         }
1051                 }
1052                 IP_VS_DBG_PKT(12, pp, skb, 0,
1053                               "packet continues traversal as normal");
1054                 return NF_ACCEPT;
1055         }
1056
1057         return handle_response(af, skb, pp, cp, iph.len);
1058 }
1059
1060
1061 /*
1062  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1063  *      Find any that might be relevant, check against existing connections,
1064  *      forward to the right destination host if relevant.
1065  *      Currently handles error types - unreachable, quench, ttl exceeded.
1066  */
1067 static int
1068 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1069 {
1070         struct iphdr *iph;
1071         struct icmphdr  _icmph, *ic;
1072         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1073         struct ip_vs_iphdr ciph;
1074         struct ip_vs_conn *cp;
1075         struct ip_vs_protocol *pp;
1076         unsigned int offset, ihl, verdict;
1077         union nf_inet_addr snet;
1078
1079         *related = 1;
1080
1081         /* reassemble IP fragments */
1082         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1083                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1084                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1085                         return NF_STOLEN;
1086         }
1087
1088         iph = ip_hdr(skb);
1089         offset = ihl = iph->ihl * 4;
1090         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1091         if (ic == NULL)
1092                 return NF_DROP;
1093
1094         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1095                   ic->type, ntohs(icmp_id(ic)),
1096                   &iph->saddr, &iph->daddr);
1097
1098         /*
1099          * Work through seeing if this is for us.
1100          * These checks are supposed to be in an order that means easy
1101          * things are checked first to speed up processing.... however
1102          * this means that some packets will manage to get a long way
1103          * down this stack and then be rejected, but that's life.
1104          */
1105         if ((ic->type != ICMP_DEST_UNREACH) &&
1106             (ic->type != ICMP_SOURCE_QUENCH) &&
1107             (ic->type != ICMP_TIME_EXCEEDED)) {
1108                 *related = 0;
1109                 return NF_ACCEPT;
1110         }
1111
1112         /* Now find the contained IP header */
1113         offset += sizeof(_icmph);
1114         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1115         if (cih == NULL)
1116                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1117
1118         pp = ip_vs_proto_get(cih->protocol);
1119         if (!pp)
1120                 return NF_ACCEPT;
1121
1122         /* Is the embedded protocol header present? */
1123         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1124                      pp->dont_defrag))
1125                 return NF_ACCEPT;
1126
1127         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1128
1129         offset += cih->ihl * 4;
1130
1131         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1132         /* The embedded headers contain source and dest in reverse order */
1133         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1134         if (!cp) {
1135                 /* The packet could also belong to a local client */
1136                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1137                 if (cp) {
1138                         snet.ip = iph->saddr;
1139                         return handle_response_icmp(AF_INET, skb, &snet,
1140                                                     cih->protocol, cp, pp,
1141                                                     offset, ihl);
1142                 }
1143                 return NF_ACCEPT;
1144         }
1145
1146         verdict = NF_DROP;
1147
1148         /* Ensure the checksum is correct */
1149         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1150                 /* Failed checksum! */
1151                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1152                           &iph->saddr);
1153                 goto out;
1154         }
1155
1156         /* do the statistics and put it back */
1157         ip_vs_in_stats(cp, skb);
1158         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1159                 offset += 2 * sizeof(__u16);
1160         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1161         /* do not touch skb anymore */
1162
1163   out:
1164         __ip_vs_conn_put(cp);
1165
1166         return verdict;
1167 }
1168
1169 #ifdef CONFIG_IP_VS_IPV6
1170 static int
1171 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1172 {
1173         struct ipv6hdr *iph;
1174         struct icmp6hdr _icmph, *ic;
1175         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1176                                            within the ICMP */
1177         struct ip_vs_iphdr ciph;
1178         struct ip_vs_conn *cp;
1179         struct ip_vs_protocol *pp;
1180         unsigned int offset, verdict;
1181         union nf_inet_addr snet;
1182
1183         *related = 1;
1184
1185         /* reassemble IP fragments */
1186         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1187                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1188                                                IP_DEFRAG_VS_IN :
1189                                                IP_DEFRAG_VS_FWD))
1190                         return NF_STOLEN;
1191         }
1192
1193         iph = ipv6_hdr(skb);
1194         offset = sizeof(struct ipv6hdr);
1195         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1196         if (ic == NULL)
1197                 return NF_DROP;
1198
1199         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1200                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1201                   &iph->saddr, &iph->daddr);
1202
1203         /*
1204          * Work through seeing if this is for us.
1205          * These checks are supposed to be in an order that means easy
1206          * things are checked first to speed up processing.... however
1207          * this means that some packets will manage to get a long way
1208          * down this stack and then be rejected, but that's life.
1209          */
1210         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1211             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1212             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1213                 *related = 0;
1214                 return NF_ACCEPT;
1215         }
1216
1217         /* Now find the contained IP header */
1218         offset += sizeof(_icmph);
1219         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1220         if (cih == NULL)
1221                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1222
1223         pp = ip_vs_proto_get(cih->nexthdr);
1224         if (!pp)
1225                 return NF_ACCEPT;
1226
1227         /* Is the embedded protocol header present? */
1228         /* TODO: we don't support fragmentation at the moment anyways */
1229         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1230                 return NF_ACCEPT;
1231
1232         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1233
1234         offset += sizeof(struct ipv6hdr);
1235
1236         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1237         /* The embedded headers contain source and dest in reverse order */
1238         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1239         if (!cp) {
1240                 /* The packet could also belong to a local client */
1241                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1242                 if (cp) {
1243                         ipv6_addr_copy(&snet.in6, &iph->saddr);
1244                         return handle_response_icmp(AF_INET6, skb, &snet,
1245                                                     cih->nexthdr,
1246                                                     cp, pp, offset,
1247                                                     sizeof(struct ipv6hdr));
1248                 }
1249                 return NF_ACCEPT;
1250         }
1251
1252         verdict = NF_DROP;
1253
1254         /* do the statistics and put it back */
1255         ip_vs_in_stats(cp, skb);
1256         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1257             IPPROTO_SCTP == cih->nexthdr)
1258                 offset += 2 * sizeof(__u16);
1259         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1260         /* do not touch skb anymore */
1261
1262         __ip_vs_conn_put(cp);
1263
1264         return verdict;
1265 }
1266 #endif
1267
1268
1269 /*
1270  *      Check if it's for virtual services, look it up,
1271  *      and send it on its way...
1272  */
1273 static unsigned int
1274 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1275          const struct net_device *in, const struct net_device *out,
1276          int (*okfn)(struct sk_buff *))
1277 {
1278         struct ip_vs_iphdr iph;
1279         struct ip_vs_protocol *pp;
1280         struct ip_vs_conn *cp;
1281         int ret, restart, af, pkts;
1282
1283         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1284
1285         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1286
1287         /*
1288          *      Big tappo: only PACKET_HOST, including loopback for local client
1289          *      Don't handle local packets on IPv6 for now
1290          */
1291         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1292                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1293                               skb->pkt_type,
1294                               iph.protocol,
1295                               IP_VS_DBG_ADDR(af, &iph.daddr));
1296                 return NF_ACCEPT;
1297         }
1298
1299 #ifdef CONFIG_IP_VS_IPV6
1300         if (af == AF_INET6) {
1301                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1302                         int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1303
1304                         if (related)
1305                                 return verdict;
1306                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1307                 }
1308         } else
1309 #endif
1310                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1311                         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1312
1313                         if (related)
1314                                 return verdict;
1315                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1316                 }
1317
1318         /* Protocol supported? */
1319         pp = ip_vs_proto_get(iph.protocol);
1320         if (unlikely(!pp))
1321                 return NF_ACCEPT;
1322
1323         /*
1324          * Check if the packet belongs to an existing connection entry
1325          */
1326         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1327
1328         if (unlikely(!cp)) {
1329                 int v;
1330
1331                 /* For local client packets, it could be a response */
1332                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1333                 if (cp)
1334                         return handle_response(af, skb, pp, cp, iph.len);
1335
1336                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1337                         return v;
1338         }
1339
1340         if (unlikely(!cp)) {
1341                 /* sorry, all this trouble for a no-hit :) */
1342                 IP_VS_DBG_PKT(12, pp, skb, 0,
1343                               "packet continues traversal as normal");
1344                 return NF_ACCEPT;
1345         }
1346
1347         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1348
1349         /* Check the server status */
1350         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1351                 /* the destination server is not available */
1352
1353                 if (sysctl_ip_vs_expire_nodest_conn) {
1354                         /* try to expire the connection immediately */
1355                         ip_vs_conn_expire_now(cp);
1356                 }
1357                 /* don't restart its timer, and silently
1358                    drop the packet. */
1359                 __ip_vs_conn_put(cp);
1360                 return NF_DROP;
1361         }
1362
1363         ip_vs_in_stats(cp, skb);
1364         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1365         if (cp->packet_xmit)
1366                 ret = cp->packet_xmit(skb, cp, pp);
1367                 /* do not touch skb anymore */
1368         else {
1369                 IP_VS_DBG_RL("warning: packet_xmit is null");
1370                 ret = NF_ACCEPT;
1371         }
1372
1373         /* Increase its packet counter and check if it is needed
1374          * to be synchronized
1375          *
1376          * Sync connection if it is about to close to
1377          * encorage the standby servers to update the connections timeout
1378          */
1379         pkts = atomic_add_return(1, &cp->in_pkts);
1380         if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1381             cp->protocol == IPPROTO_SCTP) {
1382                 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1383                         (atomic_read(&cp->in_pkts) %
1384                          sysctl_ip_vs_sync_threshold[1]
1385                          == sysctl_ip_vs_sync_threshold[0])) ||
1386                                 (cp->old_state != cp->state &&
1387                                  ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1388                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1389                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1390                         ip_vs_sync_conn(cp);
1391                         goto out;
1392                 }
1393         }
1394
1395         if (af == AF_INET &&
1396             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1397             (((cp->protocol != IPPROTO_TCP ||
1398                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1399               (pkts % sysctl_ip_vs_sync_threshold[1]
1400                == sysctl_ip_vs_sync_threshold[0])) ||
1401              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1402               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1403                (cp->state == IP_VS_TCP_S_CLOSE) ||
1404                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1405                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1406                 ip_vs_sync_conn(cp);
1407 out:
1408         cp->old_state = cp->state;
1409
1410         ip_vs_conn_put(cp);
1411         return ret;
1412 }
1413
1414
1415 /*
1416  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1417  *      related packets destined for 0.0.0.0/0.
1418  *      When fwmark-based virtual service is used, such as transparent
1419  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1420  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1421  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1422  *      and send them to ip_vs_in_icmp.
1423  */
1424 static unsigned int
1425 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1426                    const struct net_device *in, const struct net_device *out,
1427                    int (*okfn)(struct sk_buff *))
1428 {
1429         int r;
1430
1431         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1432                 return NF_ACCEPT;
1433
1434         return ip_vs_in_icmp(skb, &r, hooknum);
1435 }
1436
1437 #ifdef CONFIG_IP_VS_IPV6
1438 static unsigned int
1439 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1440                       const struct net_device *in, const struct net_device *out,
1441                       int (*okfn)(struct sk_buff *))
1442 {
1443         int r;
1444
1445         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1446                 return NF_ACCEPT;
1447
1448         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1449 }
1450 #endif
1451
1452
1453 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1454         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1455          * or VS/NAT(change destination), so that filtering rules can be
1456          * applied to IPVS. */
1457         {
1458                 .hook           = ip_vs_in,
1459                 .owner          = THIS_MODULE,
1460                 .pf             = PF_INET,
1461                 .hooknum        = NF_INET_LOCAL_IN,
1462                 .priority       = 100,
1463         },
1464         /* After packet filtering, change source only for VS/NAT */
1465         {
1466                 .hook           = ip_vs_out,
1467                 .owner          = THIS_MODULE,
1468                 .pf             = PF_INET,
1469                 .hooknum        = NF_INET_FORWARD,
1470                 .priority       = 100,
1471         },
1472         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1473          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1474         {
1475                 .hook           = ip_vs_forward_icmp,
1476                 .owner          = THIS_MODULE,
1477                 .pf             = PF_INET,
1478                 .hooknum        = NF_INET_FORWARD,
1479                 .priority       = 99,
1480         },
1481 #ifdef CONFIG_IP_VS_IPV6
1482         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1483          * or VS/NAT(change destination), so that filtering rules can be
1484          * applied to IPVS. */
1485         {
1486                 .hook           = ip_vs_in,
1487                 .owner          = THIS_MODULE,
1488                 .pf             = PF_INET6,
1489                 .hooknum        = NF_INET_LOCAL_IN,
1490                 .priority       = 100,
1491         },
1492         /* After packet filtering, change source only for VS/NAT */
1493         {
1494                 .hook           = ip_vs_out,
1495                 .owner          = THIS_MODULE,
1496                 .pf             = PF_INET6,
1497                 .hooknum        = NF_INET_FORWARD,
1498                 .priority       = 100,
1499         },
1500         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1501          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1502         {
1503                 .hook           = ip_vs_forward_icmp_v6,
1504                 .owner          = THIS_MODULE,
1505                 .pf             = PF_INET6,
1506                 .hooknum        = NF_INET_FORWARD,
1507                 .priority       = 99,
1508         },
1509 #endif
1510 };
1511
1512
1513 /*
1514  *      Initialize IP Virtual Server
1515  */
1516 static int __init ip_vs_init(void)
1517 {
1518         int ret;
1519
1520         ip_vs_estimator_init();
1521
1522         ret = ip_vs_control_init();
1523         if (ret < 0) {
1524                 pr_err("can't setup control.\n");
1525                 goto cleanup_estimator;
1526         }
1527
1528         ip_vs_protocol_init();
1529
1530         ret = ip_vs_app_init();
1531         if (ret < 0) {
1532                 pr_err("can't setup application helper.\n");
1533                 goto cleanup_protocol;
1534         }
1535
1536         ret = ip_vs_conn_init();
1537         if (ret < 0) {
1538                 pr_err("can't setup connection table.\n");
1539                 goto cleanup_app;
1540         }
1541
1542         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1543         if (ret < 0) {
1544                 pr_err("can't register hooks.\n");
1545                 goto cleanup_conn;
1546         }
1547
1548         pr_info("ipvs loaded.\n");
1549         return ret;
1550
1551   cleanup_conn:
1552         ip_vs_conn_cleanup();
1553   cleanup_app:
1554         ip_vs_app_cleanup();
1555   cleanup_protocol:
1556         ip_vs_protocol_cleanup();
1557         ip_vs_control_cleanup();
1558   cleanup_estimator:
1559         ip_vs_estimator_cleanup();
1560         return ret;
1561 }
1562
1563 static void __exit ip_vs_cleanup(void)
1564 {
1565         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1566         ip_vs_conn_cleanup();
1567         ip_vs_app_cleanup();
1568         ip_vs_protocol_cleanup();
1569         ip_vs_control_cleanup();
1570         ip_vs_estimator_cleanup();
1571         pr_info("ipvs unloaded.\n");
1572 }
1573
1574 module_init(ip_vs_init);
1575 module_exit(ip_vs_cleanup);
1576 MODULE_LICENSE("GPL");