bbs.cooldavid.org Git - net-next-2.6.git/commitdiff
Merge branch 'for-patrick' of git://git.kernel.org/pub/scm/linux/kernel/git/horms...
author Patrick McHardy <kaber@trash.net>
Thu, 21 Oct 2010 14:25:51 +0000 (16:25 +0200)
committer Patrick McHardy <kaber@trash.net>
Thu, 21 Oct 2010 14:25:51 +0000 (16:25 +0200)
12 files changed:
include/net/ip_vs.h
net/ipv4/netfilter/nf_nat_core.c
net/netfilter/ipvs/ip_vs_conn.c
net/netfilter/ipvs/ip_vs_core.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/ipvs/ip_vs_ftp.c
net/netfilter/ipvs/ip_vs_proto.c
net/netfilter/ipvs/ip_vs_proto_ah_esp.c
net/netfilter/ipvs/ip_vs_proto_sctp.c
net/netfilter/ipvs/ip_vs_proto_tcp.c
net/netfilter/ipvs/ip_vs_proto_udp.c
net/netfilter/ipvs/ip_vs_xmit.c

index 6e8a6192e5746f12db408c669f1e0a8b5e7f189e..b7bbd6c28cfa17dde6fa3a972d33635c5a498312 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>                        /* for struct ipv6hdr */
 #include <net/ipv6.h>                  /* for ipv6_addr_copy */
-#ifdef CONFIG_IP_VS_NFCT
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netfilter/nf_conntrack.h>
 #endif
 
@@ -136,24 +136,24 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
                if (net_ratelimit())                                    \
                        printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__);  \
        } while (0)
-#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg)                                \
+#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg)                    \
        do {                                                            \
                if (level <= ip_vs_get_debug_level())                   \
-                       pp->debug_packet(pp, skb, ofs, msg);            \
+                       pp->debug_packet(af, pp, skb, ofs, msg);        \
        } while (0)
-#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg)                     \
+#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg)                 \
        do {                                                            \
                if (level <= ip_vs_get_debug_level() &&                 \
                    net_ratelimit())                                    \
-                       pp->debug_packet(pp, skb, ofs, msg);            \
+                       pp->debug_packet(af, pp, skb, ofs, msg);        \
        } while (0)
 #else  /* NO DEBUGGING at ALL */
 #define IP_VS_DBG_BUF(level, msg...)  do {} while (0)
 #define IP_VS_ERR_BUF(msg...)  do {} while (0)
 #define IP_VS_DBG(level, msg...)  do {} while (0)
 #define IP_VS_DBG_RL(msg...)  do {} while (0)
-#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg)                do {} while (0)
-#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg)     do {} while (0)
+#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg)    do {} while (0)
+#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0)
 #endif
 
 #define IP_VS_BUG() BUG()
@@ -345,7 +345,7 @@ struct ip_vs_protocol {
 
        int (*app_conn_bind)(struct ip_vs_conn *cp);
 
-       void (*debug_packet)(struct ip_vs_protocol *pp,
+       void (*debug_packet)(int af, struct ip_vs_protocol *pp,
                             const struct sk_buff *skb,
                             int offset,
                             const char *msg);
@@ -409,6 +409,7 @@ struct ip_vs_conn {
        /* packet transmitter for different forwarding methods.  If it
           mangles the packet, it must return NF_DROP or better NF_STOLEN,
           otherwise this must be changed to a sk_buff **.
+          NF_ACCEPT can be returned when destination is local.
         */
        int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
                           struct ip_vs_protocol *pp);
@@ -597,11 +598,19 @@ struct ip_vs_app {
        __be16                  port;           /* port number in net order */
        atomic_t                usecnt;         /* usage counter */
 
-       /* output hook: return false if can't linearize. diff set for TCP.  */
+       /*
+        * output hook: Process packet in inout direction, diff set for TCP.
+        * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
+        *         2=Mangled but checksum was not updated
+        */
        int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
                       struct sk_buff *, int *diff);
 
-       /* input hook: return false if can't linearize. diff set for TCP. */
+       /*
+        * input hook: Process packet in outin direction, diff set for TCP.
+        * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
+        *         2=Mangled but checksum was not updated
+        */
        int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
                      struct sk_buff *, int *diff);
 
@@ -819,7 +828,8 @@ extern int
 ip_vs_set_state_timeout(int *table, int num, const char *const *names,
                        const char *name, int to);
 extern void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+                         const struct sk_buff *skb,
                          int offset, const char *msg);
 
 extern struct ip_vs_protocol ip_vs_protocol_tcp;
@@ -841,7 +851,8 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
 extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb);
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+              struct ip_vs_protocol *pp, int *ignored);
 extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
                        struct ip_vs_protocol *pp);
 
@@ -1013,6 +1024,24 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum)
        return csum_partial(diff, sizeof(diff), oldsum);
 }
 
+/*
+ * Drop any current (unconfirmed) conntrack and attach the untracked entry
+ */
+static inline void ip_vs_notrack(struct sk_buff *skb)
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+       if (!ct || !nf_ct_is_untracked(ct)) {
+               nf_reset(skb);
+               skb->nfct = &nf_ct_untracked_get()->ct_general;
+               skb->nfctinfo = IP_CT_NEW;
+               nf_conntrack_get(skb->nfct);
+       }
+#endif
+}
+
 #ifdef CONFIG_IP_VS_NFCT
 /*
  *      Netfilter connection tracking
index e2e00c4da883b44f9c319b601e52857f2e0c59d3..0047923c1f22aff63a6f7030384e7926539d430f 100644 (file)
@@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
                        return 0;
        }
 
+       if (manip == IP_NAT_MANIP_SRC)
+               statusbit = IPS_SRC_NAT;
+       else
+               statusbit = IPS_DST_NAT;
+
+       /* Invert if this is reply dir. */
+       if (dir == IP_CT_DIR_REPLY)
+               statusbit ^= IPS_NAT_MASK;
+
+       if (!(ct->status & statusbit))
+               return 1;
+
        pr_debug("icmp_reply_translation: translating error %p manip %u "
                 "dir %s\n", skb, manip,
                 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 
        /* Change outer to look the reply to an incoming packet
         * (proto 0 means don't invert per-proto part). */
-       if (manip == IP_NAT_MANIP_SRC)
-               statusbit = IPS_SRC_NAT;
-       else
-               statusbit = IPS_DST_NAT;
-
-       /* Invert if this is reply dir. */
-       if (dir == IP_CT_DIR_REPLY)
-               statusbit ^= IPS_NAT_MASK;
-
-       if (ct->status & statusbit) {
-               nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-               if (!manip_pkt(0, skb, 0, &target, manip))
-                       return 0;
-       }
+       nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+       if (!manip_pkt(0, skb, 0, &target, manip))
+               return 0;
 
        return 1;
 }
index 1d1a529dbe24d20e2da90878c08e60c987671b2e..e9adecdc8ca4779468c494c1a2418049ca2384eb 100644 (file)
@@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
                 */
                if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
                        conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+               /* connections inherit forwarding method from dest */
+               cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
        }
        cp->flags |= conn_flags;
        cp->dest = dest;
index e5fef7aef0d43fc06600f560440eb981a1fe48fa..b4e51e9c5a04ad4e1314a338529cc0284407feb3 100644 (file)
@@ -48,6 +48,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
 #endif
 
 #include <net/ip_vs.h>
@@ -342,7 +343,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+              struct ip_vs_protocol *pp, int *ignored)
 {
        struct ip_vs_conn *cp = NULL;
        struct ip_vs_iphdr iph;
@@ -350,16 +352,44 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
        __be16 _ports[2], *pptr;
        unsigned int flags;
 
+       *ignored = 1;
        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
        if (pptr == NULL)
                return NULL;
 
+       /*
+        * FTPDATA needs this check when using local real server.
+        * Never schedule Active FTPDATA connections from real server.
+        * For LVS-NAT they must be already created. For other methods
+        * with persistence the connection is created on SYN+ACK.
+        */
+       if (pptr[0] == FTPDATA) {
+               IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+                             "Not scheduling FTPDATA");
+               return NULL;
+       }
+
+       /*
+        * Do not schedule replies from local real server. It is risky
+        * for fwmark services but mostly for persistent services.
+        */
+       if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+           (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+           (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+               IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+                             "Not scheduling reply for existing connection");
+               __ip_vs_conn_put(cp);
+               return NULL;
+       }
+
        /*
         *    Persistent service
         */
-       if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+       if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+               *ignored = 0;
                return ip_vs_sched_persist(svc, skb, pptr);
+       }
 
        /*
         *    Non-persistent service
@@ -372,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
                return NULL;
        }
 
+       *ignored = 0;
+
        dest = svc->scheduler->schedule(svc, skb);
        if (dest == NULL) {
                IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -498,35 +530,32 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
         */
 #ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
-       if (svc->af == AF_INET6)
+       if (svc->af == AF_INET6) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-       else
+       } else
 #endif
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
        return NF_DROP;
 }
 
-/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- * chain and is used to avoid double NAT and confirmation when we do
- * not want to keep the conntrack structure
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-                                      struct sk_buff *skb,
-                                      const struct net_device *in,
-                                      const struct net_device *out,
-                                      int (*okfn)(struct sk_buff *))
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
-       if (!skb->ipvs_property)
-               return NF_ACCEPT;
-       /* The packet was sent from IPVS, exit this chain */
-       return NF_STOP;
+       return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 }
 
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 {
-       return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+       if (NF_INET_LOCAL_IN == hooknum)
+               return IP_DEFRAG_VS_IN;
+       if (NF_INET_FORWARD == hooknum)
+               return IP_DEFRAG_VS_FWD;
+       return IP_DEFRAG_VS_OUT;
 }
 
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -589,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
        skb->ip_summed = CHECKSUM_UNNECESSARY;
 
        if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+               IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered outgoing ICMP");
        else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+               IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered incoming ICMP");
 }
 
@@ -634,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
        skb->ip_summed = CHECKSUM_PARTIAL;
 
        if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered outgoing ICMPv6");
+               IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+                             (void *)ciph - (void *)iph,
+                             "Forwarding altered outgoing ICMPv6");
        else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered incoming ICMPv6");
+               IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+                             (void *)ciph - (void *)iph,
+                             "Forwarding altered incoming ICMPv6");
 }
 #endif
 
@@ -679,11 +710,23 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
                ip_vs_nat_icmp(skb, pp, cp, 1);
 
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+                       goto out;
+       } else
+#endif
+               if ((sysctl_ip_vs_snat_reroute ||
+                    skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+                   ip_route_me_harder(skb, RTN_LOCAL) != 0)
+                       goto out;
+
        /* do the statistics and put it back */
        ip_vs_out_stats(cp, skb);
 
+       skb->ipvs_property = 1;
        if (!(cp->flags & IP_VS_CONN_F_NFCT))
-               skb->ipvs_property = 1;
+               ip_vs_notrack(skb);
        else
                ip_vs_update_conntrack(skb, cp, 0);
        verdict = NF_ACCEPT;
@@ -699,7 +742,8 @@ out:
  *     Find any that might be relevant, check against existing connections.
  *     Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+                         unsigned int hooknum)
 {
        struct iphdr *iph;
        struct icmphdr  _icmph, *ic;
@@ -714,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
        /* reassemble IP fragments */
        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+               if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -757,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
                     pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+       IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+                     "Checking outgoing ICMP for");
 
        offset += cih->ihl * 4;
 
@@ -773,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+                            unsigned int hooknum)
 {
        struct ipv6hdr *iph;
        struct icmp6hdr _icmph, *ic;
@@ -789,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 
        /* reassemble IP fragments */
        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -832,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+       IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+                     "Checking outgoing ICMPv6 for");
 
        offset += sizeof(struct ipv6hdr);
 
@@ -880,7 +927,7 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                struct ip_vs_conn *cp, int ihl)
 {
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+       IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
        if (!skb_make_writable(skb, ihl))
                goto drop;
@@ -914,23 +961,24 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
         * if it came from this machine itself.  So re-compute
         * the routing information.
         */
-       if (sysctl_ip_vs_snat_reroute) {
 #ifdef CONFIG_IP_VS_IPV6
-               if (af == AF_INET6) {
-                       if (ip6_route_me_harder(skb) != 0)
-                               goto drop;
-               } else
+       if (af == AF_INET6) {
+               if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+                       goto drop;
+       } else
 #endif
-                       if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-                               goto drop;
-       }
+               if ((sysctl_ip_vs_snat_reroute ||
+                    skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+                   ip_route_me_harder(skb, RTN_LOCAL) != 0)
+                       goto drop;
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+       IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
        ip_vs_out_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+       skb->ipvs_property = 1;
        if (!(cp->flags & IP_VS_CONN_F_NFCT))
-               skb->ipvs_property = 1;
+               ip_vs_notrack(skb);
        else
                ip_vs_update_conntrack(skb, cp, 0);
        ip_vs_conn_put(cp);
@@ -946,53 +994,54 @@ drop:
 }
 
 /*
- *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
  *     Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-         const struct net_device *in, const struct net_device *out,
-         int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
-       int af;
 
        EnterFunction(11);
 
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+       /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;
 
+       /* Bad... Do not break raw sockets */
+       if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+                    af == AF_INET)) {
+               struct sock *sk = skb->sk;
+               struct inet_sock *inet = inet_sk(skb->sk);
+
+               if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+                       return NF_ACCEPT;
+       }
+
+       if (unlikely(!skb_dst(skb)))
+               return NF_ACCEPT;
+
        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+                       int related;
+                       int verdict = ip_vs_out_icmp_v6(skb, &related,
+                                                       hooknum);
 
-                       if (related) {
-                               if (sysctl_ip_vs_snat_reroute &&
-                                       NF_ACCEPT == verdict &&
-                                       ip6_route_me_harder(skb))
-                                       verdict = NF_DROP;
+                       if (related)
                                return verdict;
-                       }
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
 #endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-                       int related, verdict = ip_vs_out_icmp(skb, &related);
+                       int related;
+                       int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
-                       if (related) {
-                               if (sysctl_ip_vs_snat_reroute &&
-                                       NF_ACCEPT == verdict &&
-                                       ip_route_me_harder(skb, RTN_LOCAL))
-                                       verdict = NF_DROP;
+                       if (related)
                                return verdict;
-                       }
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
 
@@ -1003,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
        /* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
-               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-                       if (related)
-                               return verdict;
-
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+                       if (ip_vs_gather_frags_v6(skb,
+                                                 ip_vs_defrag_user(hooknum)))
+                               return NF_STOLEN;
                }
+
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
        } else
 #endif
                if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
                             !pp->dont_defrag)) {
-                       if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+                       if (ip_vs_gather_frags(skb,
+                                              ip_vs_defrag_user(hooknum)))
                                return NF_STOLEN;
 
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1026,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
         */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-       if (unlikely(!cp)) {
-               if (sysctl_ip_vs_nat_icmp_send &&
-                   (pp->protocol == IPPROTO_TCP ||
-                    pp->protocol == IPPROTO_UDP ||
-                    pp->protocol == IPPROTO_SCTP)) {
-                       __be16 _ports[2], *pptr;
-
-                       pptr = skb_header_pointer(skb, iph.len,
-                                                 sizeof(_ports), _ports);
-                       if (pptr == NULL)
-                               return NF_ACCEPT;       /* Not for me */
-                       if (ip_vs_lookup_real_service(af, iph.protocol,
-                                                     &iph.saddr,
-                                                     pptr[0])) {
-                               /*
-                                * Notify the real server: there is no
-                                * existing entry if it is not RST
-                                * packet or not TCP packet.
-                                */
-                               if ((iph.protocol != IPPROTO_TCP &&
-                                    iph.protocol != IPPROTO_SCTP)
-                                    || ((iph.protocol == IPPROTO_TCP
-                                         && !is_tcp_reset(skb, iph.len))
-                                        || (iph.protocol == IPPROTO_SCTP
-                                               && !is_sctp_abort(skb,
-                                                       iph.len)))) {
+       if (likely(cp))
+               return handle_response(af, skb, pp, cp, iph.len);
+       if (sysctl_ip_vs_nat_icmp_send &&
+           (pp->protocol == IPPROTO_TCP ||
+            pp->protocol == IPPROTO_UDP ||
+            pp->protocol == IPPROTO_SCTP)) {
+               __be16 _ports[2], *pptr;
+
+               pptr = skb_header_pointer(skb, iph.len,
+                                         sizeof(_ports), _ports);
+               if (pptr == NULL)
+                       return NF_ACCEPT;       /* Not for me */
+               if (ip_vs_lookup_real_service(af, iph.protocol,
+                                             &iph.saddr,
+                                             pptr[0])) {
+                       /*
+                        * Notify the real server: there is no
+                        * existing entry if it is not RST
+                        * packet or not TCP packet.
+                        */
+                       if ((iph.protocol != IPPROTO_TCP &&
+                            iph.protocol != IPPROTO_SCTP)
+                            || ((iph.protocol == IPPROTO_TCP
+                                 && !is_tcp_reset(skb, iph.len))
+                                || (iph.protocol == IPPROTO_SCTP
+                                       && !is_sctp_abort(skb,
+                                               iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-                                       if (af == AF_INET6)
-                                               icmpv6_send(skb,
-                                                           ICMPV6_DEST_UNREACH,
-                                                           ICMPV6_PORT_UNREACH,
-                                                           0);
-                                       else
+                               if (af == AF_INET6) {
+                                       struct net *net =
+                                               dev_net(skb_dst(skb)->dev);
+
+                                       if (!skb->dev)
+                                               skb->dev = net->loopback_dev;
+                                       icmpv6_send(skb,
+                                                   ICMPV6_DEST_UNREACH,
+                                                   ICMPV6_PORT_UNREACH,
+                                                   0);
+                               } else
 #endif
-                                               icmp_send(skb,
-                                                         ICMP_DEST_UNREACH,
-                                                         ICMP_PORT_UNREACH, 0);
-                                       return NF_DROP;
-                               }
+                                       icmp_send(skb,
+                                                 ICMP_DEST_UNREACH,
+                                                 ICMP_PORT_UNREACH, 0);
+                               return NF_DROP;
                        }
                }
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
-               return NF_ACCEPT;
        }
+       IP_VS_DBG_PKT(12, af, pp, skb, 0,
+                     "ip_vs_out: packet continues traversal as normal");
+       return NF_ACCEPT;
+}
+
+/*
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+            const struct net_device *in, const struct net_device *out,
+            int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ *     It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+                  const struct net_device *in, const struct net_device *out,
+                  int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
 
-       return handle_response(af, skb, pp, cp, iph.len);
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_out(hooknum, skb, AF_INET);
+       local_bh_enable();
+       return verdict;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+            const struct net_device *in, const struct net_device *out,
+            int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ *     It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+                  const struct net_device *in, const struct net_device *out,
+                  int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_out(hooknum, skb, AF_INET6);
+       local_bh_enable();
+       return verdict;
+}
+
+#endif
 
 /*
  *     Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1098,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
        /* reassemble IP fragments */
        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-                                           IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+               if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -1142,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
                     pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+       IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+                     "Checking incoming ICMP for");
 
        offset += cih->ihl * 4;
 
@@ -1176,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
        if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-       /* do not touch skb anymore */
+       /* LOCALNODE from FORWARD hook is not supported */
+       if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+           skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+               IP_VS_DBG(1, "%s(): "
+                         "local delivery to %pI4 but in FORWARD\n",
+                         __func__, &skb_rtable(skb)->rt_dst);
+               verdict = NF_DROP;
+       }
 
   out:
        __ip_vs_conn_put(cp);
@@ -1197,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
        struct ip_vs_protocol *pp;
        unsigned int offset, verdict;
        union nf_inet_addr snet;
+       struct rt6_info *rt;
 
        *related = 1;
 
        /* reassemble IP fragments */
        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-                                              IP_DEFRAG_VS_IN :
-                                              IP_DEFRAG_VS_FWD))
+               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -1247,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+       IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+                     "Checking incoming ICMPv6 for");
 
        offset += sizeof(struct ipv6hdr);
 
@@ -1275,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
            IPPROTO_SCTP == cih->nexthdr)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-       /* do not touch skb anymore */
+       /* LOCALNODE from FORWARD hook is not supported */
+       if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+           (rt = (struct rt6_info *) skb_dst(skb)) &&
+           rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+               IP_VS_DBG(1, "%s(): "
+                         "local delivery to %pI6 but in FORWARD\n",
+                         __func__, &rt->rt6i_dst);
+               verdict = NF_DROP;
+       }
 
        __ip_vs_conn_put(cp);
 
@@ -1289,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *     and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-        const struct net_device *in, const struct net_device *out,
-        int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
-       int ret, restart, af, pkts;
-
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+       int ret, restart, pkts;
 
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       /* Already marked as IPVS request or reply? */
+       if (skb->ipvs_property)
+               return NF_ACCEPT;
 
        /*
-        *      Big tappo: only PACKET_HOST, including loopback for local client
-        *      Don't handle local packets on IPv6 for now
+        *      Big tappo:
+        *      - remote client: only PACKET_HOST
+        *      - route: used for struct net when skb->dev is unset
         */
-       if (unlikely(skb->pkt_type != PACKET_HOST)) {
-               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-                             skb->pkt_type,
-                             iph.protocol,
-                             IP_VS_DBG_ADDR(af, &iph.daddr));
+       if (unlikely((skb->pkt_type != PACKET_HOST &&
+                     hooknum != NF_INET_LOCAL_OUT) ||
+                    !skb_dst(skb))) {
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+                             " ignored in hook %u\n",
+                             skb->pkt_type, iph.protocol,
+                             IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
                return NF_ACCEPT;
        }
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       /* Bad... Do not break raw sockets */
+       if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+                    af == AF_INET)) {
+               struct sock *sk = skb->sk;
+               struct inet_sock *inet = inet_sk(skb->sk);
+
+               if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+                       return NF_ACCEPT;
+       }
 
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+                       int related;
+                       int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
                        if (related)
                                return verdict;
@@ -1326,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        } else
 #endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-                       int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+                       int related;
+                       int verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
                        if (related)
                                return verdict;
@@ -1346,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        if (unlikely(!cp)) {
                int v;
 
-               /* For local client packets, it could be a response */
-               cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-               if (cp)
-                       return handle_response(af, skb, pp, cp, iph.len);
-
                if (!pp->conn_schedule(af, skb, pp, &v, &cp))
                        return v;
        }
 
        if (unlikely(!cp)) {
                /* sorry, all this trouble for a no-hit :) */
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
+               IP_VS_DBG_PKT(12, af, pp, skb, 0,
+                             "ip_vs_in: packet continues traversal as normal");
                return NF_ACCEPT;
        }
 
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+       IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
 
        /* Check the server status */
        if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1429,6 +1571,72 @@ out:
        return ret;
 }
 
+/*
+ *     AF_INET handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ *
+ *     Thin netfilter-hook adapter: hands hooknum and skb to ip_vs_in()
+ *     with the address family fixed to AF_INET and returns its verdict.
+ *     The in, out and okfn arguments are not used here.
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *     AF_INET handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ *
+ *     Calls ip_vs_in() with AF_INET, bracketed by
+ *     local_bh_disable()/local_bh_enable(), and returns the verdict
+ *     ip_vs_in() produced. The in, out and okfn arguments are not
+ *     used here.
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET);
+       local_bh_enable();
+       return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ *
+ *     Thin netfilter-hook adapter: hands hooknum and skb to ip_vs_in()
+ *     with the address family fixed to AF_INET6 and returns its verdict.
+ *     The in, out and okfn arguments are not used here.
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ *
+ *     Calls ip_vs_in() with AF_INET6, bracketed by
+ *     local_bh_disable()/local_bh_enable(), and returns the verdict
+ *     ip_vs_in() produced. The in, out and okfn arguments are not
+ *     used here.
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET6);
+       local_bh_enable();
+       return verdict;
+}
+
+#endif
+
 
 /*
  *     It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1469,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
-       /* After packet filtering, change source only for VS/NAT */
+       /* Before ip_vs_in, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_local_reply4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -99,
+       },
+       /* After mangle, schedule and forward local requests */
        {
-               .hook           = ip_vs_out,
+               .hook           = ip_vs_local_request4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1493,35 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hook           = ip_vs_forward_icmp,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
        },
-       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       /* After packet filtering, change source only for VS/NAT */
        {
-               .hook           = ip_vs_post_routing,
+               .hook           = ip_vs_reply4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_POST_ROUTING,
-               .priority       = NF_IP_PRI_NAT_SRC-1,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
        },
 #ifdef CONFIG_IP_VS_IPV6
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
-       /* After packet filtering, change source only for VS/NAT */
+       /* Before ip_vs_in, change source only for VS/NAT */
        {
-               .hook           = ip_vs_out,
+               .hook           = ip_vs_local_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -99,
+       },
+       /* After mangle, schedule and forward local requests */
+       {
+               .hook           = ip_vs_local_request6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1529,16 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hook           = ip_vs_forward_icmp_v6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
        },
-       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       /* After packet filtering, change source only for VS/NAT */
        {
-               .hook           = ip_vs_post_routing,
+               .hook           = ip_vs_reply6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_POST_ROUTING,
-               .priority       = NF_IP6_PRI_NAT_SRC-1,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
        },
 #endif
 };
index 0b884d3e192fafa92f2dc68c97c8bb6f782a33c8..5f5daa30b0afe541d00c1577850ce565c31fb13b 100644 (file)
@@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
        conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
        conn_flags |= IP_VS_CONN_F_INACTIVE;
 
-       /* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6) {
-               if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
-       } else
-#endif
-               if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
-
        /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
        if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
                conn_flags |= IP_VS_CONN_F_NOOUTPUT;
@@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
        dest->u_threshold = udest->u_threshold;
        dest->l_threshold = udest->l_threshold;
 
+       spin_lock(&dest->dst_lock);
+       ip_vs_dst_reset(dest);
+       spin_unlock(&dest->dst_lock);
+
        if (add)
                ip_vs_new_estimator(&dest->stats);
 
index 090889a3b3af069fdeb2cf91a995a281c7ac1704..75455000ad1c1cde82b2134970ab3a67a5a97b82 100644 (file)
@@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                        ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
                                                       start-data, end-start,
                                                       buf, buf_len);
-                       if (ret)
+                       if (ret) {
                                ip_vs_nfct_expect_related(skb, ct, n_cp,
                                                          IPPROTO_TCP, 0, 0);
+                               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                                       skb->ip_summed = CHECKSUM_UNNECESSARY;
+                               /* csum is updated */
+                               ret = 1;
+                       }
                }
 
                /*
index 027f654799feb969fe5a64de8b262e37287e8f48..c539983908771ead7df2e14b7d4eb9c259369064 100644 (file)
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
        else if (ih->frag_off & htons(IP_OFFSET))
                sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
        else {
-               __be16 _ports[2], *pptr
-;
+               __be16 _ports[2], *pptr;
+
                pptr = skb_header_pointer(skb, offset + ih->ihl*4,
                                          sizeof(_ports), _ports);
                if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
                          const struct sk_buff *skb,
                          int offset,
                          const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
+       if (af == AF_INET6)
                ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
        else
 #endif
index 8956ef33ea6cba45246709b3fb00d4ce3bbb9e56..3a0461117d3fad6216747b60b5ec24856ca9d7fc 100644 (file)
@@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        return 0;
 }
 
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct iphdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "TRUNCATED");
-       else
-               sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
-       pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct ipv6hdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "TRUNCATED");
-       else
-               sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
-       pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                   int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
-               ah_esp_debug_packet_v6(pp, skb, offset, msg);
-       else
-#endif
-               ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
        /* nothing to do now */
@@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
        .register_app =         NULL,
        .unregister_app =       NULL,
        .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
        .timeout_change =       NULL,           /* ISAKMP */
        .set_state_timeout =    NULL,
 };
@@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
        .register_app =         NULL,
        .unregister_app =       NULL,
        .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
        .timeout_change =       NULL,           /* ISAKMP */
 };
 #endif
index 4c0855cb006ee93c721d53ff0b95cef5ec1853bf..d254345bfda7066504c305832340dc4105f827fb 100644 (file)
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        if ((sch->type == SCTP_CID_INIT) &&
            (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
                                     &iph.daddr, sh->dest))) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -174,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 
        if (val != cmp) {
                /* CRC failure, dump it. */
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                "Failed checksum for");
                return 0;
        }
index 282d24de8592e659466657533b10ad6eadf4bd5c..f6c5200e214663fe915b2136532c54d03861e5eb 100644 (file)
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                return 0;
        }
 
+       /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
        if (th->syn &&
            (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
                                     th->dest))) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                tcph->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                       ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
                                         ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
+                                               csum_unfold(tcph->check))));
        else
 #endif
        tcph->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+               ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
                                ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
+                                               csum_unfold(tcph->check))));
 }
 
 
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
        struct tcphdr *tcph;
        unsigned int tcphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
 
                /* Call application helper if needed */
-               if (!ip_vs_app_pkt_out(cp, skb))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - tcphoff;
+               else
+                       payload_csum = 1;
        }
 
        tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
                                        htons(oldlen),
                                        htons(skb->len - tcphoff));
-       } else if (!cp->app) {
+       } else if (!payload_csum) {
                /* Only port and addr are changed, do fast csum update */
                tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
                                     cp->dport, cp->vport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
                                                        skb->len - tcphoff,
                                                        cp->protocol,
                                                        skb->csum);
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
 
                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
                          pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
        struct tcphdr *tcph;
        unsigned int tcphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn and iph ack_seq stuff
                 */
-               if (!ip_vs_app_pkt_in(cp, skb))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - tcphoff;
+               else
+                       payload_csum = 1;
        }
 
        tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
         *      Adjust TCP checksums
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+               tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
                                        htons(oldlen),
                                        htons(skb->len - tcphoff));
-       } else if (!cp->app) {
+       } else if (!payload_csum) {
                /* Only port and addr are changed, do fast csum update */
                tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
                                     cp->vport, cp->dport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                            skb->len - tcphoff,
                                            ipv6_hdr(skb)->nexthdr,
                                            skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                 "Failed checksum for");
                                return 0;
                        }
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                              skb->len - tcphoff,
                                              ip_hdr(skb)->protocol,
                                              skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                 "Failed checksum for");
                                return 0;
                        }
index 8553231b5d412ca557f8699ee998e05351152213..9d106a06bb0a46376252b32f2d30882d921b8b16 100644 (file)
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        svc = ip_vs_service_get(af, skb->mark, iph.protocol,
                                &iph.daddr, uh->dest);
        if (svc) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                uhdr->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                       ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
                                         ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
+                                               csum_unfold(uhdr->check))));
        else
 #endif
        uhdr->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+               ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
                                ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
+                                               csum_unfold(uhdr->check))));
 }
 
 
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
        struct udphdr *udph;
        unsigned int udphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
                /*
                 *      Call application helper if needed
                 */
-               if (!ip_vs_app_pkt_out(cp, skb))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - udphoff;
+               else
+                       payload_csum = 1;
        }
 
        udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
                udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
                                        htons(oldlen),
                                        htons(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
+       } else if (!payload_csum && (udph->check != 0)) {
                /* Only port and addr are changed, do fast csum update */
                udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
                                     cp->dport, cp->vport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
                                                        skb->csum);
                if (udph->check == 0)
                        udph->check = CSUM_MANGLED_0;
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
                          pp->name, udph->check,
                          (char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
        struct udphdr *udph;
        unsigned int udphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn
                 */
-               if (!ip_vs_app_pkt_in(cp, skb))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - udphoff;
+               else
+                       payload_csum = 1;
        }
 
        udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
         *      Adjust UDP checksums
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+               udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
                                        htons(oldlen),
                                        htons(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
+       } else if (!payload_csum && (udph->check != 0)) {
                /* Only port and addr are changed, do fast csum update */
                udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
                                     cp->vport, cp->dport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                                    skb->len - udphoff,
                                                    ipv6_hdr(skb)->nexthdr,
                                                    skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                       IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                         "Failed checksum for");
                                        return 0;
                                }
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                                      skb->len - udphoff,
                                                      ip_hdr(skb)->protocol,
                                                      skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                       IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                         "Failed checksum for");
                                        return 0;
                                }
index b0bd8afbf3686088635e70c0b7269cad2a099d9d..de04ea39cde8990025bdb5f63ff408fcc948fb0a 100644 (file)
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients), but for ICMP they can be called from FORWARD
+ * - not all connections have a destination server; for example,
+ * connections on a backup server when fwmark is used
+ * - bypass connections use the daddr from the packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - this is the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -67,12 +77,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
        return dst;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *         &4=Allow redirect from remote daddr to local
+ */
 static struct rtable *
-__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+                  __be32 daddr, u32 rtos, int rt_mode)
 {
-       struct net *net = dev_net(skb->dev);
+       struct net *net = dev_net(skb_dst(skb)->dev);
        struct rtable *rt;                      /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
+       struct rtable *ort;                     /* Original route */
+       int local;
 
        if (dest) {
                spin_lock(&dest->dst_lock);
@@ -104,23 +121,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
                        .oif = 0,
                        .nl_u = {
                                .ip4_u = {
-                                       .daddr = cp->daddr.ip,
+                                       .daddr = daddr,
                                        .saddr = 0,
                                        .tos = rtos, } },
                };
 
                if (ip_route_output_key(net, &rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
-                                    &cp->daddr.ip);
+                                    &daddr);
                        return NULL;
                }
        }
 
+       local = rt->rt_flags & RTCF_LOCAL;
+       if (!((local ? 1 : 2) & rt_mode)) {
+               IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+                            (rt->rt_flags & RTCF_LOCAL) ?
+                            "local":"non-local", &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+       if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+                                        ort->rt_flags & RTCF_LOCAL)) {
+               IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+                            "requires NAT method, dest: %pI4\n",
+                            &ip_hdr(skb)->daddr, &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+       if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+               IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+                            "to non-local address, dest: %pI4\n",
+                            &ip_hdr(skb)->saddr, &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+
        return rt;
 }
 
+/*
+ * Reroute packet to local IPv4 stack after DNAT.
+ *
+ * Re-runs the route lookup using daddr/saddr/tos read from the skb's
+ * current IP header and swaps the resulting route into the skb.
+ *
+ * Returns 1 once the new route is in place, 0 if the lookup fails or
+ * (output branch only) the looked-up route lacks RTCF_LOCAL.  In the
+ * output branch the skb's existing route is not touched on failure.
+ */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+       struct rtable *rt = skb_rtable(skb);
+       struct net_device *dev = rt->dst.dev;
+       struct net *net = dev_net(dev);
+       struct iphdr *iph = ip_hdr(skb);
+
+       if (rt->fl.iif) {       /* current route's flow key has a nonzero
+                                * iif: redo the lookup as an input route */
+               unsigned long orefdst = skb->_skb_refdst;
+
+               if (ip_route_input(skb, iph->daddr, iph->saddr,
+                                  iph->tos, skb->dev))
+                       return 0;
+               refdst_drop(orefdst);   /* reached only when ip_route_input()
+                                        * returned 0; releases the refdst
+                                        * value saved above (presumably the
+                                        * skb now holds the new route --
+                                        * TODO confirm) */
+       } else {
+               struct flowi fl = {
+                       .oif = 0,
+                       .nl_u = {
+                               .ip4_u = {
+                                       .daddr = iph->daddr,
+                                       .saddr = iph->saddr,
+                                       .tos = RT_TOS(iph->tos),
+                               }
+                       },
+                       .mark = skb->mark,
+               };
+               struct rtable *rt;      /* shadows the outer rt; receives the
+                                        * output-lookup result */
+
+               if (ip_route_output_key(net, &rt, &fl))
+                       return 0;
+               if (!(rt->rt_flags & RTCF_LOCAL)) {     /* only a local route
+                                                        * is accepted here */
+                       ip_rt_put(rt);
+                       return 0;
+               }
+               /* Drop old route and attach the newly looked-up one. */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       }
+       return 1;
+}
+
 #ifdef CONFIG_IP_VS_IPV6
 
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{      /*
+        * Treats an IPv6 route as local when it has a device and that
+        * device's flags include IFF_LOOPBACK.  The result is 0 or 1
+        * (value of &&); since & binds tighter than &&, the test reads
+        * as dev && (dev->flags & IFF_LOOPBACK).
+        */
+       return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+}
+
 static struct dst_entry *
 __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
                        struct in6_addr *ret_saddr, int do_xfrm)
@@ -155,14 +244,21 @@ out_err:
        return NULL;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *         &4=Allow redirect from remote daddr to local
+ */
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-                     struct in6_addr *ret_saddr, int do_xfrm)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+                     struct in6_addr *daddr, struct in6_addr *ret_saddr,
+                     int do_xfrm, int rt_mode)
 {
-       struct net *net = dev_net(skb->dev);
+       struct net *net = dev_net(skb_dst(skb)->dev);
        struct rt6_info *rt;                    /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
+       struct rt6_info *ort;                   /* Original route */
        struct dst_entry *dst;
+       int local;
 
        if (dest) {
                spin_lock(&dest->dst_lock);
@@ -188,13 +284,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                        ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
                spin_unlock(&dest->dst_lock);
        } else {
-               dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
-                                             do_xfrm);
+               dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
                if (!dst)
                        return NULL;
                rt = (struct rt6_info *) dst;
        }
 
+       local = __ip_vs_is_local_route6(rt);
+       if (!((local ? 1 : 2) & rt_mode)) {
+               IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+                            local ? "local":"non-local", daddr);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+       if (local && !(rt_mode & 4) &&
+           !((ort = (struct rt6_info *) skb_dst(skb)) &&
+             __ip_vs_is_local_route6(ort))) {
+               IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+                            "requires NAT method, dest: %pI6\n",
+                            &ipv6_hdr(skb)->daddr, daddr);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+       if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+                    ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+                                   IPV6_ADDR_LOOPBACK)) {
+               IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+                            "to non-local address, dest: %pI6\n",
+                            &ipv6_hdr(skb)->saddr, daddr);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+
        return rt;
 }
 #endif
@@ -217,30 +338,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 ({                                                             \
        int __ret = NF_ACCEPT;                                  \
                                                                \
+       (skb)->ipvs_property = 1;                               \
        if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))          \
                __ret = ip_vs_confirm_conntrack(skb, cp);       \
        if (__ret == NF_ACCEPT) {                               \
                nf_reset(skb);                                  \
-               (skb)->ip_summed = CHECKSUM_NONE;               \
+               skb_forward_csum(skb);                          \
        }                                                       \
        __ret;                                                  \
 })
 
-#define IP_VS_XMIT_NAT(pf, skb, cp)                            \
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)             \
 do {                                                   \
+       (skb)->ipvs_property = 1;                       \
        if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
-               (skb)->ipvs_property = 1;               \
+               ip_vs_notrack(skb);                     \
        else                                            \
                ip_vs_update_conntrack(skb, cp, 1);     \
+       if (local)                                      \
+               return NF_ACCEPT;                       \
        skb_forward_csum(skb);                          \
        NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
                skb_dst(skb)->dev, dst_output);         \
 } while (0)
 
-#define IP_VS_XMIT(pf, skb, cp)                                \
+#define IP_VS_XMIT(pf, skb, cp, local)                 \
 do {                                                   \
+       (skb)->ipvs_property = 1;                       \
        if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
-               (skb)->ipvs_property = 1;               \
+               ip_vs_notrack(skb);                     \
+       if (local)                                      \
+               return NF_ACCEPT;                       \
        skb_forward_csum(skb);                          \
        NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
                skb_dst(skb)->dev, dst_output);         \
@@ -255,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp)
 {
        /* we do not touch skb and do not need pskb ptr */
-       return NF_ACCEPT;
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -268,27 +396,15 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
 {
-       struct net *net = dev_net(skb->dev);
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
-       u8     tos = iph->tos;
        int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip4_u = {
-                               .daddr = iph->daddr,
-                               .saddr = 0,
-                               .tos = RT_TOS(tos), } },
-       };
 
        EnterFunction(10);
 
-       if (ip_route_output_key(net, &rt, &fl)) {
-               IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
-                            __func__, &iph->daddr);
+       if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+                                     RT_TOS(iph->tos), 2)))
                goto tx_error_icmp;
-       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
@@ -316,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -334,24 +450,25 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                     struct ip_vs_protocol *pp)
 {
-       struct net *net = dev_net(skb->dev);
-       struct dst_entry *dst;
        struct rt6_info *rt;                    /* Route to the other host */
        struct ipv6hdr  *iph = ipv6_hdr(skb);
        int    mtu;
 
        EnterFunction(10);
 
-       dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
-       if (!dst)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
                goto tx_error_icmp;
-       rt = (struct rt6_info *) dst;
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }
@@ -373,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -398,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rtable *rt;              /* Route to the other host */
        int mtu;
        struct iphdr *iph = ip_hdr(skb);
+       int local;
 
        EnterFunction(10);
 
@@ -411,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }
 
-       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(iph->tos), 1|2|4)))
                goto tx_error_icmp;
+       local = rt->rt_flags & RTCF_LOCAL;
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+                                        "ip_vs_nat_xmit(): "
+                                        "stopping DNAT to local address");
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+               IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+                                "stopping DNAT to loopback address");
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-               goto tx_error;
+               IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+                                "ip_vs_nat_xmit(): frag needed for");
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -430,17 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop old route */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-               goto tx_error;
+               goto tx_error_put;
        ip_hdr(skb)->daddr = cp->daddr.ip;
        ip_send_check(ip_hdr(skb));
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+       if (!local) {
+               /* drop old route */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               ip_rt_put(rt);
+               /*
+                * Some IPv4 replies get local address from routes,
+                * not from iph, so while we DNAT after routing
+                * we need this second input/output route.
+                */
+               if (!__ip_vs_reroute_locally(skb))
+                       goto tx_error;
+       }
+
+       IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
        /* FIXME: when application helper enlarges the packet and the length
           is larger than the MTU of outgoing device, there will be still
@@ -449,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
+       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -472,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
        struct rt6_info *rt;            /* Route to the other host */
        int mtu;
+       int local;
 
        EnterFunction(10);
 
@@ -486,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }
 
-       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2|4)))
                goto tx_error_icmp;
+       local = __ip_vs_is_local_route6(rt);
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+                                        "ip_vs_nat_xmit_v6(): "
+                                        "stopping DNAT to local address");
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+           ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+               IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+                                "ip_vs_nat_xmit_v6(): "
+                                "stopping DNAT to loopback address");
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+               IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
                                 "ip_vs_nat_xmit_v6(): frag needed for");
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -507,16 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop old route */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
                goto tx_error;
-       ipv6_hdr(skb)->daddr = cp->daddr.in6;
+       ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+       if (!local || !skb->dev) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               /* destined to loopback, do we need to change route? */
+               dst_release(&rt->dst);
+       }
+
+       IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
        /* FIXME: when application helper enlarges the packet and the length
           is larger than the MTU of outgoing device, there will be still
@@ -525,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
+       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -578,23 +770,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IP)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IP: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IP), skb->protocol);
-               goto tx_error;
-       }
-
-       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(tos), 1|2)))
                goto tx_error_icmp;
+       if (rt->rt_flags & RTCF_LOCAL) {
+               ip_rt_put(rt);
+               IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+       }
 
        tdev = rt->dst.dev;
 
        mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
        if (mtu < 68) {
-               ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -604,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if ((old_iph->frag_off & htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /*
@@ -675,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
+tx_error_put:
+       ip_rt_put(rt);
+       goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -693,34 +884,34 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IPV6)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IPV6: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IPV6), skb->protocol);
-               goto tx_error;
-       }
-
-       rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+                                        &saddr, 1, 1|2)))
                goto tx_error_icmp;
+       if (__ip_vs_is_local_route6(rt)) {
+               dst_release(&rt->dst);
+               IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+       }
 
        tdev = rt->dst.dev;
 
        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
        if (mtu < IPV6_MIN_MTU) {
-               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
                             IPV6_MIN_MTU);
-               goto tx_error;
+               goto tx_error_put;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
        if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /*
@@ -786,6 +977,9 @@ tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
+tx_error_put:
+       dst_release(&rt->dst);
+       goto tx_error;
 }
 #endif
 
@@ -804,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(iph->tos), 1|2)))
                goto tx_error_icmp;
+       if (rt->rt_flags & RTCF_LOCAL) {
+               ip_rt_put(rt);
+               IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
@@ -833,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -856,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2)))
                goto tx_error_icmp;
+       if (__ip_vs_is_local_route6(rt)) {
+               dst_release(&rt->dst);
+               IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -886,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -912,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rtable   *rt;    /* Route to the other host */
        int mtu;
        int rc;
+       int local;
 
        EnterFunction(10);
 
@@ -932,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         * mangle and send the packet here (only for VS/NAT)
         */
 
-       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
                goto tx_error_icmp;
+       local = rt->rt_flags & RTCF_LOCAL;
+
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG(10, "%s(): "
+                                 "stopping DNAT to local address %pI4\n",
+                                 __func__, &cp->daddr.ip);
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+               IP_VS_DBG(1, "%s(): "
+                         "stopping DNAT to loopback %pI4\n",
+                         __func__, &cp->daddr.ip);
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -951,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop the old route when skb is not shared */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        ip_vs_nat_icmp(skb, pp, cp, 0);
 
+       if (!local) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               ip_rt_put(rt);
+               /*
+                * Some IPv4 replies get local address from routes,
+                * not from iph, so while we DNAT after routing
+                * we need this second input/output route.
+                */
+               if (!__ip_vs_reroute_locally(skb))
+                       goto tx_error;
+       }
+
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
+       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
        rc = NF_STOLEN;
        goto out;
@@ -986,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rt6_info *rt;    /* Route to the other host */
        int mtu;
        int rc;
+       int local;
 
        EnterFunction(10);
 
@@ -1006,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         * mangle and send the packet here (only for VS/NAT)
         */
 
-       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2|4)))
                goto tx_error_icmp;
 
+       local = __ip_vs_is_local_route6(rt);
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG(10, "%s(): "
+                                 "stopping DNAT to local address %pI6\n",
+                                 __func__, &cp->daddr.in6);
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+           ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+               IP_VS_DBG(1, "%s(): "
+                         "stopping DNAT to loopback %pI6\n",
+                         __func__, &cp->daddr.in6);
+               goto tx_error_put;
+       }
+
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -1026,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop the old route when skb is not shared */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
+       if (!local || !skb->dev) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               /* destined to loopback, do we need to change route? */
+               dst_release(&rt->dst);
+       }
+
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
+       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
        rc = NF_STOLEN;
        goto out;