bbs.cooldavid.org Git - net-next-2.6.git/commitdiff
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6
author David S. Miller <davem@davemloft.net>
Thu, 21 Oct 2010 15:21:34 +0000 (08:21 -0700)
committer David S. Miller <davem@davemloft.net>
Thu, 21 Oct 2010 15:21:34 +0000 (08:21 -0700)
78 files changed:
include/linux/in6.h
include/linux/ip_vs.h
include/linux/ipv6.h
include/linux/netfilter/nf_conntrack_common.h
include/linux/netfilter/nf_conntrack_sip.h
include/linux/netfilter/nfnetlink_conntrack.h
include/linux/netfilter/x_tables.h
include/linux/netfilter/xt_TPROXY.h
include/linux/netfilter_arp/arp_tables.h
include/linux/netfilter_bridge/Kbuild
include/linux/netfilter_ipv4/ip_tables.h
include/linux/netfilter_ipv6/ip6_tables.h
include/net/inet_hashtables.h
include/net/ip_vs.h
include/net/netfilter/ipv6/nf_defrag_ipv6.h [new file with mode: 0644]
include/net/netfilter/nf_conntrack_expect.h
include/net/netfilter/nf_nat_protocol.h
include/net/netfilter/nf_tproxy_core.h
include/net/netfilter/xt_log.h [new file with mode: 0644]
include/net/udp.h
net/dccp/ipv4.c
net/dccp/ipv6.c
net/ipv4/inet_hashtables.c
net/ipv4/netfilter/Kconfig
net/ipv4/netfilter/arp_tables.c
net/ipv4/netfilter/arpt_mangle.c
net/ipv4/netfilter/ip_tables.c
net/ipv4/netfilter/ipt_LOG.c
net/ipv4/netfilter/nf_nat_amanda.c
net/ipv4/netfilter/nf_nat_core.c
net/ipv4/netfilter/nf_nat_ftp.c
net/ipv4/netfilter/nf_nat_h323.c
net/ipv4/netfilter/nf_nat_helper.c
net/ipv4/netfilter/nf_nat_irc.c
net/ipv4/netfilter/nf_nat_rule.c
net/ipv4/netfilter/nf_nat_sip.c
net/ipv4/tcp_ipv4.c
net/ipv6/af_inet6.c
net/ipv6/datagram.c
net/ipv6/ipv6_sockglue.c
net/ipv6/netfilter/Kconfig
net/ipv6/netfilter/Makefile
net/ipv6/netfilter/ip6_tables.c
net/ipv6/netfilter/ip6t_LOG.c
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
net/ipv6/netfilter/nf_conntrack_reasm.c
net/ipv6/netfilter/nf_defrag_ipv6_hooks.c [new file with mode: 0644]
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/netfilter/core.c
net/netfilter/ipvs/Kconfig
net/netfilter/ipvs/Makefile
net/netfilter/ipvs/ip_vs_app.c
net/netfilter/ipvs/ip_vs_conn.c
net/netfilter/ipvs/ip_vs_core.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/ipvs/ip_vs_ftp.c
net/netfilter/ipvs/ip_vs_nfct.c [new file with mode: 0644]
net/netfilter/ipvs/ip_vs_pe.c [new file with mode: 0644]
net/netfilter/ipvs/ip_vs_pe_sip.c [new file with mode: 0644]
net/netfilter/ipvs/ip_vs_proto.c
net/netfilter/ipvs/ip_vs_proto_ah_esp.c
net/netfilter/ipvs/ip_vs_proto_sctp.c
net/netfilter/ipvs/ip_vs_proto_tcp.c
net/netfilter/ipvs/ip_vs_proto_udp.c
net/netfilter/ipvs/ip_vs_sched.c
net/netfilter/ipvs/ip_vs_sync.c
net/netfilter/ipvs/ip_vs_xmit.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_expect.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_sip.c
net/netfilter/nf_tproxy_core.c
net/netfilter/x_tables.c
net/netfilter/xt_TPROXY.c
net/netfilter/xt_ipvs.c
net/netfilter/xt_socket.c
net/sched/act_ipt.c

index c4bf46f764bf34bffab67e6de3ec9c8ddee70b0e..097a34b55560a9d4434f61f78b85f5fbbef8e2e7 100644 (file)
@@ -268,6 +268,10 @@ struct in6_flowlabel_req {
 /* RFC5082: Generalized Ttl Security Mechanism */
 #define IPV6_MINHOPCOUNT               73
 
+#define IPV6_ORIGDSTADDR        74
+#define IPV6_RECVORIGDSTADDR    IPV6_ORIGDSTADDR
+#define IPV6_TRANSPARENT        75
+
 /*
  * Multicast Routing:
  * see include/linux/mroute6.h.
index 9708de265bb1e24f1b6130c2f0ca3ace1b249dfc..5f43a3b2e3ad7149c13f2552987c8b5ce1749fcb 100644 (file)
@@ -70,6 +70,7 @@
 
 /*
  *      IPVS Connection Flags
+ *      Only flags 0..15 are sent to backup server
  */
 #define IP_VS_CONN_F_FWD_MASK  0x0007          /* mask for the fwd methods */
 #define IP_VS_CONN_F_MASQ      0x0000          /* masquerading/NAT */
 #define IP_VS_CONN_F_TEMPLATE  0x1000          /* template, not connection */
 #define IP_VS_CONN_F_ONE_PACKET        0x2000          /* forward only one packet */
 
+/* Flags that are not sent to backup server start from bit 16 */
+#define IP_VS_CONN_F_NFCT      (1 << 16)       /* use netfilter conntrack */
+
+/* Connection flags from destination that can be changed by user space */
+#define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \
+                               IP_VS_CONN_F_ONE_PACKET | \
+                               IP_VS_CONN_F_NFCT | \
+                               0)
+
 #define IP_VS_SCHEDNAME_MAXLEN 16
+#define IP_VS_PENAME_MAXLEN    16
 #define IP_VS_IFNAME_MAXLEN    16
 
+#define IP_VS_PEDATA_MAXLEN     255
 
 /*
  *     The struct ip_vs_service_user and struct ip_vs_dest_user are
@@ -324,6 +336,9 @@ enum {
        IPVS_SVC_ATTR_NETMASK,          /* persistent netmask */
 
        IPVS_SVC_ATTR_STATS,            /* nested attribute for service stats */
+
+       IPVS_SVC_ATTR_PE_NAME,          /* name of ct retriever */
+
        __IPVS_SVC_ATTR_MAX,
 };
 
index e62683ba88e6824e72b3e8c81c7315998868f606..8e429d0e0405df6f50ebbfe9273d719f18facc57 100644 (file)
@@ -341,7 +341,9 @@ struct ipv6_pinfo {
                                odstopts:1,
                                 rxflow:1,
                                rxtclass:1,
-                               rxpmtu:1;
+                               rxpmtu:1,
+                               rxorigdstaddr:1;
+                               /* 2 bits hole */
                } bits;
                __u16           all;
        } rxopt;
index 1afd18c855ec99d9cf8cb4b9570b06b6e28b74d2..50cdc2559a5aa05a10e4d1115c48da7185a5e7bd 100644 (file)
@@ -98,8 +98,14 @@ enum ip_conntrack_events {
 
 enum ip_conntrack_expect_events {
        IPEXP_NEW,              /* new expectation */
+       IPEXP_DESTROY,          /* destroyed expectation */
 };
 
+/* expectation flags */
+#define NF_CT_EXPECT_PERMANENT         0x1
+#define NF_CT_EXPECT_INACTIVE          0x2
+#define NF_CT_EXPECT_USERSPACE         0x4
+
 #ifdef __KERNEL__
 struct ip_conntrack_stat {
        unsigned int searched;
index ff8cfbcf3b81148e483831281752572b2faaac1b..0ce91d56a5f264c989ee5b17a027b489eaabc593 100644 (file)
@@ -89,6 +89,7 @@ enum sip_header_types {
        SIP_HDR_VIA_TCP,
        SIP_HDR_EXPIRES,
        SIP_HDR_CONTENT_LENGTH,
+       SIP_HDR_CALL_ID,
 };
 
 enum sdp_header_types {
index 9ed534c991b9312d84876c9162ed63e5ef2b5c66..455f0ce4f43041e5e4226764f71e48e7a8b6d4e1 100644 (file)
@@ -161,6 +161,7 @@ enum ctattr_expect {
        CTA_EXPECT_ID,
        CTA_EXPECT_HELP_NAME,
        CTA_EXPECT_ZONE,
+       CTA_EXPECT_FLAGS,
        __CTA_EXPECT_MAX
 };
 #define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1)
index 24e5d01d27d07b860bfed06701fb2d864ee89bc3..742bec051440e4f800bbf00c5aee2ab468541958 100644 (file)
@@ -66,6 +66,11 @@ struct xt_standard_target {
        int verdict;
 };
 
+struct xt_error_target {
+       struct xt_entry_target target;
+       char errorname[XT_FUNCTION_MAXNAMELEN];
+};
+
 /* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
  * kernel supports, if >= revision. */
 struct xt_get_revision {
index 152e8f97132b1a213271075c16c3ac0b2ce9895d..3f3d69361289ca4cecda2182f71d62e0b7137053 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _XT_TPROXY_H_target
-#define _XT_TPROXY_H_target
+#ifndef _XT_TPROXY_H
+#define _XT_TPROXY_H
 
 /* TPROXY target is capable of marking the packet to perform
  * redirection. We can get rid of that whenever we get support for
@@ -11,4 +11,11 @@ struct xt_tproxy_target_info {
        __be16 lport;
 };
 
-#endif /* _XT_TPROXY_H_target */
+struct xt_tproxy_target_info_v1 {
+       u_int32_t mark_mask;
+       u_int32_t mark_value;
+       union nf_inet_addr laddr;
+       __be16 lport;
+};
+
+#endif /* _XT_TPROXY_H */
index e9948c0560f6fb302d29b5be35c6fd083d8409a7..adbf4bff87eda6823689222410cb625b63488150 100644 (file)
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define ARPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define ARPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
+#define arpt_entry_target xt_entry_target
+#define arpt_standard_target xt_standard_target
+#define arpt_error_target xt_error_target
+#define ARPT_CONTINUE XT_CONTINUE
+#define ARPT_RETURN XT_RETURN
+#define arpt_counters_info xt_counters_info
+#define arpt_counters xt_counters
+#define ARPT_STANDARD_TARGET XT_STANDARD_TARGET
+#define ARPT_ERROR_TARGET XT_ERROR_TARGET
+#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \
+       XT_ENTRY_ITERATE(struct arpt_entry, entries, size, fn, ## args)
+#endif
 
 #define ARPT_DEV_ADDR_LEN_MAX 16
 
@@ -63,9 +76,6 @@ struct arpt_arp {
        u_int16_t invflags;
 };
 
-#define arpt_entry_target xt_entry_target
-#define arpt_standard_target xt_standard_target
-
 /* Values for "flag" field in struct arpt_ip (general arp structure).
  * No flags defined yet.
  */
@@ -125,16 +135,10 @@ struct arpt_entry
 #define ARPT_SO_GET_REVISION_TARGET    (ARPT_BASE_CTL + 3)
 #define ARPT_SO_GET_MAX                        (ARPT_SO_GET_REVISION_TARGET)
 
-/* CONTINUE verdict for targets */
-#define ARPT_CONTINUE XT_CONTINUE
-
-/* For standard target */
-#define ARPT_RETURN XT_RETURN
-
 /* The argument to ARPT_SO_GET_INFO */
 struct arpt_getinfo {
        /* Which table: caller fills this in. */
-       char name[ARPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Kernel fills these in. */
        /* Which hook entry points are valid: bitmask */
@@ -156,7 +160,7 @@ struct arpt_getinfo {
 /* The argument to ARPT_SO_SET_REPLACE. */
 struct arpt_replace {
        /* Which table. */
-       char name[ARPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -184,14 +188,10 @@ struct arpt_replace {
        struct arpt_entry entries[0];
 };
 
-/* The argument to ARPT_SO_ADD_COUNTERS. */
-#define arpt_counters_info xt_counters_info
-#define arpt_counters xt_counters
-
 /* The argument to ARPT_SO_GET_ENTRIES. */
 struct arpt_get_entries {
        /* Which table: user fills this in. */
-       char name[ARPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* User fills this in: total entry size. */
        unsigned int size;
@@ -200,23 +200,12 @@ struct arpt_get_entries {
        struct arpt_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define ARPT_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define ARPT_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
-static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e)
+static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \
-       XT_ENTRY_ITERATE(struct arpt_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *     Main firewall chains definitions and global var's definitions.
  */
@@ -225,17 +214,12 @@ static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e
 /* Standard entry. */
 struct arpt_standard {
        struct arpt_entry entry;
-       struct arpt_standard_target target;
-};
-
-struct arpt_error_target {
-       struct arpt_entry_target target;
-       char errorname[ARPT_FUNCTION_MAXNAMELEN];
+       struct xt_standard_target target;
 };
 
 struct arpt_error {
        struct arpt_entry entry;
-       struct arpt_error_target target;
+       struct xt_error_target target;
 };
 
 #define ARPT_ENTRY_INIT(__size)                                                       \
@@ -247,16 +231,16 @@ struct arpt_error {
 #define ARPT_STANDARD_INIT(__verdict)                                         \
 {                                                                             \
        .entry          = ARPT_ENTRY_INIT(sizeof(struct arpt_standard)),       \
-       .target         = XT_TARGET_INIT(ARPT_STANDARD_TARGET,                 \
-                                        sizeof(struct arpt_standard_target)), \
+       .target         = XT_TARGET_INIT(XT_STANDARD_TARGET,                   \
+                                        sizeof(struct xt_standard_target)), \
        .target.verdict = -(__verdict) - 1,                                    \
 }
 
 #define ARPT_ERROR_INIT                                                               \
 {                                                                             \
        .entry          = ARPT_ENTRY_INIT(sizeof(struct arpt_error)),          \
-       .target         = XT_TARGET_INIT(ARPT_ERROR_TARGET,                    \
-                                        sizeof(struct arpt_error_target)),    \
+       .target         = XT_TARGET_INIT(XT_ERROR_TARGET,                      \
+                                        sizeof(struct xt_error_target)),      \
        .target.errorname = "ERROR",                                           \
 }
 
@@ -271,8 +255,6 @@ extern unsigned int arpt_do_table(struct sk_buff *skb,
                                  const struct net_device *out,
                                  struct xt_table *table);
 
-#define ARPT_ALIGN(s) XT_ALIGN(s)
-
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
@@ -285,14 +267,12 @@ struct compat_arpt_entry {
        unsigned char elems[0];
 };
 
-static inline struct arpt_entry_target *
+static inline struct xt_entry_target *
 compat_arpt_get_target(struct compat_arpt_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#define COMPAT_ARPT_ALIGN(s)   COMPAT_XT_ALIGN(s)
-
 #endif /* CONFIG_COMPAT */
 #endif /*__KERNEL__*/
 #endif /* _ARPTABLES_H */
index d4d78672873e6ad29c886a8125dbe3a37078e6c8..e48f1a3f5a4affda5e9ee3f39fe6709c893dfd7c 100644 (file)
@@ -3,11 +3,13 @@ header-y += ebt_among.h
 header-y += ebt_arp.h
 header-y += ebt_arpreply.h
 header-y += ebt_ip.h
+header-y += ebt_ip6.h
 header-y += ebt_limit.h
 header-y += ebt_log.h
 header-y += ebt_mark_m.h
 header-y += ebt_mark_t.h
 header-y += ebt_nat.h
+header-y += ebt_nflog.h
 header-y += ebt_pkttype.h
 header-y += ebt_redirect.h
 header-y += ebt_stp.h
index 704a7b6e81698a2585f766ef7b5a0e0ac8896b8f..64a5d95c58e8a6057679a402186f0188931e35b1 100644 (file)
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define IPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define IPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
 #define ipt_match xt_match
 #define ipt_target xt_target
 #define ipt_table xt_table
 #define ipt_get_revision xt_get_revision
+#define ipt_entry_match xt_entry_match
+#define ipt_entry_target xt_entry_target
+#define ipt_standard_target xt_standard_target
+#define ipt_error_target xt_error_target
+#define ipt_counters xt_counters
+#define IPT_CONTINUE XT_CONTINUE
+#define IPT_RETURN XT_RETURN
+
+/* This group is older than old (iptables < v1.4.0-rc1~89) */
+#include <linux/netfilter/xt_tcpudp.h>
+#define ipt_udp xt_udp
+#define ipt_tcp xt_tcp
+#define IPT_TCP_INV_SRCPT      XT_TCP_INV_SRCPT
+#define IPT_TCP_INV_DSTPT      XT_TCP_INV_DSTPT
+#define IPT_TCP_INV_FLAGS      XT_TCP_INV_FLAGS
+#define IPT_TCP_INV_OPTION     XT_TCP_INV_OPTION
+#define IPT_TCP_INV_MASK       XT_TCP_INV_MASK
+#define IPT_UDP_INV_SRCPT      XT_UDP_INV_SRCPT
+#define IPT_UDP_INV_DSTPT      XT_UDP_INV_DSTPT
+#define IPT_UDP_INV_MASK       XT_UDP_INV_MASK
+
+/* The argument to IPT_SO_ADD_COUNTERS. */
+#define ipt_counters_info xt_counters_info
+/* Standard return verdict, or do jump. */
+#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
+/* Error verdict. */
+#define IPT_ERROR_TARGET XT_ERROR_TARGET
+
+/* fn returns 0 to continue iteration */
+#define IPT_MATCH_ITERATE(e, fn, args...) \
+       XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args)
+
+/* fn returns 0 to continue iteration */
+#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
+       XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args)
+#endif
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ipt_ip {
@@ -52,12 +89,6 @@ struct ipt_ip {
        u_int8_t invflags;
 };
 
-#define ipt_entry_match xt_entry_match
-#define ipt_entry_target xt_entry_target
-#define ipt_standard_target xt_standard_target
-
-#define ipt_counters xt_counters
-
 /* Values for "flag" field in struct ipt_ip (general ip structure). */
 #define IPT_F_FRAG             0x01    /* Set if rule is a fragment rule */
 #define IPT_F_GOTO             0x02    /* Set if jump is a goto */
@@ -116,23 +147,6 @@ struct ipt_entry {
 #define IPT_SO_GET_REVISION_TARGET     (IPT_BASE_CTL + 3)
 #define IPT_SO_GET_MAX                 IPT_SO_GET_REVISION_TARGET
 
-#define IPT_CONTINUE XT_CONTINUE
-#define IPT_RETURN XT_RETURN
-
-#include <linux/netfilter/xt_tcpudp.h>
-#define ipt_udp xt_udp
-#define ipt_tcp xt_tcp
-
-#define IPT_TCP_INV_SRCPT      XT_TCP_INV_SRCPT
-#define IPT_TCP_INV_DSTPT      XT_TCP_INV_DSTPT
-#define IPT_TCP_INV_FLAGS      XT_TCP_INV_FLAGS
-#define IPT_TCP_INV_OPTION     XT_TCP_INV_OPTION
-#define IPT_TCP_INV_MASK       XT_TCP_INV_MASK
-
-#define IPT_UDP_INV_SRCPT      XT_UDP_INV_SRCPT
-#define IPT_UDP_INV_DSTPT      XT_UDP_INV_DSTPT
-#define IPT_UDP_INV_MASK       XT_UDP_INV_MASK
-
 /* ICMP matching stuff */
 struct ipt_icmp {
        u_int8_t type;                          /* type to match */
@@ -146,7 +160,7 @@ struct ipt_icmp {
 /* The argument to IPT_SO_GET_INFO */
 struct ipt_getinfo {
        /* Which table: caller fills this in. */
-       char name[IPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Kernel fills these in. */
        /* Which hook entry points are valid: bitmask */
@@ -168,7 +182,7 @@ struct ipt_getinfo {
 /* The argument to IPT_SO_SET_REPLACE. */
 struct ipt_replace {
        /* Which table. */
-       char name[IPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -196,13 +210,10 @@ struct ipt_replace {
        struct ipt_entry entries[0];
 };
 
-/* The argument to IPT_SO_ADD_COUNTERS. */
-#define ipt_counters_info xt_counters_info
-
 /* The argument to IPT_SO_GET_ENTRIES. */
 struct ipt_get_entries {
        /* Which table: user fills this in. */
-       char name[IPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* User fills this in: total entry size. */
        unsigned int size;
@@ -211,28 +222,13 @@ struct ipt_get_entries {
        struct ipt_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define IPT_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
-static __inline__ struct ipt_entry_target *
+static __inline__ struct xt_entry_target *
 ipt_get_target(struct ipt_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define IPT_MATCH_ITERATE(e, fn, args...) \
-       XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args)
-
-/* fn returns 0 to continue iteration */
-#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
-       XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *     Main firewall chains definitions and global var's definitions.
  */
@@ -249,17 +245,12 @@ extern void ipt_unregister_table(struct net *net, struct xt_table *table);
 /* Standard entry. */
 struct ipt_standard {
        struct ipt_entry entry;
-       struct ipt_standard_target target;
-};
-
-struct ipt_error_target {
-       struct ipt_entry_target target;
-       char errorname[IPT_FUNCTION_MAXNAMELEN];
+       struct xt_standard_target target;
 };
 
 struct ipt_error {
        struct ipt_entry entry;
-       struct ipt_error_target target;
+       struct xt_error_target target;
 };
 
 #define IPT_ENTRY_INIT(__size)                                                \
@@ -271,7 +262,7 @@ struct ipt_error {
 #define IPT_STANDARD_INIT(__verdict)                                          \
 {                                                                             \
        .entry          = IPT_ENTRY_INIT(sizeof(struct ipt_standard)),         \
-       .target         = XT_TARGET_INIT(IPT_STANDARD_TARGET,                  \
+       .target         = XT_TARGET_INIT(XT_STANDARD_TARGET,                   \
                                         sizeof(struct xt_standard_target)),   \
        .target.verdict = -(__verdict) - 1,                                    \
 }
@@ -279,8 +270,8 @@ struct ipt_error {
 #define IPT_ERROR_INIT                                                        \
 {                                                                             \
        .entry          = IPT_ENTRY_INIT(sizeof(struct ipt_error)),            \
-       .target         = XT_TARGET_INIT(IPT_ERROR_TARGET,                     \
-                                        sizeof(struct ipt_error_target)),     \
+       .target         = XT_TARGET_INIT(XT_ERROR_TARGET,                      \
+                                        sizeof(struct xt_error_target)),      \
        .target.errorname = "ERROR",                                           \
 }
 
@@ -291,8 +282,6 @@ extern unsigned int ipt_do_table(struct sk_buff *skb,
                                 const struct net_device *out,
                                 struct xt_table *table);
 
-#define IPT_ALIGN(s) XT_ALIGN(s)
-
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
@@ -307,14 +296,12 @@ struct compat_ipt_entry {
 };
 
 /* Helper functions */
-static inline struct ipt_entry_target *
+static inline struct xt_entry_target *
 compat_ipt_get_target(struct compat_ipt_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#define COMPAT_IPT_ALIGN(s)    COMPAT_XT_ALIGN(s)
-
 #endif /* CONFIG_COMPAT */
 #endif /*__KERNEL__*/
 #endif /* _IPTABLES_H */
index 18442ff19c07ce1a213d5a4f81244907af8cd178..c9784f7a9c1f31675d7a64cff2246623aea495ef 100644 (file)
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define IP6T_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define IP6T_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
-
 #define ip6t_match xt_match
 #define ip6t_target xt_target
 #define ip6t_table xt_table
 #define ip6t_get_revision xt_get_revision
+#define ip6t_entry_match xt_entry_match
+#define ip6t_entry_target xt_entry_target
+#define ip6t_standard_target xt_standard_target
+#define ip6t_error_target xt_error_target
+#define ip6t_counters xt_counters
+#define IP6T_CONTINUE XT_CONTINUE
+#define IP6T_RETURN XT_RETURN
+
+/* Pre-iptables-1.4.0 */
+#include <linux/netfilter/xt_tcpudp.h>
+#define ip6t_tcp xt_tcp
+#define ip6t_udp xt_udp
+#define IP6T_TCP_INV_SRCPT     XT_TCP_INV_SRCPT
+#define IP6T_TCP_INV_DSTPT     XT_TCP_INV_DSTPT
+#define IP6T_TCP_INV_FLAGS     XT_TCP_INV_FLAGS
+#define IP6T_TCP_INV_OPTION    XT_TCP_INV_OPTION
+#define IP6T_TCP_INV_MASK      XT_TCP_INV_MASK
+#define IP6T_UDP_INV_SRCPT     XT_UDP_INV_SRCPT
+#define IP6T_UDP_INV_DSTPT     XT_UDP_INV_DSTPT
+#define IP6T_UDP_INV_MASK      XT_UDP_INV_MASK
+
+#define ip6t_counters_info xt_counters_info
+#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
+#define IP6T_ERROR_TARGET XT_ERROR_TARGET
+#define IP6T_MATCH_ITERATE(e, fn, args...) \
+       XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args)
+#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \
+       XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args)
+#endif
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ip6t_ip6 {
@@ -62,12 +91,6 @@ struct ip6t_ip6 {
        u_int8_t invflags;
 };
 
-#define ip6t_entry_match xt_entry_match
-#define ip6t_entry_target xt_entry_target
-#define ip6t_standard_target xt_standard_target
-
-#define ip6t_counters  xt_counters
-
 /* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). */
 #define IP6T_F_PROTO           0x01    /* Set if rule cares about upper 
                                           protocols */
@@ -112,17 +135,12 @@ struct ip6t_entry {
 /* Standard entry */
 struct ip6t_standard {
        struct ip6t_entry entry;
-       struct ip6t_standard_target target;
-};
-
-struct ip6t_error_target {
-       struct ip6t_entry_target target;
-       char errorname[IP6T_FUNCTION_MAXNAMELEN];
+       struct xt_standard_target target;
 };
 
 struct ip6t_error {
        struct ip6t_entry entry;
-       struct ip6t_error_target target;
+       struct xt_error_target target;
 };
 
 #define IP6T_ENTRY_INIT(__size)                                                       \
@@ -134,16 +152,16 @@ struct ip6t_error {
 #define IP6T_STANDARD_INIT(__verdict)                                         \
 {                                                                             \
        .entry          = IP6T_ENTRY_INIT(sizeof(struct ip6t_standard)),       \
-       .target         = XT_TARGET_INIT(IP6T_STANDARD_TARGET,                 \
-                                        sizeof(struct ip6t_standard_target)), \
+       .target         = XT_TARGET_INIT(XT_STANDARD_TARGET,                   \
+                                        sizeof(struct xt_standard_target)),   \
        .target.verdict = -(__verdict) - 1,                                    \
 }
 
 #define IP6T_ERROR_INIT                                                               \
 {                                                                             \
        .entry          = IP6T_ENTRY_INIT(sizeof(struct ip6t_error)),          \
-       .target         = XT_TARGET_INIT(IP6T_ERROR_TARGET,                    \
-                                        sizeof(struct ip6t_error_target)),    \
+       .target         = XT_TARGET_INIT(XT_ERROR_TARGET,                      \
+                                        sizeof(struct xt_error_target)),      \
        .target.errorname = "ERROR",                                           \
 }
 
@@ -166,30 +184,6 @@ struct ip6t_error {
 #define IP6T_SO_GET_REVISION_TARGET    (IP6T_BASE_CTL + 5)
 #define IP6T_SO_GET_MAX                        IP6T_SO_GET_REVISION_TARGET
 
-/* CONTINUE verdict for targets */
-#define IP6T_CONTINUE XT_CONTINUE
-
-/* For standard target */
-#define IP6T_RETURN XT_RETURN
-
-/* TCP/UDP matching stuff */
-#include <linux/netfilter/xt_tcpudp.h>
-
-#define ip6t_tcp xt_tcp
-#define ip6t_udp xt_udp
-
-/* Values for "inv" field in struct ipt_tcp. */
-#define IP6T_TCP_INV_SRCPT     XT_TCP_INV_SRCPT
-#define IP6T_TCP_INV_DSTPT     XT_TCP_INV_DSTPT
-#define IP6T_TCP_INV_FLAGS     XT_TCP_INV_FLAGS
-#define IP6T_TCP_INV_OPTION    XT_TCP_INV_OPTION
-#define IP6T_TCP_INV_MASK      XT_TCP_INV_MASK
-
-/* Values for "invflags" field in struct ipt_udp. */
-#define IP6T_UDP_INV_SRCPT     XT_UDP_INV_SRCPT
-#define IP6T_UDP_INV_DSTPT     XT_UDP_INV_DSTPT
-#define IP6T_UDP_INV_MASK      XT_UDP_INV_MASK
-
 /* ICMP matching stuff */
 struct ip6t_icmp {
        u_int8_t type;                          /* type to match */
@@ -203,7 +197,7 @@ struct ip6t_icmp {
 /* The argument to IP6T_SO_GET_INFO */
 struct ip6t_getinfo {
        /* Which table: caller fills this in. */
-       char name[IP6T_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Kernel fills these in. */
        /* Which hook entry points are valid: bitmask */
@@ -225,7 +219,7 @@ struct ip6t_getinfo {
 /* The argument to IP6T_SO_SET_REPLACE. */
 struct ip6t_replace {
        /* Which table. */
-       char name[IP6T_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -253,13 +247,10 @@ struct ip6t_replace {
        struct ip6t_entry entries[0];
 };
 
-/* The argument to IP6T_SO_ADD_COUNTERS. */
-#define ip6t_counters_info xt_counters_info
-
 /* The argument to IP6T_SO_GET_ENTRIES. */
 struct ip6t_get_entries {
        /* Which table: user fills this in. */
-       char name[IP6T_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
 
        /* User fills this in: total entry size. */
        unsigned int size;
@@ -268,28 +259,13 @@ struct ip6t_get_entries {
        struct ip6t_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define IP6T_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
-static __inline__ struct ip6t_entry_target *
+static __inline__ struct xt_entry_target *
 ip6t_get_target(struct ip6t_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define IP6T_MATCH_ITERATE(e, fn, args...) \
-       XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args)
-
-/* fn returns 0 to continue iteration */
-#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \
-       XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *     Main firewall chains definitions and global var's definitions.
  */
@@ -316,8 +292,6 @@ extern int ip6t_ext_hdr(u8 nexthdr);
 extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                         int target, unsigned short *fragoff);
 
-#define IP6T_ALIGN(s) XT_ALIGN(s)
-
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
@@ -331,14 +305,12 @@ struct compat_ip6t_entry {
        unsigned char elems[0];
 };
 
-static inline struct ip6t_entry_target *
+static inline struct xt_entry_target *
 compat_ip6t_get_target(struct compat_ip6t_entry *e)
 {
        return (void *)e + e->target_offset;
 }
 
-#define COMPAT_IP6T_ALIGN(s)   COMPAT_XT_ALIGN(s)
-
 #endif /* CONFIG_COMPAT */
 #endif /*__KERNEL__*/
 #endif /* _IP6_TABLES_H */
index 74358d1b3f439ba6f995ceb0c928730f8c180252..e9c2ed8af864b4fc197e8310551eb6f7ba71dd25 100644 (file)
@@ -245,7 +245,7 @@ static inline int inet_sk_listen_hashfn(const struct sock *sk)
 }
 
 /* Caller must disable local BH processing. */
-extern void __inet_inherit_port(struct sock *sk, struct sock *child);
+extern int __inet_inherit_port(struct sock *sk, struct sock *child);
 
 extern void inet_put_port(struct sock *sk);
 
index f976885f686f67f593d2928175e9f06de1c94f56..b7bbd6c28cfa17dde6fa3a972d33635c5a498312 100644 (file)
@@ -25,7 +25,9 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>                        /* for struct ipv6hdr */
 #include <net/ipv6.h>                  /* for ipv6_addr_copy */
-
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
 
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
@@ -134,24 +136,24 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
                if (net_ratelimit())                                    \
                        printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__);  \
        } while (0)
-#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg)                                \
+#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg)                    \
        do {                                                            \
                if (level <= ip_vs_get_debug_level())                   \
-                       pp->debug_packet(pp, skb, ofs, msg);            \
+                       pp->debug_packet(af, pp, skb, ofs, msg);        \
        } while (0)
-#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg)                     \
+#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg)                 \
        do {                                                            \
                if (level <= ip_vs_get_debug_level() &&                 \
                    net_ratelimit())                                    \
-                       pp->debug_packet(pp, skb, ofs, msg);            \
+                       pp->debug_packet(af, pp, skb, ofs, msg);        \
        } while (0)
 #else  /* NO DEBUGGING at ALL */
 #define IP_VS_DBG_BUF(level, msg...)  do {} while (0)
 #define IP_VS_ERR_BUF(msg...)  do {} while (0)
 #define IP_VS_DBG(level, msg...)  do {} while (0)
 #define IP_VS_DBG_RL(msg...)  do {} while (0)
-#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg)                do {} while (0)
-#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg)     do {} while (0)
+#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg)    do {} while (0)
+#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0)
 #endif
 
 #define IP_VS_BUG() BUG()
@@ -343,7 +345,7 @@ struct ip_vs_protocol {
 
        int (*app_conn_bind)(struct ip_vs_conn *cp);
 
-       void (*debug_packet)(struct ip_vs_protocol *pp,
+       void (*debug_packet)(int af, struct ip_vs_protocol *pp,
                             const struct sk_buff *skb,
                             int offset,
                             const char *msg);
@@ -355,6 +357,19 @@ struct ip_vs_protocol {
 
 extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto);
 
+struct ip_vs_conn_param {
+       const union nf_inet_addr        *caddr;
+       const union nf_inet_addr        *vaddr;
+       __be16                          cport;
+       __be16                          vport;
+       __u16                           protocol;
+       u16                             af;
+
+       const struct ip_vs_pe           *pe;
+       char                            *pe_data;
+       __u8                            pe_data_len;
+};
+
 /*
  *     IP_VS structure allocated for each dynamically scheduled connection
  */
@@ -366,6 +381,7 @@ struct ip_vs_conn {
        union nf_inet_addr       caddr;          /* client address */
        union nf_inet_addr       vaddr;          /* virtual address */
        union nf_inet_addr       daddr;          /* destination address */
+       volatile __u32           flags;          /* status flags */
        __be16                   cport;
        __be16                   vport;
        __be16                   dport;
@@ -378,7 +394,6 @@ struct ip_vs_conn {
 
        /* Flags and state transition */
        spinlock_t              lock;           /* lock for state transition */
-       volatile __u16          flags;          /* status flags */
        volatile __u16          state;          /* state info */
        volatile __u16          old_state;      /* old state, to be used for
                                                 * state transition triggerd
@@ -394,6 +409,7 @@ struct ip_vs_conn {
        /* packet transmitter for different forwarding methods.  If it
           mangles the packet, it must return NF_DROP or better NF_STOLEN,
           otherwise this must be changed to a sk_buff **.
+          NF_ACCEPT can be returned when destination is local.
         */
        int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
                           struct ip_vs_protocol *pp);
@@ -405,6 +421,9 @@ struct ip_vs_conn {
        void                    *app_data;      /* Application private data */
        struct ip_vs_seq        in_seq;         /* incoming seq. struct */
        struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+
+       char                    *pe_data;
+       __u8                    pe_data_len;
 };
 
 
@@ -426,6 +445,7 @@ struct ip_vs_service_user_kern {
 
        /* virtual service options */
        char                    *sched_name;
+       char                    *pe_name;
        unsigned                flags;          /* virtual service flags */
        unsigned                timeout;        /* persistent timeout in sec */
        u32                     netmask;        /* persistent netmask */
@@ -475,6 +495,9 @@ struct ip_vs_service {
        struct ip_vs_scheduler  *scheduler;    /* bound scheduler object */
        rwlock_t                sched_lock;    /* lock sched_data */
        void                    *sched_data;   /* scheduler application data */
+
+       /* alternate persistence engine */
+       struct ip_vs_pe         *pe;
 };
 
 
@@ -507,6 +530,10 @@ struct ip_vs_dest {
        spinlock_t              dst_lock;       /* lock of dst_cache */
        struct dst_entry        *dst_cache;     /* destination cache entry */
        u32                     dst_rtos;       /* RT_TOS(tos) for dst */
+       u32                     dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+       struct in6_addr         dst_saddr;
+#endif
 
        /* for virtual service */
        struct ip_vs_service    *svc;           /* service it belongs to */
@@ -538,6 +565,21 @@ struct ip_vs_scheduler {
                                       const struct sk_buff *skb);
 };
 
+/* The persistence engine object */
+struct ip_vs_pe {
+       struct list_head        n_list;         /* d-linked list head */
+       char                    *name;          /* persistence engine name */
+       atomic_t                refcnt;         /* reference counter */
+       struct module           *module;        /* THIS_MODULE/NULL */
+
+       /* get the connection template, if any */
+       int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
+       bool (*ct_match)(const struct ip_vs_conn_param *p,
+                        struct ip_vs_conn *ct);
+       u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
+                          bool inverse);
+       int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
+};
 
 /*
  *     The application module object (a.k.a. app incarnation)
@@ -556,11 +598,19 @@ struct ip_vs_app {
        __be16                  port;           /* port number in net order */
        atomic_t                usecnt;         /* usage counter */
 
-       /* output hook: return false if can't linearize. diff set for TCP.  */
+       /*
+        * output hook: Process packet in inout direction, diff set for TCP.
+        * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
+        *         2=Mangled but checksum was not updated
+        */
        int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
                       struct sk_buff *, int *diff);
 
-       /* input hook: return false if can't linearize. diff set for TCP. */
+       /*
+        * input hook: Process packet in outin direction, diff set for TCP.
+        * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
+        *         2=Mangled but checksum was not updated
+        */
        int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
                      struct sk_buff *, int *diff);
 
@@ -624,13 +674,25 @@ enum {
        IP_VS_DIR_LAST,
 };
 
-extern struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
+static inline void ip_vs_conn_fill_param(int af, int protocol,
+                                        const union nf_inet_addr *caddr,
+                                        __be16 cport,
+                                        const union nf_inet_addr *vaddr,
+                                        __be16 vport,
+                                        struct ip_vs_conn_param *p)
+{
+       p->af = af;
+       p->protocol = protocol;
+       p->caddr = caddr;
+       p->cport = cport;
+       p->vaddr = vaddr;
+       p->vport = vport;
+       p->pe = NULL;
+       p->pe_data = NULL;
+}
 
-extern struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
                                            struct ip_vs_protocol *pp,
@@ -638,9 +700,7 @@ struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
                                            unsigned int proto_off,
                                            int inverse);
 
-extern struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port);
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
                                             struct ip_vs_protocol *pp,
@@ -656,11 +716,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
 extern void ip_vs_conn_put(struct ip_vs_conn *cp);
 extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
 
-extern struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-              const union nf_inet_addr *vaddr, __be16 vport,
-              const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-              struct ip_vs_dest *dest);
+struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p,
+                                 const union nf_inet_addr *daddr,
+                                 __be16 dport, unsigned flags,
+                                 struct ip_vs_dest *dest);
 extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
@@ -751,6 +810,12 @@ extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
 extern int ip_vs_app_init(void);
 extern void ip_vs_app_cleanup(void);
 
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe);
+void ip_vs_unbind_pe(struct ip_vs_service *svc);
+int register_ip_vs_pe(struct ip_vs_pe *pe);
+int unregister_ip_vs_pe(struct ip_vs_pe *pe);
+extern struct ip_vs_pe *ip_vs_pe_get(const char *name);
+extern void ip_vs_pe_put(struct ip_vs_pe *pe);
 
 /*
  *     IPVS protocol functions (from ip_vs_proto.c)
@@ -763,7 +828,8 @@ extern int
 ip_vs_set_state_timeout(int *table, int num, const char *const *names,
                        const char *name, int to);
 extern void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+                         const struct sk_buff *skb,
                          int offset, const char *msg);
 
 extern struct ip_vs_protocol ip_vs_protocol_tcp;
@@ -785,7 +851,8 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
 extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb);
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+              struct ip_vs_protocol *pp, int *ignored);
 extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
                        struct ip_vs_protocol *pp);
 
@@ -798,6 +865,8 @@ extern int sysctl_ip_vs_expire_nodest_conn;
 extern int sysctl_ip_vs_expire_quiescent_template;
 extern int sysctl_ip_vs_sync_threshold[2];
 extern int sysctl_ip_vs_nat_icmp_send;
+extern int sysctl_ip_vs_conntrack;
+extern int sysctl_ip_vs_snat_reroute;
 extern struct ip_vs_stats ip_vs_stats;
 extern const struct ctl_path net_vs_ctl_path[];
 
@@ -955,8 +1024,65 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum)
        return csum_partial(diff, sizeof(diff), oldsum);
 }
 
+/* Forget the current (unconfirmed) conntrack and attach the untracked
+ * entry instead.  Does nothing if the skb is already untracked, or if
+ * conntrack support is not built in (CONFIG_NF_CONNTRACK[_MODULE]). */
+static inline void ip_vs_notrack(struct sk_buff *skb)
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+       if (!ct || !nf_ct_is_untracked(ct)) {
+               nf_reset(skb);
+               skb->nfct = &nf_ct_untracked_get()->ct_general;
+               skb->nfctinfo = IP_CT_NEW;
+               nf_conntrack_get(skb->nfct);
+       }
+#endif
+}
+
+#ifdef CONFIG_IP_VS_NFCT
+/*
+ *      Netfilter connection tracking
+ *      (from ip_vs_nfct.c)
+ */
+static inline int ip_vs_conntrack_enabled(void)
+{
+       return sysctl_ip_vs_conntrack;
+}
+
 extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
                                   int outin);
+extern int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp);
+extern void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+                                     struct ip_vs_conn *cp, u_int8_t proto,
+                                     const __be16 port, int from_rs);
+extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp);
+
+#else
+
+static inline int ip_vs_conntrack_enabled(void)
+{
+       return 0;
+}
+
+static inline void ip_vs_update_conntrack(struct sk_buff *skb,
+                                         struct ip_vs_conn *cp, int outin)
+{
+}
+
+static inline int ip_vs_confirm_conntrack(struct sk_buff *skb,
+                                         struct ip_vs_conn *cp)
+{
+       return NF_ACCEPT;
+}
+
+static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+}
+/* CONFIG_IP_VS_NFCT */
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
new file mode 100644 (file)
index 0000000..94dd54d
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _NF_DEFRAG_IPV6_H
+#define _NF_DEFRAG_IPV6_H
+
+extern void nf_defrag_ipv6_enable(void);
+
+#endif /* _NF_DEFRAG_IPV6_H */
index 11e815084fcf05cb1a680b3c61968a1eb66a1ac5..0f8a8c587532f78ac5f191d93a1088b03ba1fd78 100644 (file)
@@ -67,9 +67,6 @@ struct nf_conntrack_expect_policy {
 
 #define NF_CT_EXPECT_CLASS_DEFAULT     0
 
-#define NF_CT_EXPECT_PERMANENT 0x1
-#define NF_CT_EXPECT_INACTIVE  0x2
-
 int nf_conntrack_expect_init(struct net *net);
 void nf_conntrack_expect_fini(struct net *net);
 
@@ -85,9 +82,16 @@ struct nf_conntrack_expect *
 nf_ct_find_expectation(struct net *net, u16 zone,
                       const struct nf_conntrack_tuple *tuple);
 
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp);
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+                               u32 pid, int report);
+static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+{
+       nf_ct_unlink_expect_report(exp, 0, 0);
+}
+
 void nf_ct_remove_expectations(struct nf_conn *ct);
 void nf_ct_unexpect_related(struct nf_conntrack_expect *exp);
+void nf_ct_remove_userspace_expectations(void);
 
 /* Allocate space for an expectation: this is mandatory before calling
    nf_ct_expect_related.  You will have to call put afterwards. */
index df17bac46bf5ef81cc4a1f3114e99eb54a913758..93cc90d28e66e31047f7219e2c4a1cbd8ce6774c 100644 (file)
@@ -45,9 +45,6 @@ struct nf_nat_protocol {
 extern int nf_nat_protocol_register(const struct nf_nat_protocol *proto);
 extern void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto);
 
-extern const struct nf_nat_protocol *nf_nat_proto_find_get(u_int8_t protocol);
-extern void nf_nat_proto_put(const struct nf_nat_protocol *proto);
-
 /* Built-in protocols. */
 extern const struct nf_nat_protocol nf_nat_protocol_tcp;
 extern const struct nf_nat_protocol nf_nat_protocol_udp;
index 208b46f4d6d2b2bfbf9b2439b783dbfa6a7a7a46..cd85b3bc8327219f1e036698c253034c27765d66 100644 (file)
 #include <linux/in.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/tcp.h>
 
+#define NFT_LOOKUP_ANY         0
+#define NFT_LOOKUP_LISTENER    1
+#define NFT_LOOKUP_ESTABLISHED 2
+
 /* look up and get a reference to a matching socket */
-extern struct sock *
+
+
+/* This function is used by the 'TPROXY' target and the 'socket'
+ * match. The following lookups are supported:
+ *
+ * Explicit TProxy target rule
+ * ===========================
+ *
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are assumed
+ * matching in preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple, it is returned, assuming the redirection
+ *     already took place and we process a packet belonging to an
+ *     established connection
+ *
+ *   - match: if there's a listening socket matching the redirection
+ *     (e.g. on-port & on-ip of the connection), it is returned,
+ *     regardless if it was bound to 0.0.0.0 or an explicit
+ *     address. The reasoning is that if there's an explicit rule, it
+ *     does not really matter if the listener is bound to an interface
+ *     or to 0. The user already stated that he wants redirection
+ *     (since he added the rule).
+ *
+ * "socket" match based redirection (no specific rule)
+ * ===================================================
+ *
+ * There are connections with dynamic endpoints (e.g. FTP data
+ * connection) that the user is unable to add explicit rules
+ * for. These are taken care of by a generic "socket" rule. It is
+ * assumed that the proxy application is trusted to open such
+ * connections without explicit iptables rule (except of course the
+ * generic 'socket' rule). In this case the following sockets are
+ * matched in preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple
+ *
+ *   - match: if there's a non-zero bound listener (possibly with a
+ *     non-local address) We don't accept zero-bound listeners, since
+ *     then local services could intercept traffic going through the
+ *     box.
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally if you have both rules the
+ * "socket" match will be the first one, effectively all packets
+ * belonging to established connections going through that one.
+ */
+static inline struct sock *
 nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
                      const __be32 saddr, const __be32 daddr,
                      const __be16 sport, const __be16 dport,
-                     const struct net_device *in, bool listening);
+                     const struct net_device *in, int lookup_type)
+{
+       struct sock *sk;
+
+       /* look up socket */
+       switch (protocol) {
+       case IPPROTO_TCP:
+               switch (lookup_type) {
+               case NFT_LOOKUP_ANY:
+                       sk = __inet_lookup(net, &tcp_hashinfo,
+                                          saddr, sport, daddr, dport,
+                                          in->ifindex);
+                       break;
+               case NFT_LOOKUP_LISTENER:
+                       sk = inet_lookup_listener(net, &tcp_hashinfo,
+                                                   daddr, dport,
+                                                   in->ifindex);
+
+                       /* NOTE: we return listeners even if bound to
+                        * 0.0.0.0, those are filtered out in
+                        * xt_socket, since xt_TPROXY needs 0 bound
+                        * listeners too */
+
+                       break;
+               case NFT_LOOKUP_ESTABLISHED:
+                       sk = inet_lookup_established(net, &tcp_hashinfo,
+                                                   saddr, sport, daddr, dport,
+                                                   in->ifindex);
+                       break;
+               default:
+                       WARN_ON(1);
+                       sk = NULL;
+                       break;
+               }
+               break;
+       case IPPROTO_UDP:
+               sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+                                    in->ifindex);
+               if (sk && lookup_type != NFT_LOOKUP_ANY) {
+                       int connected = (sk->sk_state == TCP_ESTABLISHED);
+                       int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+                       /* NOTE: we return listeners even if bound to
+                        * 0.0.0.0, those are filtered out in
+                        * xt_socket, since xt_TPROXY needs 0 bound
+                        * listeners too */
+                       if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+                           (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+                               sock_put(sk);
+                               sk = NULL;
+                       }
+               }
+               break;
+       default:
+               WARN_ON(1);
+               sk = NULL;
+       }
+
+       pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+                protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+       return sk;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline struct sock *
+nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+                     const struct in6_addr *saddr, const struct in6_addr *daddr,
+                     const __be16 sport, const __be16 dport,
+                     const struct net_device *in, int lookup_type)
+{
+       struct sock *sk;
+
+       /* look up socket */
+       switch (protocol) {
+       case IPPROTO_TCP:
+               switch (lookup_type) {
+               case NFT_LOOKUP_ANY:
+                       sk = inet6_lookup(net, &tcp_hashinfo,
+                                         saddr, sport, daddr, dport,
+                                         in->ifindex);
+                       break;
+               case NFT_LOOKUP_LISTENER:
+                       sk = inet6_lookup_listener(net, &tcp_hashinfo,
+                                                  daddr, ntohs(dport),
+                                                  in->ifindex);
+
+                       /* NOTE: we return listeners even if bound to
+                        * the wildcard address (::), those are filtered
+                        * out in xt_socket, since xt_TPROXY needs
+                        * wildcard-bound listeners too */
+
+                       break;
+               case NFT_LOOKUP_ESTABLISHED:
+                       sk = __inet6_lookup_established(net, &tcp_hashinfo,
+                                                       saddr, sport, daddr, ntohs(dport),
+                                                       in->ifindex);
+                       break;
+               default:
+                       WARN_ON(1);
+                       sk = NULL;
+                       break;
+               }
+               break;
+       case IPPROTO_UDP:
+               sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+                                    in->ifindex);
+               if (sk && lookup_type != NFT_LOOKUP_ANY) {
+                       int connected = (sk->sk_state == TCP_ESTABLISHED);
+                       int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
+
+                       /* NOTE: we return listeners even if bound to
+                        * the wildcard address (::), those are filtered
+                        * out in xt_socket, since xt_TPROXY needs
+                        * wildcard-bound listeners too */
+                       if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+                           (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+                               sock_put(sk);
+                               sk = NULL;
+                       }
+               }
+               break;
+       default:
+               WARN_ON(1);
+               sk = NULL;
+       }
+
+       pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+                protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+       return sk;
+}
+#endif
 
 static inline void
 nf_tproxy_put_sock(struct sock *sk)
diff --git a/include/net/netfilter/xt_log.h b/include/net/netfilter/xt_log.h
new file mode 100644 (file)
index 0000000..0dfb34a
--- /dev/null
@@ -0,0 +1,54 @@
+#define S_SIZE (1024 - (sizeof(unsigned int) + 1))
+
+struct sbuff {
+       unsigned int    count;
+       char            buf[S_SIZE + 1];
+};
+static struct sbuff emergency, *emergency_ptr = &emergency;
+
+static int sb_add(struct sbuff *m, const char *f, ...)
+{
+       va_list args;
+       int len;
+
+       if (likely(m->count < S_SIZE)) {
+               va_start(args, f);
+               len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args);
+               va_end(args);
+               if (likely(m->count + len < S_SIZE)) {
+                       m->count += len;
+                       return 0;
+               }
+       }
+       m->count = S_SIZE;
+       printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n");
+       return -1;
+}
+
+static struct sbuff *sb_open(void)
+{
+       struct sbuff *m = kmalloc(sizeof(*m), GFP_ATOMIC);
+
+       if (unlikely(!m)) {
+               local_bh_disable();
+               do {
+                       m = xchg(&emergency_ptr, NULL);
+               } while (!m);
+       }
+       m->count = 0;
+       return m;
+}
+
+static void sb_close(struct sbuff *m)
+{
+       m->buf[m->count] = 0;
+       printk("%s\n", m->buf);
+
+       if (likely(m != &emergency))
+               kfree(m);
+       else {
+               xchg(&emergency_ptr, m);
+               local_bh_enable();
+       }
+}
+
index a184d3496b1369deefd62aba376f04320f76a773..200b82848c9a3606b0076c3f8621fb63342af6a9 100644 (file)
@@ -183,6 +183,9 @@ extern int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                                    __be32 daddr, __be16 dport,
                                    int dif);
+extern struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+                                   const struct in6_addr *daddr, __be16 dport,
+                                   int dif);
 
 /*
  *     SNMP statistics for UDP and UDP-Lite
index d4a166f0f391d6bfccd85471b2616057d808c848..3f69ea1148291ce2e5ad4956f4c27931d8db7e8f 100644 (file)
@@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
        newsk = dccp_create_openreq_child(sk, req, skb);
        if (newsk == NULL)
-               goto exit;
+               goto exit_nonewsk;
 
        sk_setup_caps(newsk, dst);
 
@@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
        dccp_sync_mss(newsk, dst_mtu(dst));
 
+       if (__inet_inherit_port(sk, newsk) < 0) {
+               sock_put(newsk);
+               goto exit;
+       }
        __inet_hash_nolisten(newsk, NULL);
-       __inet_inherit_port(sk, newsk);
 
        return newsk;
 
 exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+       dst_release(dst);
 exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-       dst_release(dst);
        return NULL;
 }
 
index 6e3f32575df78bbf75fc8e3cd46ea0cb52f4f771..dca711df9b60cea9e8ecf259cfdfc3909c60d3bb 100644 (file)
@@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
        newsk = dccp_create_openreq_child(sk, req, skb);
        if (newsk == NULL)
-               goto out;
+               goto out_nonewsk;
 
        /*
         * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
        newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
        newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
+       if (__inet_inherit_port(sk, newsk) < 0) {
+               sock_put(newsk);
+               goto out;
+       }
        __inet6_hash(newsk, NULL);
-       __inet_inherit_port(sk, newsk);
 
        return newsk;
 
 out_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+       dst_release(dst);
 out:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        if (opt != NULL && opt != np->opt)
                sock_kfree_s(sk, opt, opt->tot_len);
-       dst_release(dst);
        return NULL;
 }
 
index fb7ad5a21ff3e2e86b2d9018ea29ccd2e3921848..1b344f30b463fab9ed70a8f19a19d348d7c626f7 100644 (file)
@@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_put_port);
 
-void __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
 {
        struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-       const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
+       unsigned short port = inet_sk(child)->inet_num;
+       const int bhash = inet_bhashfn(sock_net(sk), port,
                        table->bhash_size);
        struct inet_bind_hashbucket *head = &table->bhash[bhash];
        struct inet_bind_bucket *tb;
 
        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
+       if (tb->port != port) {
+               /* NOTE: using tproxy and redirecting skbs to a proxy
+                * on a different listener port breaks the assumption
+                * that the listener socket's icsk_bind_hash is the same
+                * as that of the child socket. We have to look up or
+                * create a new bind bucket for the child here. */
+               struct hlist_node *node;
+               inet_bind_bucket_for_each(tb, node, &head->chain) {
+                       if (net_eq(ib_net(tb), sock_net(sk)) &&
+                           tb->port == port)
+                               break;
+               }
+               if (!node) {
+                       tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+                                                    sock_net(sk), head, port);
+                       if (!tb) {
+                               spin_unlock(&head->lock);
+                               return -ENOMEM;
+                       }
+               }
+       }
        sk_add_bind_node(child, &tb->owners);
        inet_csk(child)->icsk_bind_hash = tb;
        spin_unlock(&head->lock);
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
index 1833bdbf9805474c729483e5abef53ffe99e120d..8e3350643b63e003747661aa46470670773a4b34 100644 (file)
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
 
 config IP_NF_TARGET_TTL
        tristate '"TTL" target support'
-       depends on NETFILTER_ADVANCED
+       depends on NETFILTER_ADVANCED && IP_NF_MANGLE
        select NETFILTER_XT_TARGET_HL
        ---help---
-       This is a backwards-compat option for the user's convenience
+       This is a backwards-compatible option for the user's convenience
        (e.g. when running oldconfig). It selects
        CONFIG_NETFILTER_XT_TARGET_HL.
 
index 8b642f1524684fddfef0797ce4ea7ce6cca01a41..3cad2591ace0c15fdad446ab528b0f40c2624d09 100644 (file)
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
        return NF_DROP;
 }
 
-static inline const struct arpt_entry_target *
+static inline const struct xt_entry_target *
 arpt_get_target_c(const struct arpt_entry *e)
 {
        return arpt_get_target((struct arpt_entry *)e);
@@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
        arp = arp_hdr(skb);
        do {
-               const struct arpt_entry_target *t;
+               const struct xt_entry_target *t;
 
                if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
                        e = arpt_next_entry(e);
@@ -297,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
                if (!t->u.kernel.target->target) {
                        int v;
 
-                       v = ((struct arpt_standard_target *)t)->verdict;
+                       v = ((struct xt_standard_target *)t)->verdict;
                        if (v < 0) {
                                /* Pop from stack? */
-                               if (v != ARPT_RETURN) {
+                               if (v != XT_RETURN) {
                                        verdict = (unsigned)(-v) - 1;
                                        break;
                                }
@@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
                /* Target might have changed stuff. */
                arp = arp_hdr(skb);
 
-               if (verdict == ARPT_CONTINUE)
+               if (verdict == XT_CONTINUE)
                        e = arpt_next_entry(e);
                else
                        /* Verdict */
@@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
                e->counters.pcnt = pos;
 
                for (;;) {
-                       const struct arpt_standard_target *t
+                       const struct xt_standard_target *t
                                = (void *)arpt_get_target_c(e);
                        int visited = e->comefrom & (1 << hook);
 
@@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
                        /* Unconditional return/END. */
                        if ((e->target_offset == sizeof(struct arpt_entry) &&
                             (strcmp(t->target.u.user.name,
-                                    ARPT_STANDARD_TARGET) == 0) &&
+                                    XT_STANDARD_TARGET) == 0) &&
                             t->verdict < 0 && unconditional(&e->arp)) ||
                            visited) {
                                unsigned int oldpos, size;
 
                                if ((strcmp(t->target.u.user.name,
-                                           ARPT_STANDARD_TARGET) == 0) &&
+                                           XT_STANDARD_TARGET) == 0) &&
                                    t->verdict < -NF_MAX_VERDICT - 1) {
                                        duprintf("mark_source_chains: bad "
                                                "negative verdict (%i)\n",
@@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
                                int newpos = t->verdict;
 
                                if (strcmp(t->target.u.user.name,
-                                          ARPT_STANDARD_TARGET) == 0 &&
+                                          XT_STANDARD_TARGET) == 0 &&
                                    newpos >= 0) {
                                        if (newpos > newinfo->size -
                                                sizeof(struct arpt_entry)) {
@@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 
 static inline int check_entry(const struct arpt_entry *e, const char *name)
 {
-       const struct arpt_entry_target *t;
+       const struct xt_entry_target *t;
 
        if (!arp_checkentry(&e->arp)) {
                duprintf("arp_tables: arp check failed %p %s.\n", e, name);
                return -EINVAL;
        }
 
-       if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
+       if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
                return -EINVAL;
 
        t = arpt_get_target_c(e);
@@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
 
 static inline int check_target(struct arpt_entry *e, const char *name)
 {
-       struct arpt_entry_target *t = arpt_get_target(e);
+       struct xt_entry_target *t = arpt_get_target(e);
        int ret;
        struct xt_tgchk_param par = {
                .table     = name,
@@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 static inline int
 find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
 
@@ -536,7 +536,7 @@ out:
 
 static bool check_underflow(const struct arpt_entry *e)
 {
-       const struct arpt_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int verdict;
 
        if (!unconditional(&e->arp))
@@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
        t = arpt_get_target_c(e);
        if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
                return false;
-       verdict = ((struct arpt_standard_target *)t)->verdict;
+       verdict = ((struct xt_standard_target *)t)->verdict;
        verdict = -verdict - 1;
        return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
        }
 
        if (e->next_offset
-           < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+           < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
                duprintf("checking: element %p size %u\n",
                         e, e->next_offset);
                return -EINVAL;
@@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 static inline void cleanup_entry(struct arpt_entry *e)
 {
        struct xt_tgdtor_param par;
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
 
        t = arpt_get_target(e);
        par.target   = t->u.kernel.target;
@@ -794,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
        /* FIXME: use iterator macros --RR */
        /* ... then go back and fix counters and names */
        for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
-               const struct arpt_entry_target *t;
+               const struct xt_entry_target *t;
 
                e = (struct arpt_entry *)(loc_cpu_entry + off);
                if (copy_to_user(userptr + off
@@ -807,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
 
                t = arpt_get_target_c(e);
                if (copy_to_user(userptr + off + e->target_offset
-                                + offsetof(struct arpt_entry_target,
+                                + offsetof(struct xt_entry_target,
                                            u.user.name),
                                 t->u.kernel.target->name,
                                 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -844,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
                             const struct xt_table_info *info,
                             const void *base, struct xt_table_info *newinfo)
 {
-       const struct arpt_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int entry_offset;
        int off, i, ret;
 
@@ -895,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-       char name[ARPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        struct xt_table *t;
        int ret;
 
@@ -908,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
        if (copy_from_user(name, user, sizeof(name)) != 0)
                return -EFAULT;
 
-       name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+       name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
        if (compat)
                xt_compat_lock(NFPROTO_ARP);
@@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 #ifdef CONFIG_COMPAT
 static inline void compat_release_entry(struct compat_arpt_entry *e)
 {
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
 
        t = compat_arpt_get_target(e);
        module_put(t->u.kernel.target->me);
@@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
                                  const unsigned int *underflows,
                                  const char *name)
 {
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        unsigned int entry_offset;
        int ret, off, h;
@@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
                            unsigned int *size, const char *name,
                            struct xt_table_info *newinfo, unsigned char *base)
 {
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        struct arpt_entry *de;
        unsigned int origsize;
@@ -1474,7 +1474,7 @@ out_unlock:
 }
 
 struct compat_arpt_replace {
-       char                            name[ARPT_TABLE_MAXNAMELEN];
+       char                            name[XT_TABLE_MAXNAMELEN];
        u32                             valid_hooks;
        u32                             num_entries;
        u32                             size;
@@ -1567,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
                                     struct xt_counters *counters,
                                     unsigned int i)
 {
-       struct arpt_entry_target *t;
+       struct xt_entry_target *t;
        struct compat_arpt_entry __user *ce;
        u_int16_t target_offset, next_offset;
        compat_uint_t origsize;
@@ -1628,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 }
 
 struct compat_arpt_get_entries {
-       char name[ARPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t size;
        struct compat_arpt_entry entrytable[0];
 };
@@ -1828,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target arpt_builtin_tg[] __read_mostly = {
        {
-               .name             = ARPT_STANDARD_TARGET,
+               .name             = XT_STANDARD_TARGET,
                .targetsize       = sizeof(int),
                .family           = NFPROTO_ARP,
 #ifdef CONFIG_COMPAT
@@ -1838,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
 #endif
        },
        {
-               .name             = ARPT_ERROR_TARGET,
+               .name             = XT_ERROR_TARGET,
                .target           = arpt_error,
-               .targetsize       = ARPT_FUNCTION_MAXNAMELEN,
+               .targetsize       = XT_FUNCTION_MAXNAMELEN,
                .family           = NFPROTO_ARP,
        },
 };
index e1be7dd1171b368eb4a0e1593abc9b8b2294838a..b8ddcc480ed97157fbda78f741d9091b4e854006 100644 (file)
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
                return false;
 
        if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
-          mangle->target != ARPT_CONTINUE)
+          mangle->target != XT_CONTINUE)
                return false;
        return true;
 }
index d163f2e3b2e99e5f18ae9997d3c74867b3e79354..d31b007a6d80dcda45f7f7913ec72af7556b79bf 100644 (file)
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
 }
 
 /* for const-correctness */
-static inline const struct ipt_entry_target *
+static inline const struct xt_entry_target *
 ipt_get_target_c(const struct ipt_entry *e)
 {
        return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
                      const char *hookname, const char **chainname,
                      const char **comment, unsigned int *rulenum)
 {
-       const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
+       const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
 
-       if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+       if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
                /* Head of user chain: ERROR target with chainname */
                *chainname = t->target.data;
                (*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 
                if (s->target_offset == sizeof(struct ipt_entry) &&
                    strcmp(t->target.u.kernel.target->name,
-                          IPT_STANDARD_TARGET) == 0 &&
+                          XT_STANDARD_TARGET) == 0 &&
                   t->verdict < 0 &&
                   unconditional(&s->ip)) {
                        /* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
                 get_entry(table_base, private->underflow[hook]));
 
        do {
-               const struct ipt_entry_target *t;
+               const struct xt_entry_target *t;
                const struct xt_entry_match *ematch;
 
                IP_NF_ASSERT(e);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
                if (!t->u.kernel.target->target) {
                        int v;
 
-                       v = ((struct ipt_standard_target *)t)->verdict;
+                       v = ((struct xt_standard_target *)t)->verdict;
                        if (v < 0) {
                                /* Pop from stack? */
-                               if (v != IPT_RETURN) {
+                               if (v != XT_RETURN) {
                                        verdict = (unsigned)(-v) - 1;
                                        break;
                                }
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
                verdict = t->u.kernel.target->target(skb, &acpar);
                /* Target might have changed stuff. */
                ip = ip_hdr(skb);
-               if (verdict == IPT_CONTINUE)
+               if (verdict == XT_CONTINUE)
                        e = ipt_next_entry(e);
                else
                        /* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
                e->counters.pcnt = pos;
 
                for (;;) {
-                       const struct ipt_standard_target *t
+                       const struct xt_standard_target *t
                                = (void *)ipt_get_target_c(e);
                        int visited = e->comefrom & (1 << hook);
 
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
                        /* Unconditional return/END. */
                        if ((e->target_offset == sizeof(struct ipt_entry) &&
                             (strcmp(t->target.u.user.name,
-                                    IPT_STANDARD_TARGET) == 0) &&
+                                    XT_STANDARD_TARGET) == 0) &&
                             t->verdict < 0 && unconditional(&e->ip)) ||
                            visited) {
                                unsigned int oldpos, size;
 
                                if ((strcmp(t->target.u.user.name,
-                                           IPT_STANDARD_TARGET) == 0) &&
+                                           XT_STANDARD_TARGET) == 0) &&
                                    t->verdict < -NF_MAX_VERDICT - 1) {
                                        duprintf("mark_source_chains: bad "
                                                "negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
                                int newpos = t->verdict;
 
                                if (strcmp(t->target.u.user.name,
-                                          IPT_STANDARD_TARGET) == 0 &&
+                                          XT_STANDARD_TARGET) == 0 &&
                                    newpos >= 0) {
                                        if (newpos > newinfo->size -
                                                sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
        return 1;
 }
 
-static void cleanup_match(struct ipt_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
        struct xt_mtdtor_param par;
 
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e, const char *name)
 {
-       const struct ipt_entry_target *t;
+       const struct xt_entry_target *t;
 
        if (!ip_checkentry(&e->ip)) {
                duprintf("ip check failed %p %s.\n", e, par->match->name);
                return -EINVAL;
        }
 
-       if (e->target_offset + sizeof(struct ipt_entry_target) >
+       if (e->target_offset + sizeof(struct xt_entry_target) >
            e->next_offset)
                return -EINVAL;
 
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
 }
 
 static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
        const struct ipt_ip *ip = par->entryinfo;
        int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
        struct xt_match *match;
        int ret;
@@ -630,7 +630,7 @@ err:
 
 static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 {
-       struct ipt_entry_target *t = ipt_get_target(e);
+       struct xt_entry_target *t = ipt_get_target(e);
        struct xt_tgchk_param par = {
                .net       = net,
                .table     = name,
@@ -656,7 +656,7 @@ static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
                 unsigned int size)
 {
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
        unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ipt_entry *e)
 {
-       const struct ipt_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int verdict;
 
        if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
        t = ipt_get_target_c(e);
        if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
                return false;
-       verdict = ((struct ipt_standard_target *)t)->verdict;
+       verdict = ((struct xt_standard_target *)t)->verdict;
        verdict = -verdict - 1;
        return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
        }
 
        if (e->next_offset
-           < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+           < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
                duprintf("checking: element %p size %u\n",
                         e, e->next_offset);
                return -EINVAL;
@@ -771,7 +771,7 @@ static void
 cleanup_entry(struct ipt_entry *e, struct net *net)
 {
        struct xt_tgdtor_param par;
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_entry_match *ematch;
 
        /* Cleanup all matches */
@@ -972,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
        /* ... then go back and fix counters and names */
        for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
                unsigned int i;
-               const struct ipt_entry_match *m;
-               const struct ipt_entry_target *t;
+               const struct xt_entry_match *m;
+               const struct xt_entry_target *t;
 
                e = (struct ipt_entry *)(loc_cpu_entry + off);
                if (copy_to_user(userptr + off
@@ -990,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
                        m = (void *)e + i;
 
                        if (copy_to_user(userptr + off + i
-                                        + offsetof(struct ipt_entry_match,
+                                        + offsetof(struct xt_entry_match,
                                                    u.user.name),
                                         m->u.kernel.match->name,
                                         strlen(m->u.kernel.match->name)+1)
@@ -1002,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
 
                t = ipt_get_target_c(e);
                if (copy_to_user(userptr + off + e->target_offset
-                                + offsetof(struct ipt_entry_target,
+                                + offsetof(struct xt_entry_target,
                                            u.user.name),
                                 t->u.kernel.target->name,
                                 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1040,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
                             const void *base, struct xt_table_info *newinfo)
 {
        const struct xt_entry_match *ematch;
-       const struct ipt_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int entry_offset;
        int off, i, ret;
 
@@ -1092,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-       char name[IPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        struct xt_table *t;
        int ret;
 
@@ -1105,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
        if (copy_from_user(name, user, sizeof(name)) != 0)
                return -EFAULT;
 
-       name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+       name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
        if (compat)
                xt_compat_lock(AF_INET);
@@ -1400,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
 
 #ifdef CONFIG_COMPAT
 struct compat_ipt_replace {
-       char                    name[IPT_TABLE_MAXNAMELEN];
+       char                    name[XT_TABLE_MAXNAMELEN];
        u32                     valid_hooks;
        u32                     num_entries;
        u32                     size;
        u32                     hook_entry[NF_INET_NUMHOOKS];
        u32                     underflow[NF_INET_NUMHOOKS];
        u32                     num_counters;
-       compat_uptr_t           counters;       /* struct ipt_counters * */
+       compat_uptr_t           counters;       /* struct xt_counters * */
        struct compat_ipt_entry entries[0];
 };
 
@@ -1416,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
                          unsigned int *size, struct xt_counters *counters,
                          unsigned int i)
 {
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct compat_ipt_entry __user *ce;
        u_int16_t target_offset, next_offset;
        compat_uint_t origsize;
@@ -1451,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ipt_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
                       const char *name,
                       const struct ipt_ip *ip,
                       unsigned int hookmask,
@@ -1473,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
 
 static void compat_release_entry(struct compat_ipt_entry *e)
 {
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_entry_match *ematch;
 
        /* Cleanup all matches */
@@ -1494,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
                                  const char *name)
 {
        struct xt_entry_match *ematch;
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        unsigned int entry_offset;
        unsigned int j;
@@ -1576,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
                            unsigned int *size, const char *name,
                            struct xt_table_info *newinfo, unsigned char *base)
 {
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        struct ipt_entry *de;
        unsigned int origsize;
@@ -1884,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk,    int cmd, void __user *user,
 }
 
 struct compat_ipt_get_entries {
-       char name[IPT_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t size;
        struct compat_ipt_entry entrytable[0];
 };
@@ -2039,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
        case IPT_SO_GET_REVISION_MATCH:
        case IPT_SO_GET_REVISION_TARGET: {
-               struct ipt_get_revision rev;
+               struct xt_get_revision rev;
                int target;
 
                if (*len != sizeof(rev)) {
@@ -2176,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
 
 static struct xt_target ipt_builtin_tg[] __read_mostly = {
        {
-               .name             = IPT_STANDARD_TARGET,
+               .name             = XT_STANDARD_TARGET,
                .targetsize       = sizeof(int),
                .family           = NFPROTO_IPV4,
 #ifdef CONFIG_COMPAT
@@ -2186,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
 #endif
        },
        {
-               .name             = IPT_ERROR_TARGET,
+               .name             = XT_ERROR_TARGET,
                .target           = ipt_error,
-               .targetsize       = IPT_FUNCTION_MAXNAMELEN,
+               .targetsize       = XT_FUNCTION_MAXNAMELEN,
                .family           = NFPROTO_IPV4,
        },
 };
index 915fc17d7ce214a017f1993e0ab82a0603a3f4f6..72ffc8fda2e9faca3ab8c4cf3682df0de4a44faa 100644 (file)
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ipt_LOG.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+                       const struct nf_loginfo *info,
                        const struct sk_buff *skb,
                        unsigned int iphoff)
 {
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
 
        ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
        if (ih == NULL) {
-               printk("TRUNCATED");
+               sb_add(m, "TRUNCATED");
                return;
        }
 
        /* Important fields:
         * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
        /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
-       printk("SRC=%pI4 DST=%pI4 ",
+       sb_add(m, "SRC=%pI4 DST=%pI4 ",
               &ih->saddr, &ih->daddr);
 
        /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
-       printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+       sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
               ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
               ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
 
        /* Max length: 6 "CE DF MF " */
        if (ntohs(ih->frag_off) & IP_CE)
-               printk("CE ");
+               sb_add(m, "CE ");
        if (ntohs(ih->frag_off) & IP_DF)
-               printk("DF ");
+               sb_add(m, "DF ");
        if (ntohs(ih->frag_off) & IP_MF)
-               printk("MF ");
+               sb_add(m, "MF ");
 
        /* Max length: 11 "FRAG:65535 " */
        if (ntohs(ih->frag_off) & IP_OFFSET)
-               printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+               sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
        if ((logflags & IPT_LOG_IPOPT) &&
            ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
                op = skb_header_pointer(skb, iphoff+sizeof(_iph),
                                        optsize, _opt);
                if (op == NULL) {
-                       printk("TRUNCATED");
+                       sb_add(m, "TRUNCATED");
                        return;
                }
 
                /* Max length: 127 "OPT (" 15*4*2chars ") " */
-               printk("OPT (");
+               sb_add(m, "OPT (");
                for (i = 0; i < optsize; i++)
-                       printk("%02X", op[i]);
-               printk(") ");
+                       sb_add(m, "%02X", op[i]);
+               sb_add(m, ") ");
        }
 
        switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
                const struct tcphdr *th;
 
                /* Max length: 10 "PROTO=TCP " */
-               printk("PROTO=TCP ");
+               sb_add(m, "PROTO=TCP ");
 
                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
                th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
                                        sizeof(_tcph), &_tcph);
                if (th == NULL) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
 
                /* Max length: 20 "SPT=65535 DPT=65535 " */
-               printk("SPT=%u DPT=%u ",
+               sb_add(m, "SPT=%u DPT=%u ",
                       ntohs(th->source), ntohs(th->dest));
                /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
                if (logflags & IPT_LOG_TCPSEQ)
-                       printk("SEQ=%u ACK=%u ",
+                       sb_add(m, "SEQ=%u ACK=%u ",
                               ntohl(th->seq), ntohl(th->ack_seq));
                /* Max length: 13 "WINDOW=65535 " */
-               printk("WINDOW=%u ", ntohs(th->window));
+               sb_add(m, "WINDOW=%u ", ntohs(th->window));
                /* Max length: 9 "RES=0x3F " */
-               printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+               sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
                /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
                if (th->cwr)
-                       printk("CWR ");
+                       sb_add(m, "CWR ");
                if (th->ece)
-                       printk("ECE ");
+                       sb_add(m, "ECE ");
                if (th->urg)
-                       printk("URG ");
+                       sb_add(m, "URG ");
                if (th->ack)
-                       printk("ACK ");
+                       sb_add(m, "ACK ");
                if (th->psh)
-                       printk("PSH ");
+                       sb_add(m, "PSH ");
                if (th->rst)
-                       printk("RST ");
+                       sb_add(m, "RST ");
                if (th->syn)
-                       printk("SYN ");
+                       sb_add(m, "SYN ");
                if (th->fin)
-                       printk("FIN ");
+                       sb_add(m, "FIN ");
                /* Max length: 11 "URGP=65535 " */
-               printk("URGP=%u ", ntohs(th->urg_ptr));
+               sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
                if ((logflags & IPT_LOG_TCPOPT) &&
                    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
                                                iphoff+ih->ihl*4+sizeof(_tcph),
                                                optsize, _opt);
                        if (op == NULL) {
-                               printk("TRUNCATED");
+                               sb_add(m, "TRUNCATED");
                                return;
                        }
 
                        /* Max length: 127 "OPT (" 15*4*2chars ") " */
-                       printk("OPT (");
+                       sb_add(m, "OPT (");
                        for (i = 0; i < optsize; i++)
-                               printk("%02X", op[i]);
-                       printk(") ");
+                               sb_add(m, "%02X", op[i]);
+                       sb_add(m, ") ");
                }
                break;
        }
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
                if (ih->protocol == IPPROTO_UDP)
                        /* Max length: 10 "PROTO=UDP "     */
-                       printk("PROTO=UDP " );
+                       sb_add(m, "PROTO=UDP " );
                else    /* Max length: 14 "PROTO=UDPLITE " */
-                       printk("PROTO=UDPLITE ");
+                       sb_add(m, "PROTO=UDPLITE ");
 
                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
                uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
                                        sizeof(_udph), &_udph);
                if (uh == NULL) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
 
                /* Max length: 20 "SPT=65535 DPT=65535 " */
-               printk("SPT=%u DPT=%u LEN=%u ",
+               sb_add(m, "SPT=%u DPT=%u LEN=%u ",
                       ntohs(uh->source), ntohs(uh->dest),
                       ntohs(uh->len));
                break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
                            [ICMP_ADDRESSREPLY] = 12 };
 
                /* Max length: 11 "PROTO=ICMP " */
-               printk("PROTO=ICMP ");
+               sb_add(m, "PROTO=ICMP ");
 
                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
                ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
                                         sizeof(_icmph), &_icmph);
                if (ich == NULL) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
 
                /* Max length: 18 "TYPE=255 CODE=255 " */
-               printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+               sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
 
                /* Max length: 25 "INCOMPLETE [65535 bytes] " */
                if (ich->type <= NR_ICMP_TYPES &&
                    required_len[ich->type] &&
                    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
                case ICMP_ECHOREPLY:
                case ICMP_ECHO:
                        /* Max length: 19 "ID=65535 SEQ=65535 " */
-                       printk("ID=%u SEQ=%u ",
+                       sb_add(m, "ID=%u SEQ=%u ",
                               ntohs(ich->un.echo.id),
                               ntohs(ich->un.echo.sequence));
                        break;
 
                case ICMP_PARAMETERPROB:
                        /* Max length: 14 "PARAMETER=255 " */
-                       printk("PARAMETER=%u ",
+                       sb_add(m, "PARAMETER=%u ",
                               ntohl(ich->un.gateway) >> 24);
                        break;
                case ICMP_REDIRECT:
                        /* Max length: 24 "GATEWAY=255.255.255.255 " */
-                       printk("GATEWAY=%pI4 ", &ich->un.gateway);
+                       sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
                        /* Fall through */
                case ICMP_DEST_UNREACH:
                case ICMP_SOURCE_QUENCH:
                case ICMP_TIME_EXCEEDED:
                        /* Max length: 3+maxlen */
                        if (!iphoff) { /* Only recurse once. */
-                               printk("[");
-                               dump_packet(info, skb,
+                               sb_add(m, "[");
+                               dump_packet(m, info, skb,
                                            iphoff + ih->ihl*4+sizeof(_icmph));
-                               printk("] ");
+                               sb_add(m, "] ");
                        }
 
                        /* Max length: 10 "MTU=65535 " */
                        if (ich->type == ICMP_DEST_UNREACH &&
                            ich->code == ICMP_FRAG_NEEDED)
-                               printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+                               sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
                }
                break;
        }
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
                        break;
 
                /* Max length: 9 "PROTO=AH " */
-               printk("PROTO=AH ");
+               sb_add(m, "PROTO=AH ");
 
                /* Max length: 25 "INCOMPLETE [65535 bytes] " */
                ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
                                        sizeof(_ahdr), &_ahdr);
                if (ah == NULL) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
 
                /* Length: 15 "SPI=0xF1234567 " */
-               printk("SPI=0x%x ", ntohl(ah->spi));
+               sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
                break;
        }
        case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
                const struct ip_esp_hdr *eh;
 
                /* Max length: 10 "PROTO=ESP " */
-               printk("PROTO=ESP ");
+               sb_add(m, "PROTO=ESP ");
 
                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
                eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
                                        sizeof(_esph), &_esph);
                if (eh == NULL) {
-                       printk("INCOMPLETE [%u bytes] ",
+                       sb_add(m, "INCOMPLETE [%u bytes] ",
                               skb->len - iphoff - ih->ihl*4);
                        break;
                }
 
                /* Length: 15 "SPI=0xF1234567 " */
-               printk("SPI=0x%x ", ntohl(eh->spi));
+               sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
                break;
        }
        /* Max length: 10 "PROTO 255 " */
        default:
-               printk("PROTO=%u ", ih->protocol);
+               sb_add(m, "PROTO=%u ", ih->protocol);
        }
 
        /* Max length: 15 "UID=4294967295 " */
        if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
                read_lock_bh(&skb->sk->sk_callback_lock);
                if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-                       printk("UID=%u GID=%u ",
+                       sb_add(m, "UID=%u GID=%u ",
                                skb->sk->sk_socket->file->f_cred->fsuid,
                                skb->sk->sk_socket->file->f_cred->fsgid);
                read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
 
        /* Max length: 16 "MARK=0xFFFFFFFF " */
        if (!iphoff && skb->mark)
-               printk("MARK=0x%x ", skb->mark);
+               sb_add(m, "MARK=0x%x ", skb->mark);
 
        /* Proto    Max log string length */
        /* IP:      40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
        /* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+                           const struct nf_loginfo *info,
                            const struct sk_buff *skb)
 {
        struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
 
        switch (dev->type) {
        case ARPHRD_ETHER:
-               printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+               sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
                       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
                       ntohs(eth_hdr(skb)->h_proto));
                return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
        }
 
 fallback:
-       printk("MAC=");
+       sb_add(m, "MAC=");
        if (dev->hard_header_len &&
            skb->mac_header != skb->network_header) {
                const unsigned char *p = skb_mac_header(skb);
                unsigned int i;
 
-               printk("%02x", *p++);
+               sb_add(m, "%02x", *p++);
                for (i = 1; i < dev->hard_header_len; i++, p++)
-                       printk(":%02x", *p);
+                       sb_add(m, ":%02x", *p);
        }
-       printk(" ");
+       sb_add(m, " ");
 }
 
 static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
               const struct nf_loginfo *loginfo,
               const char *prefix)
 {
+       struct sbuff *m = sb_open();
+
        if (!loginfo)
                loginfo = &default_loginfo;
 
-       spin_lock_bh(&log_lock);
-       printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+       sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
               prefix,
               in ? in->name : "",
               out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
 
                physindev = skb->nf_bridge->physindev;
                if (physindev && in != physindev)
-                       printk("PHYSIN=%s ", physindev->name);
+                       sb_add(m, "PHYSIN=%s ", physindev->name);
                physoutdev = skb->nf_bridge->physoutdev;
                if (physoutdev && out != physoutdev)
-                       printk("PHYSOUT=%s ", physoutdev->name);
+                       sb_add(m, "PHYSOUT=%s ", physoutdev->name);
        }
 #endif
 
        /* MAC logging for input path only. */
        if (in && !out)
-               dump_mac_header(loginfo, skb);
+               dump_mac_header(m, loginfo, skb);
+
+       dump_packet(m, loginfo, skb, 0);
 
-       dump_packet(loginfo, skb, 0);
-       printk("\n");
-       spin_unlock_bh(&log_lock);
+       sb_close(m);
 }
 
 static unsigned int
index c31b876682502c5d6c13523ff086103e49c5fd7f..0f23b3f06df05e7643e1cd337325955dc6942794 100644 (file)
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
 
        /* Try to get same port: if not, try to change it. */
        for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       port = 0;
                        break;
+               }
        }
 
        if (port == 0)
index 8c8632d9b93cead0cd115945a9566d1e57829667..0047923c1f22aff63a6f7030384e7926539d430f 100644 (file)
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
        return rcu_dereference(nf_nat_protos[protonum]);
 }
 
-const struct nf_nat_protocol *
+static const struct nf_nat_protocol *
 nf_nat_proto_find_get(u_int8_t protonum)
 {
        const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
 
        return p;
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
 
-void
+/* Drop the reference on the module that owns protocol handler @p (p->me).
+ * NOTE(review): presumably balances a reference taken by
+ * nf_nat_proto_find_get() -- confirm against its full body.
+ */
+static void
 nf_nat_proto_put(const struct nf_nat_protocol *p)
 {
        module_put(p->me);
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_put);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
@@ -262,11 +260,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
        proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 
        /* Only bother mapping if it's not already in range and unique */
-       if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
-           (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
-            proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
-           !nf_nat_used_tuple(tuple, ct))
-               goto out;
+       if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+               if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+                       if (proto->in_range(tuple, maniptype, &range->min,
+                                           &range->max) &&
+                           (range->min.all == range->max.all ||
+                            !nf_nat_used_tuple(tuple, ct)))
+                               goto out;
+               } else if (!nf_nat_used_tuple(tuple, ct)) {
+                       goto out;
+               }
+       }
 
        /* Last change: get protocol to try to obtain unique tuple. */
        proto->unique_tuple(tuple, range, maniptype, ct);
@@ -458,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
                        return 0;
        }
 
+       if (manip == IP_NAT_MANIP_SRC)
+               statusbit = IPS_SRC_NAT;
+       else
+               statusbit = IPS_DST_NAT;
+
+       /* Invert if this is reply dir. */
+       if (dir == IP_CT_DIR_REPLY)
+               statusbit ^= IPS_NAT_MASK;
+
+       if (!(ct->status & statusbit))
+               return 1;
+
        pr_debug("icmp_reply_translation: translating error %p manip %u "
                 "dir %s\n", skb, manip,
                 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 
        /* Change outer to look the reply to an incoming packet
         * (proto 0 means don't invert per-proto part). */
-       if (manip == IP_NAT_MANIP_SRC)
-               statusbit = IPS_SRC_NAT;
-       else
-               statusbit = IPS_DST_NAT;
-
-       /* Invert if this is reply dir. */
-       if (dir == IP_CT_DIR_REPLY)
-               statusbit ^= IPS_NAT_MASK;
-
-       if (ct->status & statusbit) {
-               nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-               if (!manip_pkt(0, skb, 0, &target, manip))
-                       return 0;
-       }
+       nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+       if (!manip_pkt(0, skb, 0, &target, manip))
+               return 0;
 
        return 1;
 }
index 86e0e84ff0a04fe09cd3b5d67dddcf93457ffa15..dc73abb3fe27ecea0537ab8fa2a5e029435fb783 100644 (file)
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 
        /* Try to get same port: if not, try to change it. */
        for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       port = 0;
                        break;
+               }
        }
 
        if (port == 0)
index 5045196d853c7878050161ea5e43e1aac26eff83..790f3160e0121c19ed5f2bef07388656ae7310a6 100644 (file)
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
        /* Try to get a pair of ports. */
        for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
             nated_port != 0; nated_port += 2) {
+               int ret;
+
                rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
-               if (nf_ct_expect_related(rtp_exp) == 0) {
+               ret = nf_ct_expect_related(rtp_exp);
+               if (ret == 0) {
                        rtcp_exp->tuple.dst.u.udp.port =
                            htons(nated_port + 1);
-                       if (nf_ct_expect_related(rtcp_exp) == 0)
+                       ret = nf_ct_expect_related(rtcp_exp);
+                       if (ret == 0)
+                               break;
+                       else if (ret != -EBUSY) {
+                               nf_ct_unexpect_related(rtp_exp);
+                               nated_port = 0;
                                break;
-                       nf_ct_unexpect_related(rtp_exp);
+                       }
+               } else if (ret != -EBUSY) {
+                       nated_port = 0;
+                       break;
                }
        }
 
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 
        /* Try to get same port: if not, try to change it. */
        for (; nated_port != 0; nated_port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(nated_port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       nated_port = 0;
                        break;
+               }
        }
 
        if (nated_port == 0) {  /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 
        /* Try to get same port: if not, try to change it. */
        for (; nated_port != 0; nated_port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(nated_port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
                        break;
+               else if (ret != -EBUSY) {
+                       nated_port = 0;
+                       break;
+               }
        }
 
        if (nated_port == 0) {  /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 
        /* Try to get same port: if not, try to change it. */
        for (; nated_port != 0; nated_port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(nated_port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       nated_port = 0;
                        break;
+               }
        }
 
        if (nated_port == 0) {  /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 
        /* Try to get same port: if not, try to change it. */
        for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(nated_port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
                        break;
+               else if (ret != -EBUSY) {
+                       nated_port = 0;
+                       break;
+               }
        }
 
        if (nated_port == 0) {  /* No port available */
index 4a0c6b548eee22f1dfc0e6d01187f209319f10d7..31427fb57aa8abdc299bc1a2c1ab2ebc3929b543 100644 (file)
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 }
 EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
 
+/* Update the transport-layer checksum at @check after a packet has been
+ * mangled.  Common helper for the TCP and UDP mangling paths.
+ *
+ * @skb:     packet being mangled; ip_summed, csum_start and csum_offset
+ *           may be rewritten here
+ * @iph:     IPv4 header (saddr, daddr, protocol and ihl are read)
+ * @data:    start of the transport header; summed for @datalen bytes
+ * @datalen: new length covered by the checksum
+ * @check:   checksum field to update; its offset from @data is used as
+ *           csum_offset, so it is expected to lie inside @data
+ * @oldlen:  length before mangling; only used when the skb is already
+ *           CHECKSUM_PARTIAL
+ */
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+                       int datalen, __sum16 *check, int oldlen)
+{
+       struct rtable *rt = skb_rtable(skb);
+
+       if (skb->ip_summed != CHECKSUM_PARTIAL) {
+               /* Route is not flagged RTCF_LOCAL and the device has
+                * NETIF_F_V4_CSUM set: switch the skb to CHECKSUM_PARTIAL
+                * and seed the field with the inverted pseudo-header sum.
+                */
+               if (!(rt->rt_flags & RTCF_LOCAL) &&
+                   skb->dev->features & NETIF_F_V4_CSUM) {
+                       skb->ip_summed = CHECKSUM_PARTIAL;
+                       skb->csum_start = skb_headroom(skb) +
+                                         skb_network_offset(skb) +
+                                         iph->ihl * 4;
+                       skb->csum_offset = (void *)check - data;
+                       *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+                                                   datalen, iph->protocol, 0);
+               } else {
+                       /* Compute the full checksum here.  The field is
+                        * zeroed first so its old value does not feed into
+                        * csum_partial().
+                        */
+                       *check = 0;
+                       *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+                                                  datalen, iph->protocol,
+                                                  csum_partial(data, datalen,
+                                                               0));
+                       /* For UDP, a computed checksum of 0 is stored as
+                        * CSUM_MANGLED_0 instead.
+                        */
+                       if (iph->protocol == IPPROTO_UDP && !*check)
+                               *check = CSUM_MANGLED_0;
+               }
+       } else
+               /* Already CHECKSUM_PARTIAL: only fold the length change
+                * (oldlen -> datalen) into the existing value.
+                */
+               inet_proto_csum_replace2(check, skb,
+                                        htons(oldlen), htons(datalen), 1);
+}
+
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
                               const char *rep_buffer,
                               unsigned int rep_len, bool adjust)
 {
-       struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;
        struct tcphdr *tcph;
        int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
                        match_offset, match_len, rep_buffer, rep_len);
 
        datalen = skb->len - iph->ihl*4;
-       if (skb->ip_summed != CHECKSUM_PARTIAL) {
-               if (!(rt->rt_flags & RTCF_LOCAL) &&
-                   skb->dev->features & NETIF_F_V4_CSUM) {
-                       skb->ip_summed = CHECKSUM_PARTIAL;
-                       skb->csum_start = skb_headroom(skb) +
-                                         skb_network_offset(skb) +
-                                         iph->ihl * 4;
-                       skb->csum_offset = offsetof(struct tcphdr, check);
-                       tcph->check = ~tcp_v4_check(datalen,
-                                                   iph->saddr, iph->daddr, 0);
-               } else {
-                       tcph->check = 0;
-                       tcph->check = tcp_v4_check(datalen,
-                                                  iph->saddr, iph->daddr,
-                                                  csum_partial(tcph,
-                                                               datalen, 0));
-               }
-       } else
-               inet_proto_csum_replace2(&tcph->check, skb,
-                                        htons(oldlen), htons(datalen), 1);
+       nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
 
        if (adjust && rep_len != match_len)
                nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
                         const char *rep_buffer,
                         unsigned int rep_len)
 {
-       struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;
        struct udphdr *udph;
        int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
        if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
                return 1;
 
-       if (skb->ip_summed != CHECKSUM_PARTIAL) {
-               if (!(rt->rt_flags & RTCF_LOCAL) &&
-                   skb->dev->features & NETIF_F_V4_CSUM) {
-                       skb->ip_summed = CHECKSUM_PARTIAL;
-                       skb->csum_start = skb_headroom(skb) +
-                                         skb_network_offset(skb) +
-                                         iph->ihl * 4;
-                       skb->csum_offset = offsetof(struct udphdr, check);
-                       udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-                                                        datalen, IPPROTO_UDP,
-                                                        0);
-               } else {
-                       udph->check = 0;
-                       udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-                                                       datalen, IPPROTO_UDP,
-                                                       csum_partial(udph,
-                                                                    datalen, 0));
-                       if (!udph->check)
-                               udph->check = CSUM_MANGLED_0;
-               }
-       } else
-               inet_proto_csum_replace2(&udph->check, skb,
-                                        htons(oldlen), htons(datalen), 1);
+       nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
 
        return 1;
 }
index ea83a886b03e6ac3204f410047542a9edea8c4dc..535e1a80235688480bb9b58d2492291183e7703a 100644 (file)
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
 
        /* Try to get same port: if not, try to change it. */
        for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+               int ret;
+
                exp->tuple.dst.u.tcp.port = htons(port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       port = 0;
                        break;
+               }
        }
 
        if (port == 0)
index ebbd319f62f56741ea2d22da8529cda0586089a0..21c30426480b0d08cdc10a0257ee2f6ce736840f 100644 (file)
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 {
        /* Force range to this IP; let proto decide mapping for
           per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
-          Use reply in case it's already been mangled (eg local packet).
        */
-       __be32 ip
-               = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
-                  ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
-                  : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-       struct nf_nat_range range
-               = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
-       pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+       struct nf_nat_range range;
+
+       range.flags = 0;
+       pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+                HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+                &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+                &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
        return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
 }
 
index 11b538deaaec1f996505d387d549b3de6dd4bb37..e40cf7816fdbb4e0088efd200a46cbd3a9041ed2 100644 (file)
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
        exp->expectfn = ip_nat_sip_expected;
 
        for (; port != 0; port++) {
+               int ret;
+
                exp->tuple.dst.u.udp.port = htons(port);
-               if (nf_ct_expect_related(exp) == 0)
+               ret = nf_ct_expect_related(exp);
+               if (ret == 0)
+                       break;
+               else if (ret != -EBUSY) {
+                       port = 0;
                        break;
+               }
        }
 
        if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
        /* Try to get same pair of ports: if not, try to change them. */
        for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
             port != 0; port += 2) {
+               int ret;
+
                rtp_exp->tuple.dst.u.udp.port = htons(port);
-               if (nf_ct_expect_related(rtp_exp) != 0)
+               ret = nf_ct_expect_related(rtp_exp);
+               if (ret == -EBUSY)
                        continue;
+               else if (ret < 0) {
+                       port = 0;
+                       break;
+               }
                rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
-               if (nf_ct_expect_related(rtcp_exp) == 0)
+               ret = nf_ct_expect_related(rtcp_exp);
+               if (ret == 0)
                        break;
-               nf_ct_unexpect_related(rtp_exp);
+               else if (ret != -EBUSY) {
+                       nf_ct_unexpect_related(rtp_exp);
+                       port = 0;
+                       break;
+               }
        }
 
        if (port == 0)
index a0232f3a358b2b8fc6cd98b96a98f6b5cc69f1f6..8f8527d4168225be9429766df1d1e2c085057868 100644 (file)
@@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
-               goto exit;
+               goto exit_nonewsk;
 
        newsk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(newsk, dst);
@@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        }
 #endif
 
+       if (__inet_inherit_port(sk, newsk) < 0) {
+               sock_put(newsk);
+               goto exit;
+       }
        __inet_hash_nolisten(newsk, NULL);
-       __inet_inherit_port(sk, newsk);
 
        return newsk;
 
 exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+       dst_release(dst);
 exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-       dst_release(dst);
        return NULL;
 }
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
index 60220985bb806ecf5b366315bdfe5508757c86f8..54e8e42f7a88ad675aae5fb4782d6ce36d9a4ba9 100644 (file)
@@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                         */
                        v4addr = LOOPBACK4_IPV6;
                        if (!(addr_type & IPV6_ADDR_MULTICAST)) {
-                               if (!ipv6_chk_addr(net, &addr->sin6_addr,
+                               if (!inet->transparent &&
+                                   !ipv6_chk_addr(net, &addr->sin6_addr,
                                                   dev, 0)) {
                                        err = -EADDRNOTAVAIL;
                                        goto out_unlock;
index ef371aa01ac50724f9dff9cbf7d084e062d844c9..320bdb877eed2ff61da25c6c9a1ac2f6cc00baac 100644 (file)
@@ -577,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
                u8 *ptr = nh + opt->dst1;
                put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
        }
+       if (np->rxopt.bits.rxorigdstaddr) {
+               struct sockaddr_in6 sin6;
+               u16 *ports = (u16 *) skb_transport_header(skb);
+
+               if (skb_transport_offset(skb) + 4 <= skb->len) {
+                       /* All current transport protocols have the port numbers in the
+                        * first four bytes of the transport header and this function is
+                        * written with this assumption in mind.
+                        */
+
+                       sin6.sin6_family = AF_INET6;
+                       ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
+                       sin6.sin6_port = ports[1];
+                       sin6.sin6_flowinfo = 0;
+                       sin6.sin6_scope_id = 0;
+
+                       put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+               }
+       }
        return 0;
 }
 
index a7f66bc8f0b0ef2a8b0eee454c1e103344eda417..0553867a317f4466b31df3bd6d2695e180537be9 100644 (file)
@@ -342,6 +342,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                retv = 0;
                break;
 
+       case IPV6_TRANSPARENT:
+               if (optlen < sizeof(int))
+                       goto e_inval;
+               /* IPv6 has no separate transparent bit; reuse the one in the IPv4 (inet) socket. */
+               inet_sk(sk)->transparent = valbool;
+               retv = 0;
+               break;
+
+       case IPV6_RECVORIGDSTADDR:
+               if (optlen < sizeof(int))
+                       goto e_inval;
+               np->rxopt.bits.rxorigdstaddr = valbool;
+               retv = 0;
+               break;
+
        case IPV6_HOPOPTS:
        case IPV6_RTHDRDSTOPTS:
        case IPV6_RTHDR:
@@ -1104,6 +1119,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
                break;
        }
 
+       case IPV6_TRANSPARENT:
+               val = inet_sk(sk)->transparent;
+               break;
+
+       case IPV6_RECVORIGDSTADDR:
+               val = np->rxopt.bits.rxorigdstaddr;
+               break;
+
        case IPV6_UNICAST_HOPS:
        case IPV6_MULTICAST_HOPS:
        {
index 29d643bcafa4b38be01c7ff40cb65760f8c9af17..44d2eeac089b4fedc3a65c312263b667231838dc 100644 (file)
@@ -132,10 +132,10 @@ config IP6_NF_MATCH_RT
 # The targets
 config IP6_NF_TARGET_HL
        tristate '"HL" hoplimit target support'
-       depends on NETFILTER_ADVANCED
+       depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
        select NETFILTER_XT_TARGET_HL
        ---help---
-       This is a backwards-compat option for the user's convenience
+       This is a backwards-compatible option for the user's convenience
        (e.g. when running oldconfig). It selects
        CONFIG_NETFILTER_XT_TARGET_HL.
 
index aafbba30c899fac569184a5ef9943f13ecfdb3b9..3f8e4a3d83ce107bdf2280c35fac189f8a2a2838 100644 (file)
@@ -11,10 +11,11 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
 obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
 
 # objects for l3 independent conntrack
-nf_conntrack_ipv6-objs  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
+nf_conntrack_ipv6-objs  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
+nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
 
 # l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
index 6b331e9b57065c32c94ac15f53bd592b2020c7af..51df035897e77dfa5b89e6328817cfcb638702cb 100644 (file)
@@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6)
        return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
 }
 
-static inline const struct ip6t_entry_target *
+static inline const struct xt_entry_target *
 ip6t_get_target_c(const struct ip6t_entry *e)
 {
        return ip6t_get_target((struct ip6t_entry *)e);
@@ -260,9 +260,9 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
                      const char *hookname, const char **chainname,
                      const char **comment, unsigned int *rulenum)
 {
-       const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s);
+       const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
 
-       if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
+       if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
                /* Head of user chain: ERROR target with chainname */
                *chainname = t->target.data;
                (*rulenum) = 0;
@@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 
                if (s->target_offset == sizeof(struct ip6t_entry) &&
                    strcmp(t->target.u.kernel.target->name,
-                          IP6T_STANDARD_TARGET) == 0 &&
+                          XT_STANDARD_TARGET) == 0 &&
                    t->verdict < 0 &&
                    unconditional(&s->ipv6)) {
                        /* Tail of chains: STANDARD target (return/policy) */
@@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
        e = get_entry(table_base, private->hook_entry[hook]);
 
        do {
-               const struct ip6t_entry_target *t;
+               const struct xt_entry_target *t;
                const struct xt_entry_match *ematch;
 
                IP_NF_ASSERT(e);
@@ -403,10 +403,10 @@ ip6t_do_table(struct sk_buff *skb,
                if (!t->u.kernel.target->target) {
                        int v;
 
-                       v = ((struct ip6t_standard_target *)t)->verdict;
+                       v = ((struct xt_standard_target *)t)->verdict;
                        if (v < 0) {
                                /* Pop from stack? */
-                               if (v != IP6T_RETURN) {
+                               if (v != XT_RETURN) {
                                        verdict = (unsigned)(-v) - 1;
                                        break;
                                }
@@ -434,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb,
                acpar.targinfo = t->data;
 
                verdict = t->u.kernel.target->target(skb, &acpar);
-               if (verdict == IP6T_CONTINUE)
+               if (verdict == XT_CONTINUE)
                        e = ip6t_next_entry(e);
                else
                        /* Verdict */
@@ -474,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
                e->counters.pcnt = pos;
 
                for (;;) {
-                       const struct ip6t_standard_target *t
+                       const struct xt_standard_target *t
                                = (void *)ip6t_get_target_c(e);
                        int visited = e->comefrom & (1 << hook);
 
@@ -488,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
                        /* Unconditional return/END. */
                        if ((e->target_offset == sizeof(struct ip6t_entry) &&
                             (strcmp(t->target.u.user.name,
-                                    IP6T_STANDARD_TARGET) == 0) &&
+                                    XT_STANDARD_TARGET) == 0) &&
                             t->verdict < 0 &&
                             unconditional(&e->ipv6)) || visited) {
                                unsigned int oldpos, size;
 
                                if ((strcmp(t->target.u.user.name,
-                                           IP6T_STANDARD_TARGET) == 0) &&
+                                           XT_STANDARD_TARGET) == 0) &&
                                    t->verdict < -NF_MAX_VERDICT - 1) {
                                        duprintf("mark_source_chains: bad "
                                                "negative verdict (%i)\n",
@@ -537,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
                                int newpos = t->verdict;
 
                                if (strcmp(t->target.u.user.name,
-                                          IP6T_STANDARD_TARGET) == 0 &&
+                                          XT_STANDARD_TARGET) == 0 &&
                                    newpos >= 0) {
                                        if (newpos > newinfo->size -
                                                sizeof(struct ip6t_entry)) {
@@ -565,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
        return 1;
 }
 
-static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
        struct xt_mtdtor_param par;
 
@@ -581,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e, const char *name)
 {
-       const struct ip6t_entry_target *t;
+       const struct xt_entry_target *t;
 
        if (!ip6_checkentry(&e->ipv6)) {
                duprintf("ip_tables: ip check failed %p %s.\n", e, name);
                return -EINVAL;
        }
 
-       if (e->target_offset + sizeof(struct ip6t_entry_target) >
+       if (e->target_offset + sizeof(struct xt_entry_target) >
            e->next_offset)
                return -EINVAL;
 
@@ -599,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name)
        return 0;
 }
 
-static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
        const struct ip6t_ip6 *ipv6 = par->entryinfo;
        int ret;
@@ -618,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
        struct xt_match *match;
        int ret;
@@ -643,7 +643,7 @@ err:
 
 static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
 {
-       struct ip6t_entry_target *t = ip6t_get_target(e);
+       struct xt_entry_target *t = ip6t_get_target(e);
        struct xt_tgchk_param par = {
                .net       = net,
                .table     = name,
@@ -670,7 +670,7 @@ static int
 find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
                 unsigned int size)
 {
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
        unsigned int j;
@@ -721,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ip6t_entry *e)
 {
-       const struct ip6t_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int verdict;
 
        if (!unconditional(&e->ipv6))
@@ -729,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e)
        t = ip6t_get_target_c(e);
        if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
                return false;
-       verdict = ((struct ip6t_standard_target *)t)->verdict;
+       verdict = ((struct xt_standard_target *)t)->verdict;
        verdict = -verdict - 1;
        return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -752,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
        }
 
        if (e->next_offset
-           < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) {
+           < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
                duprintf("checking: element %p size %u\n",
                         e, e->next_offset);
                return -EINVAL;
@@ -784,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 {
        struct xt_tgdtor_param par;
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_entry_match *ematch;
 
        /* Cleanup all matches */
@@ -985,8 +985,8 @@ copy_entries_to_user(unsigned int total_size,
        /* ... then go back and fix counters and names */
        for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
                unsigned int i;
-               const struct ip6t_entry_match *m;
-               const struct ip6t_entry_target *t;
+               const struct xt_entry_match *m;
+               const struct xt_entry_target *t;
 
                e = (struct ip6t_entry *)(loc_cpu_entry + off);
                if (copy_to_user(userptr + off
@@ -1003,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size,
                        m = (void *)e + i;
 
                        if (copy_to_user(userptr + off + i
-                                        + offsetof(struct ip6t_entry_match,
+                                        + offsetof(struct xt_entry_match,
                                                    u.user.name),
                                         m->u.kernel.match->name,
                                         strlen(m->u.kernel.match->name)+1)
@@ -1015,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size,
 
                t = ip6t_get_target_c(e);
                if (copy_to_user(userptr + off + e->target_offset
-                                + offsetof(struct ip6t_entry_target,
+                                + offsetof(struct xt_entry_target,
                                            u.user.name),
                                 t->u.kernel.target->name,
                                 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1053,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e,
                             const void *base, struct xt_table_info *newinfo)
 {
        const struct xt_entry_match *ematch;
-       const struct ip6t_entry_target *t;
+       const struct xt_entry_target *t;
        unsigned int entry_offset;
        int off, i, ret;
 
@@ -1105,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-       char name[IP6T_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        struct xt_table *t;
        int ret;
 
@@ -1118,7 +1118,7 @@ static int get_info(struct net *net, void __user *user,
        if (copy_from_user(name, user, sizeof(name)) != 0)
                return -EFAULT;
 
-       name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
+       name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
        if (compat)
                xt_compat_lock(AF_INET6);
@@ -1415,14 +1415,14 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 
 #ifdef CONFIG_COMPAT
 struct compat_ip6t_replace {
-       char                    name[IP6T_TABLE_MAXNAMELEN];
+       char                    name[XT_TABLE_MAXNAMELEN];
        u32                     valid_hooks;
        u32                     num_entries;
        u32                     size;
        u32                     hook_entry[NF_INET_NUMHOOKS];
        u32                     underflow[NF_INET_NUMHOOKS];
        u32                     num_counters;
-       compat_uptr_t           counters;       /* struct ip6t_counters * */
+       compat_uptr_t           counters;       /* struct xt_counters * */
        struct compat_ip6t_entry entries[0];
 };
 
@@ -1431,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
                          unsigned int *size, struct xt_counters *counters,
                          unsigned int i)
 {
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct compat_ip6t_entry __user *ce;
        u_int16_t target_offset, next_offset;
        compat_uint_t origsize;
@@ -1466,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ip6t_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
                       const char *name,
                       const struct ip6t_ip6 *ipv6,
                       unsigned int hookmask,
@@ -1488,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m,
 
 static void compat_release_entry(struct compat_ip6t_entry *e)
 {
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_entry_match *ematch;
 
        /* Cleanup all matches */
@@ -1509,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
                                  const char *name)
 {
        struct xt_entry_match *ematch;
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        unsigned int entry_offset;
        unsigned int j;
@@ -1591,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
                            unsigned int *size, const char *name,
                            struct xt_table_info *newinfo, unsigned char *base)
 {
-       struct ip6t_entry_target *t;
+       struct xt_entry_target *t;
        struct xt_target *target;
        struct ip6t_entry *de;
        unsigned int origsize;
@@ -1899,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 }
 
 struct compat_ip6t_get_entries {
-       char name[IP6T_TABLE_MAXNAMELEN];
+       char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t size;
        struct compat_ip6t_entry entrytable[0];
 };
@@ -2054,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
        case IP6T_SO_GET_REVISION_MATCH:
        case IP6T_SO_GET_REVISION_TARGET: {
-               struct ip6t_get_revision rev;
+               struct xt_get_revision rev;
                int target;
 
                if (*len != sizeof(rev)) {
@@ -2191,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target ip6t_builtin_tg[] __read_mostly = {
        {
-               .name             = IP6T_STANDARD_TARGET,
+               .name             = XT_STANDARD_TARGET,
                .targetsize       = sizeof(int),
                .family           = NFPROTO_IPV6,
 #ifdef CONFIG_COMPAT
@@ -2201,9 +2201,9 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 #endif
        },
        {
-               .name             = IP6T_ERROR_TARGET,
+               .name             = XT_ERROR_TARGET,
                .target           = ip6t_error,
-               .targetsize       = IP6T_FUNCTION_MAXNAMELEN,
+               .targetsize       = XT_FUNCTION_MAXNAMELEN,
                .family           = NFPROTO_IPV6,
        },
 };
index 0a07ae7b933f2fbbac6fbf8cf9741d8bea6882e6..09c88891a753e725d8594edc68941b293d171100 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
 MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
 #include <net/route.h>
 #include <linux/netfilter_ipv6/ip6t_LOG.h>
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+                       const struct nf_loginfo *info,
                        const struct sk_buff *skb, unsigned int ip6hoff,
                        int recurse)
 {
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
 
        ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
        if (ih == NULL) {
-               printk("TRUNCATED");
+               sb_add(m, "TRUNCATED");
                return;
        }
 
        /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-       printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+       sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
        /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-       printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+       sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
               ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
               (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
               ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
                hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
                if (hp == NULL) {
-                       printk("TRUNCATED");
+                       sb_add(m, "TRUNCATED");
                        return;
                }
 
                /* Max length: 48 "OPT (...) " */
                if (logflags & IP6T_LOG_IPOPT)
-                       printk("OPT ( ");
+                       sb_add(m, "OPT ( ");
 
                switch (currenthdr) {
                case IPPROTO_FRAGMENT: {
                        struct frag_hdr _fhdr;
                        const struct frag_hdr *fh;
 
-                       printk("FRAG:");
+                       sb_add(m, "FRAG:");
                        fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
                                                &_fhdr);
                        if (fh == NULL) {
-                               printk("TRUNCATED ");
+                               sb_add(m, "TRUNCATED ");
                                return;
                        }
 
                        /* Max length: 6 "65535 " */
-                       printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+                       sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
 
                        /* Max length: 11 "INCOMPLETE " */
                        if (fh->frag_off & htons(0x0001))
-                               printk("INCOMPLETE ");
+                               sb_add(m, "INCOMPLETE ");
 
-                       printk("ID:%08x ", ntohl(fh->identification));
+                       sb_add(m, "ID:%08x ", ntohl(fh->identification));
 
                        if (ntohs(fh->frag_off) & 0xFFF8)
                                fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
                case IPPROTO_HOPOPTS:
                        if (fragment) {
                                if (logflags & IP6T_LOG_IPOPT)
-                                       printk(")");
+                                       sb_add(m, ")");
                                return;
                        }
                        hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
                                const struct ip_auth_hdr *ah;
 
                                /* Max length: 3 "AH " */
-                               printk("AH ");
+                               sb_add(m, "AH ");
 
                                if (fragment) {
-                                       printk(")");
+                                       sb_add(m, ")");
                                        return;
                                }
 
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
                                         * Max length: 26 "INCOMPLETE [65535
                                         *  bytes] )"
                                         */
-                                       printk("INCOMPLETE [%u bytes] )",
+                                       sb_add(m, "INCOMPLETE [%u bytes] )",
                                               skb->len - ptr);
                                        return;
                                }
 
                                /* Length: 15 "SPI=0xF1234567 */
-                               printk("SPI=0x%x ", ntohl(ah->spi));
+                               sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
 
                        }
 
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
                                const struct ip_esp_hdr *eh;
 
                                /* Max length: 4 "ESP " */
-                               printk("ESP ");
+                               sb_add(m, "ESP ");
 
                                if (fragment) {
-                                       printk(")");
+                                       sb_add(m, ")");
                                        return;
                                }
 
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
                                eh = skb_header_pointer(skb, ptr, sizeof(_esph),
                                                        &_esph);
                                if (eh == NULL) {
-                                       printk("INCOMPLETE [%u bytes] )",
+                                       sb_add(m, "INCOMPLETE [%u bytes] )",
                                               skb->len - ptr);
                                        return;
                                }
 
                                /* Length: 16 "SPI=0xF1234567 )" */
-                               printk("SPI=0x%x )", ntohl(eh->spi) );
+                               sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
 
                        }
                        return;
                default:
                        /* Max length: 20 "Unknown Ext Hdr 255" */
-                       printk("Unknown Ext Hdr %u", currenthdr);
+                       sb_add(m, "Unknown Ext Hdr %u", currenthdr);
                        return;
                }
                if (logflags & IP6T_LOG_IPOPT)
-                       printk(") ");
+                       sb_add(m, ") ");
 
                currenthdr = hp->nexthdr;
                ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
                const struct tcphdr *th;
 
                /* Max length: 10 "PROTO=TCP " */
-               printk("PROTO=TCP ");
+               sb_add(m, "PROTO=TCP ");
 
                if (fragment)
                        break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
                /* Max length: 25 "INCOMPLETE [65535 bytes] " */
                th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
                if (th == NULL) {
-                       printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+                       sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
                        return;
                }
 
                /* Max length: 20 "SPT=65535 DPT=65535 " */
-               printk("SPT=%u DPT=%u ",
+               sb_add(m, "SPT=%u DPT=%u ",
                       ntohs(th->source), ntohs(th->dest));
                /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
                if (logflags & IP6T_LOG_TCPSEQ)
-                       printk("SEQ=%u ACK=%u ",
+                       sb_add(m, "SEQ=%u ACK=%u ",
                               ntohl(th->seq), ntohl(th->ack_seq));
                /* Max length: 13 "WINDOW=65535 " */
-               printk("WINDOW=%u ", ntohs(th->window));
+               sb_add(m, "WINDOW=%u ", ntohs(th->window));
                /* Max length: 9 "RES=0x3C " */
-               printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+               sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
                /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
                if (th->cwr)
-                       printk("CWR ");
+                       sb_add(m, "CWR ");
                if (th->ece)
-                       printk("ECE ");
+                       sb_add(m, "ECE ");
                if (th->urg)
-                       printk("URG ");
+                       sb_add(m, "URG ");
                if (th->ack)
-                       printk("ACK ");
+                       sb_add(m, "ACK ");
                if (th->psh)
-                       printk("PSH ");
+                       sb_add(m, "PSH ");
                if (th->rst)
-                       printk("RST ");
+                       sb_add(m, "RST ");
                if (th->syn)
-                       printk("SYN ");
+                       sb_add(m, "SYN ");
                if (th->fin)
-                       printk("FIN ");
+                       sb_add(m, "FIN ");
                /* Max length: 11 "URGP=65535 " */
-               printk("URGP=%u ", ntohs(th->urg_ptr));
+               sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
                if ((logflags & IP6T_LOG_TCPOPT) &&
                    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
                                                ptr + sizeof(struct tcphdr),
                                                optsize, _opt);
                        if (op == NULL) {
-                               printk("OPT (TRUNCATED)");
+                               sb_add(m, "OPT (TRUNCATED)");
                                return;
                        }
 
                        /* Max length: 127 "OPT (" 15*4*2chars ") " */
-                       printk("OPT (");
+                       sb_add(m, "OPT (");
                        for (i =0; i < optsize; i++)
-                               printk("%02X", op[i]);
-                       printk(") ");
+                               sb_add(m, "%02X", op[i]);
+                       sb_add(m, ") ");
                }
                break;
        }
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
                if (currenthdr == IPPROTO_UDP)
                        /* Max length: 10 "PROTO=UDP "     */
-                       printk("PROTO=UDP " );
+                       sb_add(m, "PROTO=UDP " );
                else    /* Max length: 14 "PROTO=UDPLITE " */
-                       printk("PROTO=UDPLITE ");
+                       sb_add(m, "PROTO=UDPLITE ");
 
                if (fragment)
                        break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
                /* Max length: 25 "INCOMPLETE [65535 bytes] " */
                uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
                if (uh == NULL) {
-                       printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+                       sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
                        return;
                }
 
                /* Max length: 20 "SPT=65535 DPT=65535 " */
-               printk("SPT=%u DPT=%u LEN=%u ",
+               sb_add(m, "SPT=%u DPT=%u LEN=%u ",
                       ntohs(uh->source), ntohs(uh->dest),
                       ntohs(uh->len));
                break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
                const struct icmp6hdr *ic;
 
                /* Max length: 13 "PROTO=ICMPv6 " */
-               printk("PROTO=ICMPv6 ");
+               sb_add(m, "PROTO=ICMPv6 ");
 
                if (fragment)
                        break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
                /* Max length: 25 "INCOMPLETE [65535 bytes] " */
                ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
                if (ic == NULL) {
-                       printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+                       sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
                        return;
                }
 
                /* Max length: 18 "TYPE=255 CODE=255 " */
-               printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+               sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
 
                switch (ic->icmp6_type) {
                case ICMPV6_ECHO_REQUEST:
                case ICMPV6_ECHO_REPLY:
                        /* Max length: 19 "ID=65535 SEQ=65535 " */
-                       printk("ID=%u SEQ=%u ",
+                       sb_add(m, "ID=%u SEQ=%u ",
                                ntohs(ic->icmp6_identifier),
                                ntohs(ic->icmp6_sequence));
                        break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
                case ICMPV6_PARAMPROB:
                        /* Max length: 17 "POINTER=ffffffff " */
-                       printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+                       sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
                        /* Fall through */
                case ICMPV6_DEST_UNREACH:
                case ICMPV6_PKT_TOOBIG:
                case ICMPV6_TIME_EXCEED:
                        /* Max length: 3+maxlen */
                        if (recurse) {
-                               printk("[");
-                               dump_packet(info, skb, ptr + sizeof(_icmp6h),
-                                           0);
-                               printk("] ");
+                               sb_add(m, "[");
+                               dump_packet(m, info, skb,
+                                           ptr + sizeof(_icmp6h), 0);
+                               sb_add(m, "] ");
                        }
 
                        /* Max length: 10 "MTU=65535 " */
                        if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
-                               printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+                               sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
                }
                break;
        }
        /* Max length: 10 "PROTO=255 " */
        default:
-               printk("PROTO=%u ", currenthdr);
+               sb_add(m, "PROTO=%u ", currenthdr);
        }
 
        /* Max length: 15 "UID=4294967295 " */
        if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
                read_lock_bh(&skb->sk->sk_callback_lock);
                if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-                       printk("UID=%u GID=%u ",
+                       sb_add(m, "UID=%u GID=%u ",
                                skb->sk->sk_socket->file->f_cred->fsuid,
                                skb->sk->sk_socket->file->f_cred->fsgid);
                read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info,
 
        /* Max length: 16 "MARK=0xFFFFFFFF " */
        if (!recurse && skb->mark)
-               printk("MARK=0x%x ", skb->mark);
+               sb_add(m, "MARK=0x%x ", skb->mark);
 }
 
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+                           const struct nf_loginfo *info,
                            const struct sk_buff *skb)
 {
        struct net_device *dev = skb->dev;
@@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
 
        switch (dev->type) {
        case ARPHRD_ETHER:
-               printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+               sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
                       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
                       ntohs(eth_hdr(skb)->h_proto));
                return;
@@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
        }
 
 fallback:
-       printk("MAC=");
+       sb_add(m, "MAC=");
        if (dev->hard_header_len &&
            skb->mac_header != skb->network_header) {
                const unsigned char *p = skb_mac_header(skb);
@@ -408,19 +408,19 @@ fallback:
                        p = NULL;
 
                if (p != NULL) {
-                       printk("%02x", *p++);
+                       sb_add(m, "%02x", *p++);
                        for (i = 1; i < len; i++)
-                               printk(":%02x", p[i]);
+                               sb_add(m, ":%02x", p[i]);
                }
-               printk(" ");
+               sb_add(m, " ");
 
                if (dev->type == ARPHRD_SIT) {
                        const struct iphdr *iph =
                                (struct iphdr *)skb_mac_header(skb);
-                       printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+                       sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
                }
        } else
-               printk(" ");
+               sb_add(m, " ");
 }
 
 static struct nf_loginfo default_loginfo = {
@@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf,
                const struct nf_loginfo *loginfo,
                const char *prefix)
 {
+       struct sbuff *m = sb_open();
+
        if (!loginfo)
                loginfo = &default_loginfo;
 
-       spin_lock_bh(&log_lock);
-       printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
-               prefix,
-               in ? in->name : "",
-               out ? out->name : "");
+       sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+              prefix,
+              in ? in->name : "",
+              out ? out->name : "");
 
        /* MAC logging for input path only. */
        if (in && !out)
-               dump_mac_header(loginfo, skb);
+               dump_mac_header(m, loginfo, skb);
+
+       dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
 
-       dump_packet(loginfo, skb, skb_network_offset(skb), 1);
-       printk("\n");
-       spin_unlock_bh(&log_lock);
+       sb_close(m);
 }
 
 static unsigned int
index ff43461704be5c9433d7468f1fe99c63aab7d613..c8af58b225620795af240156ae1e5b735fa78a2d 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
-#include <linux/sysctl.h>
 #include <net/ipv6.h>
 #include <net/inet_frag.h>
 
@@ -29,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <net/netfilter/nf_log.h>
 
 static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
@@ -189,53 +189,6 @@ out:
        return nf_conntrack_confirm(skb);
 }
 
-static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
-                                               struct sk_buff *skb)
-{
-       u16 zone = NF_CT_DEFAULT_ZONE;
-
-       if (skb->nfct)
-               zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-       if (skb->nf_bridge &&
-           skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
-               return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
-       if (hooknum == NF_INET_PRE_ROUTING)
-               return IP6_DEFRAG_CONNTRACK_IN + zone;
-       else
-               return IP6_DEFRAG_CONNTRACK_OUT + zone;
-
-}
-
-static unsigned int ipv6_defrag(unsigned int hooknum,
-                               struct sk_buff *skb,
-                               const struct net_device *in,
-                               const struct net_device *out,
-                               int (*okfn)(struct sk_buff *))
-{
-       struct sk_buff *reasm;
-
-       /* Previously seen (loopback)?  */
-       if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
-               return NF_ACCEPT;
-
-       reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
-       /* queued */
-       if (reasm == NULL)
-               return NF_STOLEN;
-
-       /* error occured or not fragmented */
-       if (reasm == skb)
-               return NF_ACCEPT;
-
-       nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
-                          (struct net_device *)out, okfn);
-
-       return NF_STOLEN;
-}
-
 static unsigned int __ipv6_conntrack_in(struct net *net,
                                        unsigned int hooknum,
                                        struct sk_buff *skb,
@@ -287,13 +240,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 }
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
-       {
-               .hook           = ipv6_defrag,
-               .owner          = THIS_MODULE,
-               .pf             = NFPROTO_IPV6,
-               .hooknum        = NF_INET_PRE_ROUTING,
-               .priority       = NF_IP6_PRI_CONNTRACK_DEFRAG,
-       },
        {
                .hook           = ipv6_conntrack_in,
                .owner          = THIS_MODULE,
@@ -308,13 +254,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP6_PRI_CONNTRACK,
        },
-       {
-               .hook           = ipv6_defrag,
-               .owner          = THIS_MODULE,
-               .pf             = NFPROTO_IPV6,
-               .hooknum        = NF_INET_LOCAL_OUT,
-               .priority       = NF_IP6_PRI_CONNTRACK_DEFRAG,
-       },
        {
                .hook           = ipv6_confirm,
                .owner          = THIS_MODULE,
@@ -386,10 +325,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
        .nlattr_tuple_size      = ipv6_nlattr_tuple_size,
        .nlattr_to_tuple        = ipv6_nlattr_to_tuple,
        .nla_policy             = ipv6_nla_policy,
-#endif
-#ifdef CONFIG_SYSCTL
-       .ctl_table_path         = nf_net_netfilter_sysctl_path,
-       .ctl_table              = nf_ct_ipv6_sysctl_table,
 #endif
        .me                     = THIS_MODULE,
 };
@@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
        int ret = 0;
 
        need_conntrack();
+       nf_defrag_ipv6_enable();
 
-       ret = nf_ct_frag6_init();
-       if (ret < 0) {
-               pr_err("nf_conntrack_ipv6: can't initialize frag6.\n");
-               return ret;
-       }
        ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
        if (ret < 0) {
                pr_err("nf_conntrack_ipv6: can't register tcp.\n");
-               goto cleanup_frag6;
+               return ret;
        }
 
        ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
@@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
        nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
  cleanup_tcp:
        nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_frag6:
-       nf_ct_frag6_cleanup();
        return ret;
 }
 
@@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
        nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
        nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
        nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
-       nf_ct_frag6_cleanup();
 }
 
 module_init(nf_conntrack_l3proto_ipv6_init);
index 138a8b36270694ec96ca4a21f51a55b0c1d62e98..489d71b844ac9ba7c85d7f5612922e4ceeb712cc 100644 (file)
@@ -73,7 +73,7 @@ static struct inet_frags nf_frags;
 static struct netns_frags nf_init_frags;
 
 #ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_ipv6_sysctl_table[] = {
+struct ctl_table nf_ct_frag6_sysctl_table[] = {
        {
                .procname       = "nf_conntrack_frag6_timeout",
                .data           = &nf_init_frags.timeout,
@@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = {
        },
        { }
 };
+
+static struct ctl_table_header *nf_ct_frag6_sysctl_header;
 #endif
 
 static unsigned int nf_hashfn(struct inet_frag_queue *q)
@@ -623,11 +625,21 @@ int nf_ct_frag6_init(void)
        inet_frags_init_net(&nf_init_frags);
        inet_frags_init(&nf_frags);
 
+       nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
+                                                         nf_ct_frag6_sysctl_table);
+       if (!nf_ct_frag6_sysctl_header) {
+               inet_frags_fini(&nf_frags);
+               return -ENOMEM;
+       }
+
        return 0;
 }
 
 void nf_ct_frag6_cleanup(void)
 {
+       unregister_sysctl_table(nf_ct_frag6_sysctl_header);
+       nf_ct_frag6_sysctl_header = NULL;
+
        inet_frags_fini(&nf_frags);
 
        nf_init_frags.low_thresh = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644 (file)
index 0000000..99abfb5
--- /dev/null
@@ -0,0 +1,131 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+                                               struct sk_buff *skb)
+{
+       u16 zone = NF_CT_DEFAULT_ZONE;
+
+       if (skb->nfct)
+               zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+       if (skb->nf_bridge &&
+           skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+               return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+       if (hooknum == NF_INET_PRE_ROUTING)
+               return IP6_DEFRAG_CONNTRACK_IN + zone;
+       else
+               return IP6_DEFRAG_CONNTRACK_OUT + zone;
+
+}
+
+static unsigned int ipv6_defrag(unsigned int hooknum,
+                               struct sk_buff *skb,
+                               const struct net_device *in,
+                               const struct net_device *out,
+                               int (*okfn)(struct sk_buff *))
+{
+       struct sk_buff *reasm;
+
+       /* Previously seen (loopback)?  */
+       if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+               return NF_ACCEPT;
+
+       reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+       /* queued */
+       if (reasm == NULL)
+               return NF_STOLEN;
+
+       /* error occured or not fragmented */
+       if (reasm == skb)
+               return NF_ACCEPT;
+
+       nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+                          (struct net_device *)out, okfn);
+
+       return NF_STOLEN;
+}
+
+static struct nf_hook_ops ipv6_defrag_ops[] = {
+       {
+               .hook           = ipv6_defrag,
+               .owner          = THIS_MODULE,
+               .pf             = NFPROTO_IPV6,
+               .hooknum        = NF_INET_PRE_ROUTING,
+               .priority       = NF_IP6_PRI_CONNTRACK_DEFRAG,
+       },
+       {
+               .hook           = ipv6_defrag,
+               .owner          = THIS_MODULE,
+               .pf             = NFPROTO_IPV6,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = NF_IP6_PRI_CONNTRACK_DEFRAG,
+       },
+};
+
+static int __init nf_defrag_init(void)
+{
+       int ret = 0;
+
+       ret = nf_ct_frag6_init();
+       if (ret < 0) {
+               pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+               return ret;
+       }
+       ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+       if (ret < 0) {
+               pr_err("nf_defrag_ipv6: can't register hooks\n");
+               goto cleanup_frag6;
+       }
+       return ret;
+
+cleanup_frag6:
+       nf_ct_frag6_cleanup();
+       return ret;
+
+}
+
+static void __exit nf_defrag_fini(void)
+{
+       nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+       nf_ct_frag6_cleanup();
+}
+
+void nf_defrag_ipv6_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
index 8d93f6d81979c68111bbd05a1a94792b43846edc..7e41e2cbb85e39b78ea3c66c5b85273fc04fa036 100644 (file)
@@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
        newsk = tcp_create_openreq_child(sk, req, skb);
        if (newsk == NULL)
-               goto out;
+               goto out_nonewsk;
 
        /*
         * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        }
 #endif
 
+       if (__inet_inherit_port(sk, newsk) < 0) {
+               sock_put(newsk);
+               goto out;
+       }
        __inet6_hash(newsk, NULL);
-       __inet_inherit_port(sk, newsk);
 
        return newsk;
 
 out_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out:
-       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+out_nonewsk:
        if (opt && opt != np->opt)
                sock_kfree_s(sk, opt, opt->tot_len);
        dst_release(dst);
+out:
+       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return NULL;
 }
 
index 5acb3560ff15267021266f59c7b2102f633f3d6a..c84dad432114ef0d885b29244bf9df0854f750e2 100644 (file)
@@ -122,8 +122,8 @@ static void udp_v6_rehash(struct sock *sk)
 
 static inline int compute_score(struct sock *sk, struct net *net,
                                unsigned short hnum,
-                               struct in6_addr *saddr, __be16 sport,
-                               struct in6_addr *daddr, __be16 dport,
+                               const struct in6_addr *saddr, __be16 sport,
+                               const struct in6_addr *daddr, __be16 dport,
                                int dif)
 {
        int score = -1;
@@ -239,8 +239,8 @@ exact_match:
 }
 
 static struct sock *__udp6_lib_lookup(struct net *net,
-                                     struct in6_addr *saddr, __be16 sport,
-                                     struct in6_addr *daddr, __be16 dport,
+                                     const struct in6_addr *saddr, __be16 sport,
+                                     const struct in6_addr *daddr, __be16 dport,
                                      int dif, struct udp_table *udptable)
 {
        struct sock *sk, *result;
@@ -320,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
                                 udptable);
 }
 
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+                            const struct in6_addr *daddr, __be16 dport, int dif)
+{
+       return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+
 /*
  *     This should be easy, if there is something there we
  *     return it, otherwise we block.
index 78b505d33bfb42cdf1033be323c2fdb1a359a833..8f014f22d132d17e7c79ce0aa727463e3dbd7941 100644 (file)
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
 
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
-       unsigned int i;
-
-       for (i = 0; i < n; i++)
-               nf_unregister_hook(&reg[i]);
+       while (n-- > 0)
+               nf_unregister_hook(&reg[n]);
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
index 46a77d5c3887dc7c874b327a77afb539c8d92777..a22dac227055e7499bfd9d8dd509ee5fba3e1c9b 100644 (file)
@@ -3,7 +3,7 @@
 #
 menuconfig IP_VS
        tristate "IP virtual server support"
-       depends on NET && INET && NETFILTER && NF_CONNTRACK
+       depends on NET && INET && NETFILTER
        ---help---
          IP Virtual Server support will let you build a high-performance
          virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
 
 config IP_VS_FTP
        tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP && NF_NAT
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+       select IP_VS_NFCT
        ---help---
          FTP is a protocol that transfers IP address and/or port number in
          the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,19 @@ config     IP_VS_FTP
          If you want to compile it in kernel, say Y. To compile it as a
          module, choose M here. If unsure, say N.
 
+config IP_VS_NFCT
+       bool "Netfilter connection tracking"
+       depends on NF_CONNTRACK
+       ---help---
+         The Netfilter connection tracking support allows the IPVS
+         connection state to be exported to the Netfilter framework
+         for filtering purposes.
+
+config IP_VS_PE_SIP
+       tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+       depends on NF_CONNTRACK_SIP
+       ---help---
+         Allow persistence based on the SIP Call-ID
+
 endif # IP_VS
index e3baefd7066e56d9ceaad6025c6de3e710a2e870..34ee602ddb667d806691111c4f6cede4227a6872 100644 (file)
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
 
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
 ip_vs-objs :=  ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o        \
                ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o                      \
-               ip_vs_est.o ip_vs_proto.o                                  \
-               $(ip_vs_proto-objs-y)
+               ip_vs_est.o ip_vs_proto.o ip_vs_pe.o                       \
+               $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 
 # IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
index e76f87f4aca80fdc4e9f0557c0b491dcdeff7556..a475edee0912e8fef842aa1632ecc2d19599f59d 100644 (file)
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
                goto out;
 
        list_add(&inc->a_list, &app->incs_list);
-       IP_VS_DBG(9, "%s application %s:%u registered\n",
-                 pp->name, inc->name, inc->port);
+       IP_VS_DBG(9, "%s App %s:%u registered\n",
+                 pp->name, inc->name, ntohs(inc->port));
 
        return 0;
 
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
                pp->unregister_app(inc);
 
        IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-                 pp->name, inc->name, inc->port);
+                 pp->name, inc->name, ntohs(inc->port));
 
        list_del(&inc->a_list);
 
index b71c69a2db138ac30aadb13e02a697322d54ac90..e9adecdc8ca4779468c494c1a2418049ca2384eb 100644 (file)
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
                & ip_vs_conn_tab_mask;
 }
 
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+                                            bool inverse)
+{
+       const union nf_inet_addr *addr;
+       __be16 port;
+
+       if (p->pe_data && p->pe->hashkey_raw)
+               return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+                       ip_vs_conn_tab_mask;
+
+       if (likely(!inverse)) {
+               addr = p->caddr;
+               port = p->cport;
+       } else {
+               addr = p->vaddr;
+               port = p->vport;
+       }
+
+       return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+       struct ip_vs_conn_param p;
+
+       ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+                             NULL, 0, &p);
+
+       if (cp->dest && cp->dest->svc->pe) {
+               p.pe = cp->dest->svc->pe;
+               p.pe_data = cp->pe_data;
+               p.pe_data_len = cp->pe_data_len;
+       }
+
+       return ip_vs_conn_hashkey_param(&p, false);
+}
 
 /*
  *     Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
                return 0;
 
        /* Hash by protocol, client address and port */
-       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+       hash = ip_vs_conn_hashkey_conn(cp);
 
        ct_write_lock(hash);
        spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
        int ret;
 
        /* unhash it and decrease its reference counter */
-       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+       hash = ip_vs_conn_hashkey_conn(cp);
 
        ct_write_lock(hash);
        spin_lock(&cp->lock);
@@ -218,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
  *  Called for pkts coming from OUTside-to-INside.
- *     s_addr, s_port: pkt source address (foreign host)
- *     d_addr, d_port: pkt dest address (load balancer)
+ *     p->caddr, p->cport: pkt source address (foreign host)
+ *     p->vaddr, p->vport: pkt dest address (load balancer)
  */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
        unsigned hash;
        struct ip_vs_conn *cp;
 
-       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+       hash = ip_vs_conn_hashkey_param(p, false);
 
        ct_read_lock(hash);
 
        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-                   ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-                   s_port == cp->cport && d_port == cp->vport &&
-                   ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-                   protocol == cp->protocol) {
+               if (cp->af == p->af &&
+                   ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+                   ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+                   p->cport == cp->cport && p->vport == cp->vport &&
+                   ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+                   p->protocol == cp->protocol) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ct_read_unlock(hash);
@@ -251,99 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
        return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
        struct ip_vs_conn *cp;
 
-       cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-       if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-               cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-                                        d_port);
+       cp = __ip_vs_conn_in_get(p);
+       if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+               struct ip_vs_conn_param cport_zero_p = *p;
+               cport_zero_p.cport = 0;
+               cp = __ip_vs_conn_in_get(&cport_zero_p);
+       }
 
        IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     ip_vs_proto_name(p->protocol),
+                     IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+                     IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      cp ? "hit" : "not hit");
 
        return cp;
 }
 
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+                           const struct ip_vs_iphdr *iph,
+                           unsigned int proto_off, int inverse,
+                           struct ip_vs_conn_param *p)
+{
+       __be16 _ports[2], *pptr;
+
+       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+       if (pptr == NULL)
+               return 1;
+
+       if (likely(!inverse))
+               ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+                                     &iph->daddr, pptr[1], p);
+       else
+               ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+                                     &iph->saddr, pptr[0], p);
+       return 0;
+}
+
 struct ip_vs_conn *
 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
                        struct ip_vs_protocol *pp,
                        const struct ip_vs_iphdr *iph,
                        unsigned int proto_off, int inverse)
 {
-       __be16 _ports[2], *pptr;
+       struct ip_vs_conn_param p;
 
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
+       if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
                return NULL;
 
-       if (likely(!inverse))
-               return ip_vs_conn_in_get(af, iph->protocol,
-                                        &iph->saddr, pptr[0],
-                                        &iph->daddr, pptr[1]);
-       else
-               return ip_vs_conn_in_get(af, iph->protocol,
-                                        &iph->daddr, pptr[1],
-                                        &iph->saddr, pptr[0]);
+       return ip_vs_conn_in_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
 
 /* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
        unsigned hash;
        struct ip_vs_conn *cp;
 
-       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+       hash = ip_vs_conn_hashkey_param(p, false);
 
        ct_read_lock(hash);
 
        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+               if (p->pe_data && p->pe->ct_match) {
+                       if (p->pe->ct_match(p, cp))
+                               goto out;
+                       continue;
+               }
+
+               if (cp->af == p->af &&
+                   ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
                    /* protocol should only be IPPROTO_IP if
-                    * d_addr is a fwmark */
-                   ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
-                                    d_addr, &cp->vaddr) &&
-                   s_port == cp->cport && d_port == cp->vport &&
+                    * p->vaddr is a fwmark */
+                   ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+                                    p->af, p->vaddr, &cp->vaddr) &&
+                   p->cport == cp->cport && p->vport == cp->vport &&
                    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-                   protocol == cp->protocol) {
-                       /* HIT */
-                       atomic_inc(&cp->refcnt);
+                   p->protocol == cp->protocol)
                        goto out;
-               }
        }
        cp = NULL;
 
   out:
+       if (cp)
+               atomic_inc(&cp->refcnt);
        ct_read_unlock(hash);
 
        IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     ip_vs_proto_name(p->protocol),
+                     IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+                     IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      cp ? "hit" : "not hit");
 
        return cp;
 }
 
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *     s_addr, s_port: pkt source address (inside host)
- *     d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ *     p->caddr, p->cport: pkt source address (inside host)
+ *     p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 {
        unsigned hash;
        struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
        /*
         *      Check for "full" addressed entries
         */
-       hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+       hash = ip_vs_conn_hashkey_param(p, true);
 
        ct_read_lock(hash);
 
        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-                   ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-                   d_port == cp->cport && s_port == cp->dport &&
-                   protocol == cp->protocol) {
+               if (cp->af == p->af &&
+                   ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+                   ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+                   p->vport == cp->cport && p->cport == cp->dport &&
+                   p->protocol == cp->protocol) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ret = cp;
@@ -371,9 +418,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
        ct_read_unlock(hash);
 
        IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     ip_vs_proto_name(p->protocol),
+                     IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+                     IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      ret ? "hit" : "not hit");
 
        return ret;
@@ -385,20 +432,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
                         const struct ip_vs_iphdr *iph,
                         unsigned int proto_off, int inverse)
 {
-       __be16 _ports[2], *pptr;
+       struct ip_vs_conn_param p;
 
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
+       if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
                return NULL;
 
-       if (likely(!inverse))
-               return ip_vs_conn_out_get(af, iph->protocol,
-                                         &iph->saddr, pptr[0],
-                                         &iph->daddr, pptr[1]);
-       else
-               return ip_vs_conn_out_get(af, iph->protocol,
-                                         &iph->daddr, pptr[1],
-                                         &iph->saddr, pptr[0]);
+       return ip_vs_conn_out_get(&p);
 }
 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
@@ -505,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 static inline void
 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 {
+       unsigned int conn_flags;
+
        /* if dest is NULL, then return directly */
        if (!dest)
                return;
@@ -512,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
        /* Increase the refcnt counter of the dest */
        atomic_inc(&dest->refcnt);
 
+       conn_flags = atomic_read(&dest->conn_flags);
+       if (cp->protocol != IPPROTO_UDP)
+               conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
        /* Bind with the destination and its corresponding transmitter */
-       if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-           (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+       if (cp->flags & IP_VS_CONN_F_SYNC) {
                /* if the connection is not template and is created
                 * by sync, preserve the activity flag.
                 */
-               cp->flags |= atomic_read(&dest->conn_flags) &
-                            (~IP_VS_CONN_F_INACTIVE);
-       else
-               cp->flags |= atomic_read(&dest->conn_flags);
+               if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+                       conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+               /* connections inherit forwarding method from dest */
+               cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+       }
+       cp->flags |= conn_flags;
        cp->dest = dest;
 
        IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -717,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
                if (cp->control)
                        ip_vs_control_del(cp);
 
+               if (cp->flags & IP_VS_CONN_F_NFCT)
+                       ip_vs_conn_drop_conntrack(cp);
+
+               kfree(cp->pe_data);
                if (unlikely(cp->app != NULL))
                        ip_vs_unbind_app(cp);
                ip_vs_unbind_dest(cp);
@@ -751,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
  *     Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-              const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
               const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
               struct ip_vs_dest *dest)
 {
        struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+       struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
 
        cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
        if (cp == NULL) {
@@ -767,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
        INIT_LIST_HEAD(&cp->c_list);
        setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-       cp->af             = af;
-       cp->protocol       = proto;
-       ip_vs_addr_copy(af, &cp->caddr, caddr);
-       cp->cport          = cport;
-       ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-       cp->vport          = vport;
+       cp->af             = p->af;
+       cp->protocol       = p->protocol;
+       ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+       cp->cport          = p->cport;
+       ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+       cp->vport          = p->vport;
        /* proto should only be IPPROTO_IP if d_addr is a fwmark */
-       ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+       ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
                        &cp->daddr, daddr);
        cp->dport          = dport;
        cp->flags          = flags;
+       if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+               cp->pe_data = p->pe_data;
+               cp->pe_data_len = p->pe_data_len;
+       }
        spin_lock_init(&cp->lock);
 
        /*
@@ -803,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
        /* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
+       if (p->af == AF_INET6)
                ip_vs_bind_xmit_v6(cp);
        else
 #endif
@@ -812,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
        if (unlikely(pp && atomic_read(&pp->appcnt)))
                ip_vs_bind_app(cp, pp);
 
+       /*
+        * Allow conntrack to be preserved. By default, conntrack
+        * is created and destroyed for every packet.
+        * Sometimes keeping conntrack can be useful for
+        * IP_VS_CONN_F_ONE_PACKET too.
+        */
+
+       if (ip_vs_conntrack_enabled())
+               cp->flags |= IP_VS_CONN_F_NFCT;
+
        /* Hash it in the ip_vs_conn_tab finally */
        ip_vs_conn_hash(cp);
 
        return cp;
 }
 
-
 /*
  *     /proc/net/ip_vs_conn entries
  */
@@ -834,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
                list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
                        if (pos-- == 0) {
                                seq->private = &ip_vs_conn_tab[idx];
-                               return cp;
+                       return cp;
                        }
                }
                ct_read_unlock_bh(idx);
@@ -891,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 
        if (v == SEQ_START_TOKEN)
                seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
        else {
                const struct ip_vs_conn *cp = v;
+               char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+               size_t len = 0;
+
+               if (cp->dest && cp->pe_data &&
+                   cp->dest->svc->pe->show_pe_data) {
+                       pe_data[0] = ' ';
+                       len = strlen(cp->dest->svc->pe->name);
+                       memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+                       pe_data[len + 1] = ' ';
+                       len += 2;
+                       len += cp->dest->svc->pe->show_pe_data(cp,
+                                                              pe_data + len);
+               }
+               pe_data[len] = '\0';
 
 #ifdef CONFIG_IP_VS_IPV6
                if (cp->af == AF_INET6)
-                       seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+                       seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+                               "%pI6 %04X %-11s %7lu%s\n",
                                ip_vs_proto_name(cp->protocol),
                                &cp->caddr.in6, ntohs(cp->cport),
                                &cp->vaddr.in6, ntohs(cp->vport),
                                &cp->daddr.in6, ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
-                               (cp->timer.expires-jiffies)/HZ);
+                               (cp->timer.expires-jiffies)/HZ, pe_data);
                else
 #endif
                        seq_printf(seq,
                                "%-3s %08X %04X %08X %04X"
-                               " %08X %04X %-11s %7lu\n",
+                               " %08X %04X %-11s %7lu%s\n",
                                ip_vs_proto_name(cp->protocol),
                                ntohl(cp->caddr.ip), ntohs(cp->cport),
                                ntohl(cp->vaddr.ip), ntohs(cp->vport),
                                ntohl(cp->daddr.ip), ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
-                               (cp->timer.expires-jiffies)/HZ);
+                               (cp->timer.expires-jiffies)/HZ, pe_data);
        }
        return 0;
 }
index 0c043b6ce65e1b3148b249d4560f2e0174798b65..b4e51e9c5a04ad4e1314a338529cc0284407feb3 100644 (file)
@@ -48,6 +48,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
 #endif
 
 #include <net/ip_vs.h>
@@ -176,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
        return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+                             struct sk_buff *skb, int protocol,
+                             const union nf_inet_addr *caddr, __be16 cport,
+                             const union nf_inet_addr *vaddr, __be16 vport,
+                             struct ip_vs_conn_param *p)
+{
+       ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+       p->pe = svc->pe;
+       if (p->pe && p->pe->fill_param)
+               p->pe->fill_param(p, skb);
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -186,15 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-                   const struct sk_buff *skb,
+                   struct sk_buff *skb,
                    __be16 ports[2])
 {
        struct ip_vs_conn *cp = NULL;
        struct ip_vs_iphdr iph;
        struct ip_vs_dest *dest;
        struct ip_vs_conn *ct;
-       __be16  dport;                  /* destination port to forward */
-       __be16  flags;
+       __be16 dport = 0;               /* destination port to forward */
+       unsigned int flags;
+       struct ip_vs_conn_param param;
        union nf_inet_addr snet;        /* source network of the client,
                                           after masking */
 
@@ -227,120 +241,75 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
         * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
         * is created for other persistent services.
         */
-       if (ports[1] == svc->port) {
-               /* Check if a template already exists */
-               if (svc->port != FTPPORT)
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, ports[1]);
-               else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * No template found or the dest of the connection
-                        * template is not available.
-                        */
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
-                       }
-
-                       /*
-                        * Create a template like <protocol,caddr,0,
-                        * vaddr,vport,daddr,dport> for non-ftp service,
-                        * and <protocol,caddr,0,vaddr,0,daddr,0>
-                        * for ftp service.
+       {
+               int protocol = iph.protocol;
+               const union nf_inet_addr *vaddr = &iph.daddr;
+               const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+               __be16 vport = 0;
+
+               if (ports[1] == svc->port) {
+                       /* non-FTP template:
+                        * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+                        * FTP template:
+                        * <protocol, caddr, 0, vaddr, 0, daddr, 0>
                         */
                        if (svc->port != FTPPORT)
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr,
-                                                   ports[1],
-                                                   &dest->addr, dest->port,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
+                               vport = ports[1];
                } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
-               }
-               dport = dest->port;
-       } else {
-               /*
-                * Note: persistent fwmark-based services and persistent
-                * port zero service are handled here.
-                * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-                * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-                */
-               if (svc->fwmark) {
-                       union nf_inet_addr fwmark = {
-                               .ip = htonl(svc->fwmark)
-                       };
-
-                       ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-                                            &fwmark, 0);
-               } else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * If it is not persistent port zero, return NULL,
-                        * otherwise create a connection template.
+                       /* Note: persistent fwmark-based services and
+                        * persistent port zero service are handled here.
+                        * fwmark template:
+                        * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+                        * port zero template:
+                        * <protocol,caddr,0,vaddr,0,daddr,0>
                         */
-                       if (svc->port)
-                               return NULL;
-
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
+                       if (svc->fwmark) {
+                               protocol = IPPROTO_IP;
+                               vaddr = &fwmark;
                        }
+               }
+               ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+                                             vaddr, vport, &param);
+       }
 
-                       /*
-                        * Create a template according to the service
-                        */
-                       if (svc->fwmark) {
-                               union nf_inet_addr fwmark = {
-                                       .ip = htonl(svc->fwmark)
-                               };
-
-                               ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-                                                   &snet, 0,
-                                                   &fwmark, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       } else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
-               } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
+       /* Check if a template already exists */
+       ct = ip_vs_ct_in_get(&param);
+       if (!ct || !ip_vs_check_template(ct)) {
+               /* No template found or the dest of the connection
+                * template is not available.
+                */
+               dest = svc->scheduler->schedule(svc, skb);
+               if (!dest) {
+                       IP_VS_DBG(1, "p-schedule: no dest found.\n");
+                       kfree(param.pe_data);
+                       return NULL;
                }
-               dport = ports[1];
+
+               if (ports[1] == svc->port && svc->port != FTPPORT)
+                       dport = dest->port;
+
+               /* Create a template
+                * This adds param.pe_data to the template,
+                * and thus param.pe_data will be destroyed
+                * when the template expires */
+               ct = ip_vs_conn_new(&param, &dest->addr, dport,
+                                   IP_VS_CONN_F_TEMPLATE, dest);
+               if (ct == NULL) {
+                       kfree(param.pe_data);
+                       return NULL;
+               }
+
+               ct->timeout = svc->timeout;
+       } else {
+               /* set destination with the found template */
+               dest = ct->dest;
+               kfree(param.pe_data);
        }
 
+       dport = ports[1];
+       if (dport == svc->port && dest->port)
+               dport = dest->port;
+
        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
                 && iph.protocol == IPPROTO_UDP)?
                IP_VS_CONN_F_ONE_PACKET : 0;
@@ -348,12 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
        /*
         *    Create a new connection according to the template
         */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, ports[0],
-                           &iph.daddr, ports[1],
-                           &dest->addr, dport,
-                           flags,
-                           dest);
+       ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+                             &iph.daddr, ports[1], &param);
+       cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
        if (cp == NULL) {
                ip_vs_conn_put(ct);
                return NULL;
@@ -377,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+              struct ip_vs_protocol *pp, int *ignored)
 {
        struct ip_vs_conn *cp = NULL;
        struct ip_vs_iphdr iph;
        struct ip_vs_dest *dest;
-       __be16 _ports[2], *pptr, flags;
+       __be16 _ports[2], *pptr;
+       unsigned int flags;
 
+       *ignored = 1;
        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
        if (pptr == NULL)
                return NULL;
 
+       /*
+        * FTPDATA needs this check when using local real server.
+        * Never schedule Active FTPDATA connections from real server.
+        * For LVS-NAT they must be already created. For other methods
+        * with persistence the connection is created on SYN+ACK.
+        */
+       if (pptr[0] == FTPDATA) {
+               IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+                             "Not scheduling FTPDATA");
+               return NULL;
+       }
+
+       /*
+        * Do not schedule replies from local real server. It is risky
+        * for fwmark services but mostly for persistent services.
+        */
+       if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+           (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+           (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+               IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+                             "Not scheduling reply for existing connection");
+               __ip_vs_conn_put(cp);
+               return NULL;
+       }
+
        /*
         *    Persistent service
         */
-       if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+       if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+               *ignored = 0;
                return ip_vs_sched_persist(svc, skb, pptr);
+       }
 
        /*
         *    Non-persistent service
@@ -406,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
                return NULL;
        }
 
+       *ignored = 0;
+
        dest = svc->scheduler->schedule(svc, skb);
        if (dest == NULL) {
                IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -419,14 +417,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
        /*
         *    Create a connection entry.
         */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, pptr[0],
-                           &iph.daddr, pptr[1],
-                           &dest->addr, dest->port ? dest->port : pptr[1],
-                           flags,
-                           dest);
-       if (cp == NULL)
-               return NULL;
+       {
+               struct ip_vs_conn_param p;
+               ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+                                     pptr[0], &iph.daddr, pptr[1], &p);
+               cp = ip_vs_conn_new(&p, &dest->addr,
+                                   dest->port ? dest->port : pptr[1],
+                                   flags, dest);
+               if (!cp)
+                       return NULL;
+       }
 
        IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -473,23 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
        if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
                int ret, cs;
                struct ip_vs_conn *cp;
-               __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
-                               iph.protocol == IPPROTO_UDP)?
-                               IP_VS_CONN_F_ONE_PACKET : 0;
+               unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+                                     iph.protocol == IPPROTO_UDP)?
+                                     IP_VS_CONN_F_ONE_PACKET : 0;
                union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
                ip_vs_service_put(svc);
 
                /* create a new connection entry */
                IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-               cp = ip_vs_conn_new(svc->af, iph.protocol,
-                                   &iph.saddr, pptr[0],
-                                   &iph.daddr, pptr[1],
-                                   &daddr, 0,
-                                   IP_VS_CONN_F_BYPASS | flags,
-                                   NULL);
-               if (cp == NULL)
-                       return NF_DROP;
+               {
+                       struct ip_vs_conn_param p;
+                       ip_vs_conn_fill_param(svc->af, iph.protocol,
+                                             &iph.saddr, pptr[0],
+                                             &iph.daddr, pptr[1], &p);
+                       cp = ip_vs_conn_new(&p, &daddr, 0,
+                                           IP_VS_CONN_F_BYPASS | flags,
+                                           NULL);
+                       if (!cp)
+                               return NF_DROP;
+               }
 
                /* statistics */
                ip_vs_in_stats(cp, skb);
@@ -527,9 +530,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
         */
 #ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
+       if (svc->af == AF_INET6) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-       else
+       else
 #endif
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
@@ -541,6 +549,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
        return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 }
 
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+       if (NF_INET_LOCAL_IN == hooknum)
+               return IP_DEFRAG_VS_IN;
+       if (NF_INET_FORWARD == hooknum)
+               return IP_DEFRAG_VS_FWD;
+       return IP_DEFRAG_VS_OUT;
+}
+
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
        int err = ip_defrag(skb, user);
@@ -601,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
        skb->ip_summed = CHECKSUM_UNNECESSARY;
 
        if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+               IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered outgoing ICMP");
        else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+               IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered incoming ICMP");
 }
 
@@ -646,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
        skb->ip_summed = CHECKSUM_PARTIAL;
 
        if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered outgoing ICMPv6");
+               IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+                             (void *)ciph - (void *)iph,
+                             "Forwarding altered outgoing ICMPv6");
        else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered incoming ICMPv6");
+               IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+                             (void *)ciph - (void *)iph,
+                             "Forwarding altered incoming ICMPv6");
 }
 #endif
 
@@ -691,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
                ip_vs_nat_icmp(skb, pp, cp, 1);
 
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+                       goto out;
+       } else
+#endif
+               if ((sysctl_ip_vs_snat_reroute ||
+                    skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+                   ip_route_me_harder(skb, RTN_LOCAL) != 0)
+                       goto out;
+
        /* do the statistics and put it back */
        ip_vs_out_stats(cp, skb);
 
        skb->ipvs_property = 1;
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               ip_vs_notrack(skb);
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
        verdict = NF_ACCEPT;
 
 out:
@@ -708,7 +742,8 @@ out:
  *     Find any that might be relevant, check against existing connections.
  *     Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+                         unsigned int hooknum)
 {
        struct iphdr *iph;
        struct icmphdr  _icmph, *ic;
@@ -723,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
        /* reassemble IP fragments */
        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+               if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -766,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
                     pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+       IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+                     "Checking outgoing ICMP for");
 
        offset += cih->ihl * 4;
 
@@ -782,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+                            unsigned int hooknum)
 {
        struct ipv6hdr *iph;
        struct icmp6hdr _icmph, *ic;
@@ -798,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 
        /* reassemble IP fragments */
        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -841,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+       IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+                     "Checking outgoing ICMPv6 for");
 
        offset += sizeof(struct ipv6hdr);
 
@@ -889,7 +927,7 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                struct ip_vs_conn *cp, int ihl)
 {
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+       IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
        if (!skb_make_writable(skb, ihl))
                goto drop;
@@ -908,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                ip_send_check(ip_hdr(skb));
        }
 
+       /*
+        * nf_iterate does not expect change in the skb->dst->dev.
+        * It looks like it is not fatal to enable this code for hooks
+        * where our handlers are at the end of the chain list and
+        * when all next handlers use skb->dst->dev and not outdev.
+        * It will definitely route properly the inout NAT traffic
+        * when multiple paths are used.
+        */
+
        /* For policy routing, packets originating from this
         * machine itself may be routed differently to packets
         * passing through.  We want this packet to be routed as
@@ -916,21 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
         */
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
-               if (ip6_route_me_harder(skb) != 0)
+               if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
                        goto drop;
        } else
 #endif
-               if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+               if ((sysctl_ip_vs_snat_reroute ||
+                    skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+                   ip_route_me_harder(skb, RTN_LOCAL) != 0)
                        goto drop;
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+       IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
        ip_vs_out_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-       ip_vs_update_conntrack(skb, cp, 0);
-       ip_vs_conn_put(cp);
-
        skb->ipvs_property = 1;
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               ip_vs_notrack(skb);
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
+       ip_vs_conn_put(cp);
 
        LeaveFunction(11);
        return NF_ACCEPT;
@@ -938,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 drop:
        ip_vs_conn_put(cp);
        kfree_skb(skb);
+       LeaveFunction(11);
        return NF_STOLEN;
 }
 
 /*
- *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
  *     Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-         const struct net_device *in, const struct net_device *out,
-         int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
-       int af;
 
        EnterFunction(11);
 
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+       /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;
 
+       /* Bad... Do not break raw sockets */
+       if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+                    af == AF_INET)) {
+               struct sock *sk = skb->sk;
+               struct inet_sock *inet = inet_sk(skb->sk);
+
+               if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+                       return NF_ACCEPT;
+       }
+
+       if (unlikely(!skb_dst(skb)))
+               return NF_ACCEPT;
+
        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+                       int related;
+                       int verdict = ip_vs_out_icmp_v6(skb, &related,
+                                                       hooknum);
 
                        if (related)
                                return verdict;
@@ -975,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
        } else
 #endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-                       int related, verdict = ip_vs_out_icmp(skb, &related);
+                       int related;
+                       int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
                        if (related)
                                return verdict;
@@ -989,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
        /* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
-               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-                       if (related)
-                               return verdict;
-
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+                       if (ip_vs_gather_frags_v6(skb,
+                                                 ip_vs_defrag_user(hooknum)))
+                               return NF_STOLEN;
                }
+
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
        } else
 #endif
                if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
                             !pp->dont_defrag)) {
-                       if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+                       if (ip_vs_gather_frags(skb,
+                                              ip_vs_defrag_user(hooknum)))
                                return NF_STOLEN;
 
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1012,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
         */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-       if (unlikely(!cp)) {
-               if (sysctl_ip_vs_nat_icmp_send &&
-                   (pp->protocol == IPPROTO_TCP ||
-                    pp->protocol == IPPROTO_UDP ||
-                    pp->protocol == IPPROTO_SCTP)) {
-                       __be16 _ports[2], *pptr;
-
-                       pptr = skb_header_pointer(skb, iph.len,
-                                                 sizeof(_ports), _ports);
-                       if (pptr == NULL)
-                               return NF_ACCEPT;       /* Not for me */
-                       if (ip_vs_lookup_real_service(af, iph.protocol,
-                                                     &iph.saddr,
-                                                     pptr[0])) {
-                               /*
-                                * Notify the real server: there is no
-                                * existing entry if it is not RST
-                                * packet or not TCP packet.
-                                */
-                               if ((iph.protocol != IPPROTO_TCP &&
-                                    iph.protocol != IPPROTO_SCTP)
-                                    || ((iph.protocol == IPPROTO_TCP
-                                         && !is_tcp_reset(skb, iph.len))
-                                        || (iph.protocol == IPPROTO_SCTP
-                                               && !is_sctp_abort(skb,
-                                                       iph.len)))) {
+       if (likely(cp))
+               return handle_response(af, skb, pp, cp, iph.len);
+       if (sysctl_ip_vs_nat_icmp_send &&
+           (pp->protocol == IPPROTO_TCP ||
+            pp->protocol == IPPROTO_UDP ||
+            pp->protocol == IPPROTO_SCTP)) {
+               __be16 _ports[2], *pptr;
+
+               pptr = skb_header_pointer(skb, iph.len,
+                                         sizeof(_ports), _ports);
+               if (pptr == NULL)
+                       return NF_ACCEPT;       /* Not for me */
+               if (ip_vs_lookup_real_service(af, iph.protocol,
+                                             &iph.saddr,
+                                             pptr[0])) {
+                       /*
+                        * Notify the real server: there is no
+                        * existing entry if it is not RST
+                        * packet or not TCP packet.
+                        */
+                       if ((iph.protocol != IPPROTO_TCP &&
+                            iph.protocol != IPPROTO_SCTP)
+                            || ((iph.protocol == IPPROTO_TCP
+                                 && !is_tcp_reset(skb, iph.len))
+                                || (iph.protocol == IPPROTO_SCTP
+                                       && !is_sctp_abort(skb,
+                                               iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-                                       if (af == AF_INET6)
-                                               icmpv6_send(skb,
-                                                           ICMPV6_DEST_UNREACH,
-                                                           ICMPV6_PORT_UNREACH,
-                                                           0);
-                                       else
+                               if (af == AF_INET6) {
+                                       struct net *net =
+                                               dev_net(skb_dst(skb)->dev);
+
+                                       if (!skb->dev)
+                                               skb->dev = net->loopback_dev;
+                                       icmpv6_send(skb,
+                                                   ICMPV6_DEST_UNREACH,
+                                                   ICMPV6_PORT_UNREACH,
+                                                   0);
+                               } else
 #endif
-                                               icmp_send(skb,
-                                                         ICMP_DEST_UNREACH,
-                                                         ICMP_PORT_UNREACH, 0);
-                                       return NF_DROP;
-                               }
+                                       icmp_send(skb,
+                                                 ICMP_DEST_UNREACH,
+                                                 ICMP_PORT_UNREACH, 0);
+                               return NF_DROP;
                        }
                }
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
-               return NF_ACCEPT;
        }
+       IP_VS_DBG_PKT(12, af, pp, skb, 0,
+                     "ip_vs_out: packet continues traversal as normal");
+       return NF_ACCEPT;
+}
 
-       return handle_response(af, skb, pp, cp, iph.len);
+/*
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+            const struct net_device *in, const struct net_device *out,
+            int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_out(hooknum, skb, AF_INET);
 }
 
+/*
+ *     It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+                  const struct net_device *in, const struct net_device *out,
+                  int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_out(hooknum, skb, AF_INET);
+       local_bh_enable();
+       return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+            const struct net_device *in, const struct net_device *out,
+            int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ *     It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *     Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+                  const struct net_device *in, const struct net_device *out,
+                  int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_out(hooknum, skb, AF_INET6);
+       local_bh_enable();
+       return verdict;
+}
+
+#endif
 
 /*
  *     Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1084,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
        /* reassemble IP fragments */
        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-                                           IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+               if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -1128,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
                     pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+       IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+                     "Checking incoming ICMP for");
 
        offset += cih->ihl * 4;
 
@@ -1162,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
        if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-       /* do not touch skb anymore */
+       /* LOCALNODE from FORWARD hook is not supported */
+       if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+           skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+               IP_VS_DBG(1, "%s(): "
+                         "local delivery to %pI4 but in FORWARD\n",
+                         __func__, &skb_rtable(skb)->rt_dst);
+               verdict = NF_DROP;
+       }
 
   out:
        __ip_vs_conn_put(cp);
@@ -1183,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
        struct ip_vs_protocol *pp;
        unsigned int offset, verdict;
        union nf_inet_addr snet;
+       struct rt6_info *rt;
 
        *related = 1;
 
        /* reassemble IP fragments */
        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-                                              IP_DEFRAG_VS_IN :
-                                              IP_DEFRAG_VS_FWD))
+               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }
 
@@ -1233,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
                return NF_ACCEPT;
 
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+       IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+                     "Checking incoming ICMPv6 for");
 
        offset += sizeof(struct ipv6hdr);
 
@@ -1261,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
            IPPROTO_SCTP == cih->nexthdr)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-       /* do not touch skb anymore */
+       /* LOCALNODE from FORWARD hook is not supported */
+       if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+           (rt = (struct rt6_info *) skb_dst(skb)) &&
+           rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+               IP_VS_DBG(1, "%s(): "
+                         "local delivery to %pI6 but in FORWARD\n",
+                         __func__, &rt->rt6i_dst);
+               verdict = NF_DROP;
+       }
 
        __ip_vs_conn_put(cp);
 
@@ -1275,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *     and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-        const struct net_device *in, const struct net_device *out,
-        int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
-       int ret, restart, af, pkts;
+       int ret, restart, pkts;
 
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       /* Already marked as IPVS request or reply? */
+       if (skb->ipvs_property)
+               return NF_ACCEPT;
 
        /*
-        *      Big tappo: only PACKET_HOST, including loopback for local client
-        *      Don't handle local packets on IPv6 for now
+        *      Big tappo:
+        *      - remote client: only PACKET_HOST
+        *      - route: used for struct net when skb->dev is unset
         */
-       if (unlikely(skb->pkt_type != PACKET_HOST)) {
-               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-                             skb->pkt_type,
-                             iph.protocol,
-                             IP_VS_DBG_ADDR(af, &iph.daddr));
+       if (unlikely((skb->pkt_type != PACKET_HOST &&
+                     hooknum != NF_INET_LOCAL_OUT) ||
+                    !skb_dst(skb))) {
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+                             " ignored in hook %u\n",
+                             skb->pkt_type, iph.protocol,
+                             IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
                return NF_ACCEPT;
        }
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       /* Bad... Do not break raw sockets */
+       if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+                    af == AF_INET)) {
+               struct sock *sk = skb->sk;
+               struct inet_sock *inet = inet_sk(skb->sk);
+
+               if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+                       return NF_ACCEPT;
+       }
 
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+                       int related;
+                       int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
                        if (related)
                                return verdict;
@@ -1312,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        } else
 #endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-                       int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+                       int related;
+                       int verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
                        if (related)
                                return verdict;
@@ -1332,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        if (unlikely(!cp)) {
                int v;
 
-               /* For local client packets, it could be a response */
-               cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-               if (cp)
-                       return handle_response(af, skb, pp, cp, iph.len);
-
                if (!pp->conn_schedule(af, skb, pp, &v, &cp))
                        return v;
        }
 
        if (unlikely(!cp)) {
                /* sorry, all this trouble for a no-hit :) */
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
+               IP_VS_DBG_PKT(12, af, pp, skb, 0,
+                             "ip_vs_in: packet continues traversal as normal");
                return NF_ACCEPT;
        }
 
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+       IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
 
        /* Check the server status */
        if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1415,6 +1571,72 @@ out:
        return ret;
 }
 
+/*
+ *     AF_INET handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *     AF_INET handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET);
+       local_bh_enable();
+       return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET6);
+       local_bh_enable();
+       return verdict;
+}
+
+#endif
+
 
 /*
  *     It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1455,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
-       /* After packet filtering, change source only for VS/NAT */
+       /* Before ip_vs_in, change source only for VS/NAT */
        {
-               .hook           = ip_vs_out,
+               .hook           = ip_vs_local_reply4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -99,
+       },
+       /* After mangle, schedule and forward local requests */
+       {
+               .hook           = ip_vs_local_request4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1479,27 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hook           = ip_vs_forward_icmp,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
+       },
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
        },
 #ifdef CONFIG_IP_VS_IPV6
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
-       /* After packet filtering, change source only for VS/NAT */
+       /* Before ip_vs_in, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_local_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -99,
+       },
+       /* After mangle, schedule and forward local requests */
        {
-               .hook           = ip_vs_out,
+               .hook           = ip_vs_local_request6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1507,8 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hook           = ip_vs_forward_icmp_v6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
+       },
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
        },
 #endif
 };
index ca8ec8c4f3113678c51ae1961795bc936a590a64..5f5daa30b0afe541d00c1577850ce565c31fb13b 100644 (file)
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
 int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
+int sysctl_ip_vs_snat_reroute = 1;
 
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
  *     Get service by {proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
                    __be16 vport)
 {
        unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
                    && (svc->port == vport)
                    && (svc->protocol == protocol)) {
                        /* HIT */
-                       atomic_inc(&svc->usecnt);
                        return svc;
                }
        }
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
  *     Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
 {
        unsigned hash;
        struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
        list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
                if (svc->fwmark == fwmark && svc->af == af) {
                        /* HIT */
-                       atomic_inc(&svc->usecnt);
                        return svc;
                }
        }
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
        /*
         *      Check the table hashed by fwmark first
         */
-       if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+       if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
                goto out;
 
        /*
         *      Check the table hashed by <protocol,addr,port>
         *      for "full" addressed entries
         */
-       svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+       svc = __ip_vs_service_find(af, protocol, vaddr, vport);
 
        if (svc == NULL
            && protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
                 * Check if ftp service entry exists, the packet
                 * might belong to FTP data connections.
                 */
-               svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+               svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
        }
 
        if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
                /*
                 * Check if the catch-all port (port zero) exists
                 */
-               svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+               svc = __ip_vs_service_find(af, protocol, vaddr, 0);
        }
 
   out:
+       if (svc)
+               atomic_inc(&svc->usecnt);
        read_unlock(&__ip_vs_svc_lock);
 
        IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
        dest->svc = svc;
 }
 
-static inline void
+static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
        struct ip_vs_service *svc = dest->svc;
 
        dest->svc = NULL;
-       if (atomic_dec_and_test(&svc->refcnt))
+       if (atomic_dec_and_test(&svc->refcnt)) {
+               IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+                             svc->fwmark,
+                             IP_VS_DBG_ADDR(svc->af, &svc->addr),
+                             ntohs(svc->port), atomic_read(&svc->usecnt));
                kfree(svc);
+       }
 }
 
 
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
  *     Update a destination in the given service
  */
 static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-                   struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+                   struct ip_vs_dest_user_kern *udest, int add)
 {
        int conn_flags;
 
        /* set the weight and the flags */
        atomic_set(&dest->weight, udest->weight);
-       conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
-       /* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6) {
-               if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
-       } else
-#endif
-               if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
+       conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+       conn_flags |= IP_VS_CONN_F_INACTIVE;
 
        /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-       if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+       if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
                conn_flags |= IP_VS_CONN_F_NOOUTPUT;
        } else {
                /*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
                dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
        dest->u_threshold = udest->u_threshold;
        dest->l_threshold = udest->l_threshold;
+
+       spin_lock(&dest->dst_lock);
+       ip_vs_dst_reset(dest);
+       spin_unlock(&dest->dst_lock);
+
+       if (add)
+               ip_vs_new_estimator(&dest->stats);
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /* Wait until all other svc users go away */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+       if (add) {
+               list_add(&dest->n_list, &svc->destinations);
+               svc->num_dests++;
+       }
+
+       /* call the update_service, because server weight may be changed */
+       if (svc->scheduler->update_service)
+               svc->scheduler->update_service(svc);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
        atomic_set(&dest->activeconns, 0);
        atomic_set(&dest->inactconns, 0);
        atomic_set(&dest->persistconns, 0);
-       atomic_set(&dest->refcnt, 0);
+       atomic_set(&dest->refcnt, 1);
 
        INIT_LIST_HEAD(&dest->d_list);
        spin_lock_init(&dest->dst_lock);
        spin_lock_init(&dest->stats.lock);
-       __ip_vs_update_dest(svc, dest, udest);
-       ip_vs_new_estimator(&dest->stats);
+       __ip_vs_update_dest(svc, dest, udest, 1);
 
        *dest_p = dest;
 
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
                              IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
                              ntohs(dest->vport));
 
-               __ip_vs_update_dest(svc, dest, udest);
-
                /*
                 * Get the destination from the trash
                 */
                list_del(&dest->n_list);
 
-               ip_vs_new_estimator(&dest->stats);
-
-               write_lock_bh(&__ip_vs_svc_lock);
-
+               __ip_vs_update_dest(svc, dest, udest, 1);
+               ret = 0;
+       } else {
                /*
-                * Wait until all other svc users go away.
+                * Allocate and initialize the dest structure
                 */
-               IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-               list_add(&dest->n_list, &svc->destinations);
-               svc->num_dests++;
-
-               /* call the update_service function of its scheduler */
-               if (svc->scheduler->update_service)
-                       svc->scheduler->update_service(svc);
-
-               write_unlock_bh(&__ip_vs_svc_lock);
-               return 0;
-       }
-
-       /*
-        * Allocate and initialize the dest structure
-        */
-       ret = ip_vs_new_dest(svc, udest, &dest);
-       if (ret) {
-               return ret;
+               ret = ip_vs_new_dest(svc, udest, &dest);
        }
-
-       /*
-        * Add the dest entry into the list
-        */
-       atomic_inc(&dest->refcnt);
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /*
-        * Wait until all other svc users go away.
-        */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       list_add(&dest->n_list, &svc->destinations);
-       svc->num_dests++;
-
-       /* call the update_service function of its scheduler */
-       if (svc->scheduler->update_service)
-               svc->scheduler->update_service(svc);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
        LeaveFunction(2);
 
-       return 0;
+       return ret;
 }
 
 
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
                return -ENOENT;
        }
 
-       __ip_vs_update_dest(svc, dest, udest);
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /* Wait until all other svc users go away */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       /* call the update_service, because server weight may be changed */
-       if (svc->scheduler->update_service)
-               svc->scheduler->update_service(svc);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
+       __ip_vs_update_dest(svc, dest, udest, 0);
        LeaveFunction(2);
 
        return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
         *  the destination into the trash.
         */
        if (atomic_dec_and_test(&dest->refcnt)) {
+               IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+                             dest->vfwmark,
+                             IP_VS_DBG_ADDR(dest->af, &dest->addr),
+                             ntohs(dest->port));
                ip_vs_dst_reset(dest);
                /* simply decrease svc->refcnt here, let the caller check
                   and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
        /*
         *      Wait until all other svc users go away.
         */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
        /*
         *      Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 {
        int ret = 0;
        struct ip_vs_scheduler *sched = NULL;
+       struct ip_vs_pe *pe = NULL;
        struct ip_vs_service *svc = NULL;
 
        /* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
        if (sched == NULL) {
                pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
                ret = -ENOENT;
-               goto out_mod_dec;
+               goto out_err;
+       }
+
+       if (u->pe_name && *u->pe_name) {
+               pe = ip_vs_pe_get(u->pe_name);
+               if (pe == NULL) {
+                       pr_info("persistence engine module ip_vs_pe_%s "
+                               "not found\n", u->pe_name);
+                       ret = -ENOENT;
+                       goto out_err;
+               }
        }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
        }
 
        /* I'm the first user of the service */
-       atomic_set(&svc->usecnt, 1);
+       atomic_set(&svc->usecnt, 0);
        atomic_set(&svc->refcnt, 0);
 
        svc->af = u->af;
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
                goto out_err;
        sched = NULL;
 
+       /* Bind the ct retriever */
+       ip_vs_bind_pe(svc, pe);
+       pe = NULL;
+
        /* Update the virtual service counters */
        if (svc->port == FTPPORT)
                atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
        *svc_p = svc;
        return 0;
 
-  out_err:
+ out_err:
        if (svc != NULL) {
-               if (svc->scheduler)
-                       ip_vs_unbind_scheduler(svc);
+               ip_vs_unbind_scheduler(svc);
                if (svc->inc) {
                        local_bh_disable();
                        ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
                kfree(svc);
        }
        ip_vs_scheduler_put(sched);
+       ip_vs_pe_put(pe);
 
-  out_mod_dec:
        /* decrease the module use count */
        ip_vs_use_count_dec();
 
@@ -1255,6 +1236,7 @@ static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
        struct ip_vs_scheduler *sched, *old_sched;
+       struct ip_vs_pe *pe = NULL, *old_pe = NULL;
        int ret = 0;
 
        /*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
        }
        old_sched = sched;
 
+       if (u->pe_name && *u->pe_name) {
+               pe = ip_vs_pe_get(u->pe_name);
+               if (pe == NULL) {
+                       pr_info("persistence engine module ip_vs_pe_%s "
+                               "not found\n", u->pe_name);
+                       ret = -ENOENT;
+                       goto out;
+               }
+               old_pe = pe;
+       }
+
 #ifdef CONFIG_IP_VS_IPV6
        if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
                ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
        /*
         * Wait until all other svc users go away.
         */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
        /*
         * Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
                }
        }
 
+       old_pe = svc->pe;
+       if (pe != old_pe) {
+               ip_vs_unbind_pe(svc);
+               ip_vs_bind_pe(svc, pe);
+       }
+
   out_unlock:
        write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
   out:
-#endif
-
-       if (old_sched)
-               ip_vs_scheduler_put(old_sched);
-
+       ip_vs_scheduler_put(old_sched);
+       ip_vs_pe_put(old_pe);
        return ret;
 }
 
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 {
        struct ip_vs_dest *dest, *nxt;
        struct ip_vs_scheduler *old_sched;
+       struct ip_vs_pe *old_pe;
+
+       pr_info("%s: enter\n", __func__);
 
        /* Count only IPv4 services for old get/setsockopt interface */
        if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
        /* Unbind scheduler */
        old_sched = svc->scheduler;
        ip_vs_unbind_scheduler(svc);
-       if (old_sched)
-               ip_vs_scheduler_put(old_sched);
+       ip_vs_scheduler_put(old_sched);
+
+       /* Unbind persistence engine */
+       old_pe = svc->pe;
+       ip_vs_unbind_pe(svc);
+       ip_vs_pe_put(old_pe);
 
        /* Unbind app inc */
        if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
        /*
         *    Free the service if nobody refers to it
         */
-       if (atomic_read(&svc->refcnt) == 0)
+       if (atomic_read(&svc->refcnt) == 0) {
+               IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+                             svc->fwmark,
+                             IP_VS_DBG_ADDR(svc->af, &svc->addr),
+                             ntohs(svc->port), atomic_read(&svc->usecnt));
                kfree(svc);
+       }
 
        /* decrease the module use count */
        ip_vs_use_count_dec();
 }
 
 /*
- *     Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
 {
-       if (svc == NULL)
-               return -EEXIST;
-
        /*
         * Unhash it from the service table
         */
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
        /*
         * Wait until all the svc users go away.
         */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
        __ip_vs_del_service(svc);
 
        write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *     Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+       if (svc == NULL)
+               return -EEXIST;
+       ip_vs_unlink_service(svc);
 
        return 0;
 }
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
         */
        for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
                list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-                       write_lock_bh(&__ip_vs_svc_lock);
-                       ip_vs_svc_unhash(svc);
-                       /*
-                        * Wait until all the svc users go away.
-                        */
-                       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-                       __ip_vs_del_service(svc);
-                       write_unlock_bh(&__ip_vs_svc_lock);
+                       ip_vs_unlink_service(svc);
                }
        }
 
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
        for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
                list_for_each_entry_safe(svc, nxt,
                                         &ip_vs_svc_fwm_table[idx], f_list) {
-                       write_lock_bh(&__ip_vs_svc_lock);
-                       ip_vs_svc_unhash(svc);
-                       /*
-                        * Wait until all the svc users go away.
-                        */
-                       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-                       __ip_vs_del_service(svc);
-                       write_unlock_bh(&__ip_vs_svc_lock);
+                       ip_vs_unlink_service(svc);
                }
        }
 
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
                .mode           = 0644,
                .proc_handler   = proc_do_defense_mode,
        },
+#ifdef CONFIG_IP_VS_NFCT
+       {
+               .procname       = "conntrack",
+               .data           = &sysctl_ip_vs_conntrack,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
        {
                .procname       = "secure_tcp",
                .data           = &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
                .mode           = 0644,
                .proc_handler   = proc_do_defense_mode,
        },
+       {
+               .procname       = "snat_reroute",
+               .data           = &sysctl_ip_vs_snat_reroute,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
 #if 0
        {
                .procname       = "timeout_established",
@@ -2041,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
                                  struct ip_vs_service_user *usvc_compat)
 {
+       memset(usvc, 0, sizeof(*usvc));
+
        usvc->af                = AF_INET;
        usvc->protocol          = usvc_compat->protocol;
        usvc->addr.ip           = usvc_compat->addr;
@@ -2058,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
                                   struct ip_vs_dest_user *udest_compat)
 {
+       memset(udest, 0, sizeof(*udest));
+
        udest->addr.ip          = udest_compat->addr;
        udest->port             = udest_compat->port;
        udest->conn_flags       = udest_compat->conn_flags;
@@ -2147,15 +2167,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
        /* Lookup the exact service by <protocol, addr, port> or fwmark */
        if (usvc.fwmark == 0)
-               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-                                         &usvc.addr, usvc.port);
+               svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+                                          &usvc.addr, usvc.port);
        else
-               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+               svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
 
        if (cmd != IP_VS_SO_SET_ADD
            && (svc == NULL || svc->protocol != usvc.protocol)) {
                ret = -ESRCH;
-               goto out_drop_service;
+               goto out_unlock;
        }
 
        switch (cmd) {
@@ -2189,10 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
                ret = -EINVAL;
        }
 
-out_drop_service:
-       if (svc)
-               ip_vs_service_put(svc);
-
   out_unlock:
        mutex_unlock(&__ip_vs_mutex);
   out_dec:
@@ -2285,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
        int ret = 0;
 
        if (get->fwmark)
-               svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+               svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
        else
-               svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-                                         get->port);
+               svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+                                          get->port);
 
        if (svc) {
                int count = 0;
@@ -2316,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
                        }
                        count++;
                }
-               ip_vs_service_put(svc);
        } else
                ret = -ESRCH;
        return ret;
@@ -2437,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
                entry = (struct ip_vs_service_entry *)arg;
                addr.ip = entry->addr;
                if (entry->fwmark)
-                       svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+                       svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
                else
-                       svc = __ip_vs_service_get(AF_INET, entry->protocol,
-                                                 &addr, entry->port);
+                       svc = __ip_vs_service_find(AF_INET, entry->protocol,
+                                                  &addr, entry->port);
                if (svc) {
                        ip_vs_copy_service(entry, svc);
                        if (copy_to_user(user, entry, sizeof(*entry)) != 0)
                                ret = -EFAULT;
-                       ip_vs_service_put(svc);
                } else
                        ret = -ESRCH;
        }
@@ -2560,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
        [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
        [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
                                            .len = IP_VS_SCHEDNAME_MAXLEN },
+       [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
+                                           .len = IP_VS_PENAME_MAXLEN },
        [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
                                            .len = sizeof(struct ip_vs_flags) },
        [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
@@ -2636,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
        }
 
        NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+       if (svc->pe)
+               NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
        NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
        NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
        NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2712,10 +2730,12 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-                                   struct nlattr *nla, int full_entry)
+                                   struct nlattr *nla, int full_entry,
+                                   struct ip_vs_service **ret_svc)
 {
        struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
        struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+       struct ip_vs_service *svc;
 
        /* Parse mandatory identifying service fields first */
        if (nla == NULL ||
@@ -2751,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
                usvc->fwmark = 0;
        }
 
+       if (usvc->fwmark)
+               svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+       else
+               svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+                                          &usvc->addr, usvc->port);
+       *ret_svc = svc;
+
        /* If a full entry was requested, check for the additional fields */
        if (full_entry) {
-               struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+               struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
                              *nla_netmask;
                struct ip_vs_flags flags;
-               struct ip_vs_service *svc;
 
                nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+               nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
                nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
                nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
                nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2769,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
                nla_memcpy(&flags, nla_flags, sizeof(flags));
 
                /* prefill flags from service if it already exists */
-               if (usvc->fwmark)
-                       svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-               else
-                       svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-                                                 &usvc->addr, usvc->port);
-               if (svc) {
+               if (svc)
                        usvc->flags = svc->flags;
-                       ip_vs_service_put(svc);
-               } else
-                       usvc->flags = 0;
 
                /* set new flags from userland */
                usvc->flags = (usvc->flags & ~flags.mask) |
                              (flags.flags & flags.mask);
                usvc->sched_name = nla_data(nla_sched);
+               usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
                usvc->timeout = nla_get_u32(nla_timeout);
                usvc->netmask = nla_get_u32(nla_netmask);
        }
@@ -2794,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
        struct ip_vs_service_user_kern usvc;
+       struct ip_vs_service *svc;
        int ret;
 
-       ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-       if (ret)
-               return ERR_PTR(ret);
-
-       if (usvc.fwmark)
-               return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-       else
-               return __ip_vs_service_get(usvc.af, usvc.protocol,
-                                          &usvc.addr, usvc.port);
+       ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+       return ret ? ERR_PTR(ret) : svc;
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2895,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 
 nla_put_failure:
        cb->args[0] = idx;
-       ip_vs_service_put(svc);
 
 out_err:
        mutex_unlock(&__ip_vs_mutex);
@@ -3108,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
        ret = ip_vs_genl_parse_service(&usvc,
                                       info->attrs[IPVS_CMD_ATTR_SERVICE],
-                                      need_full_svc);
+                                      need_full_svc, &svc);
        if (ret)
                goto out;
 
-       /* Lookup the exact service by <protocol, addr, port> or fwmark */
-       if (usvc.fwmark == 0)
-               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-                                         &usvc.addr, usvc.port);
-       else
-               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
        /* Unless we're adding a new service, the service must already exist */
        if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
                ret = -ESRCH;
@@ -3152,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
                break;
        case IPVS_CMD_DEL_SERVICE:
                ret = ip_vs_del_service(svc);
+               /* do not use svc, it can be freed */
                break;
        case IPVS_CMD_NEW_DEST:
                ret = ip_vs_add_dest(svc, &udest);
@@ -3170,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
        }
 
 out:
-       if (svc)
-               ip_vs_service_put(svc);
        mutex_unlock(&__ip_vs_mutex);
 
        return ret;
@@ -3217,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
                        goto out_err;
                } else if (svc) {
                        ret = ip_vs_genl_fill_service(msg, svc);
-                       ip_vs_service_put(svc);
                        if (ret)
                                goto nla_put_failure;
                } else {
@@ -3386,6 +3390,16 @@ int __init ip_vs_control_init(void)
 
        EnterFunction(2);
 
+       /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+       for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+               INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+               INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+       }
+       for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+               INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+       }
+       smp_wmb();
+
        ret = nf_register_sockopt(&ip_vs_sockopts);
        if (ret) {
                pr_err("cannot register sockopt.\n");
@@ -3404,15 +3418,6 @@ int __init ip_vs_control_init(void)
 
        sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
 
-       /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-       for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-               INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-               INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-       }
-       for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-               INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-       }
-
        ip_vs_new_estimator(&ip_vs_stats);
 
        /* Hook the defense timer */
index 7e9af5b76d9eb280cc7d197538e5ce6161c42e66..75455000ad1c1cde82b2134970ab3a67a5a97b82 100644 (file)
  *
  * Author:     Wouter Gadeyne
  *
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c:       Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
  */
 
 #define KMSG_COMPONENT "IPVS"
 #define SERVER_STRING "227 Entering Passive Mode ("
 #define CLIENT_STRING "PORT "
 
-#define FMT_TUPLE      "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)   &(T)->src.u3.ip, ntohs((T)->src.u.all), \
-                       &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
-                       (T)->dst.protonum
-
-#define FMT_CONN       "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)    &((C)->caddr.ip), ntohs((C)->cport), \
-                       &((C)->vaddr.ip), ntohs((C)->vport), \
-                       &((C)->daddr.ip), ntohs((C)->dport), \
-                       (C)->protocol, (C)->state
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv;
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 {
+       /* We use connection tracking for the command connection */
+       cp->flags |= IP_VS_CONN_F_NFCT;
        return 0;
 }
 
@@ -148,120 +129,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
        return 1;
 }
 
-/*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
-                     struct nf_conntrack_expect *exp)
-{
-       struct nf_conntrack_tuple *orig, new_reply;
-       struct ip_vs_conn *cp;
-
-       if (exp->tuple.src.l3num != PF_INET)
-               return;
-
-       /*
-        * We assume that no NF locks are held before this callback.
-        * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
-        * expectations even if they use wildcard values, now we provide the
-        * actual values from the newly created original conntrack direction.
-        * The conntrack is confirmed when packet reaches IPVS hooks.
-        */
-
-       /* RS->CLIENT */
-       orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-       cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-                               &orig->src.u3, orig->src.u.tcp.port,
-                               &orig->dst.u3, orig->dst.u.tcp.port);
-       if (cp) {
-               /* Change reply CLIENT->RS to CLIENT->VS */
-               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               new_reply.dst.u3 = cp->vaddr;
-               new_reply.dst.u.tcp.port = cp->vport;
-               IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-                         ", inout cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               goto alter;
-       }
-
-       /* CLIENT->VS */
-       cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-                              &orig->src.u3, orig->src.u.tcp.port,
-                              &orig->dst.u3, orig->dst.u.tcp.port);
-       if (cp) {
-               /* Change reply VS->CLIENT to RS->CLIENT */
-               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               new_reply.src.u3 = cp->daddr;
-               new_reply.src.u.tcp.port = cp->dport;
-               IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               goto alter;
-       }
-
-       IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-                 " - unknown expect\n",
-                 __func__, ct, ct->status, ARG_TUPLE(orig));
-       return;
-
-alter:
-       /* Never alter conntrack for non-NAT conns */
-       if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
-               nf_conntrack_alter_reply(ct, &new_reply);
-       ip_vs_conn_put(cp);
-       return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
-                    struct ip_vs_conn *cp, u_int8_t proto,
-                    const __be16 *port, int from_rs)
-{
-       struct nf_conntrack_expect *exp;
-
-       BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
-       exp = nf_ct_expect_alloc(ct);
-       if (!exp)
-               return;
-
-       if (from_rs)
-               nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-                                 nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
-                                 proto, port, &cp->cport);
-       else
-               nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-                                 nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
-                                 proto, port, &cp->vport);
-
-       exp->expectfn = ip_vs_expect_callback;
-
-       IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
-                 __func__, ct, ARG_TUPLE(&exp->tuple));
-       nf_ct_expect_related(exp);
-       nf_ct_expect_put(exp);
-}
-
 /*
  * Look at outgoing ftp packets to catch the response to a PASV command
  * from the server (inside-to-outside).
@@ -328,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                /*
                 * Now update or create an connection entry for it
                 */
-               n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-                                         &cp->caddr, 0);
+               {
+                       struct ip_vs_conn_param p;
+                       ip_vs_conn_fill_param(AF_INET, iph->protocol,
+                                             &from, port, &cp->caddr, 0, &p);
+                       n_cp = ip_vs_conn_out_get(&p);
+               }
                if (!n_cp) {
-                       n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-                                             &cp->caddr, 0,
-                                             &cp->vaddr, port,
-                                             &from, port,
-                                             IP_VS_CONN_F_NO_CPORT,
+                       struct ip_vs_conn_param p;
+                       ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+                                             0, &cp->vaddr, port, &p);
+                       n_cp = ip_vs_conn_new(&p, &from, port,
+                                             IP_VS_CONN_F_NO_CPORT |
+                                             IP_VS_CONN_F_NFCT,
                                              cp->dest);
                        if (!n_cp)
                                return 0;
@@ -370,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                        ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
                                                       start-data, end-start,
                                                       buf, buf_len);
-                       if (ret)
-                               ip_vs_expect_related(skb, ct, n_cp,
-                                                    IPPROTO_TCP, NULL, 0);
+                       if (ret) {
+                               ip_vs_nfct_expect_related(skb, ct, n_cp,
+                                                         IPPROTO_TCP, 0, 0);
+                               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                                       skb->ip_summed = CHECKSUM_UNNECESSARY;
+                               /* csum is updated */
+                               ret = 1;
+                       }
                }
 
                /*
@@ -479,21 +356,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
                  ip_vs_proto_name(iph->protocol),
                  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
 
-       n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-                                &to, port,
-                                &cp->vaddr, htons(ntohs(cp->vport)-1));
-       if (!n_cp) {
-               n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-                                     &to, port,
+       {
+               struct ip_vs_conn_param p;
+               ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
                                      &cp->vaddr, htons(ntohs(cp->vport)-1),
-                                     &cp->daddr, htons(ntohs(cp->dport)-1),
-                                     0,
-                                     cp->dest);
-               if (!n_cp)
-                       return 0;
+                                     &p);
+               n_cp = ip_vs_conn_in_get(&p);
+               if (!n_cp) {
+                       n_cp = ip_vs_conn_new(&p, &cp->daddr,
+                                             htons(ntohs(cp->dport)-1),
+                                             IP_VS_CONN_F_NFCT, cp->dest);
+                       if (!n_cp)
+                               return 0;
 
-               /* add its controller */
-               ip_vs_control_add(n_cp, cp);
+                       /* add its controller */
+                       ip_vs_control_add(n_cp, cp);
+               }
        }
 
        /*
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644 (file)
index 0000000..4680647
--- /dev/null
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c:       Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>                Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>      Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE      "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)   &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+                       &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+                       (T)->dst.protonum
+
+#define FMT_CONN       "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)    &((C)->caddr.ip), ntohs((C)->cport), \
+                       &((C)->vaddr.ip), ntohs((C)->vport), \
+                       &((C)->daddr.ip), ntohs((C)->dport), \
+                       (C)->protocol, (C)->state
+
+/* Alter the reply tuple of a not yet confirmed conntrack for IPVS NAT */
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+       struct nf_conntrack_tuple new_tuple;
+
+       if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+           nf_ct_is_dying(ct))
+               return;
+
+       /* Never alter conntrack for non-NAT conns */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+               return;
+
+       /* Alter reply only in original direction */
+       if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+               return;
+
+       /*
+        * The connection is not yet in the hashtable, so we update it.
+        * CIP->VIP will remain the same, so leave the tuple in
+        * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+        * real-server we will see RIP->DIP.
+        */
+       new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+       /* This will also take care of UDP and other protocols; the port is
+        * left alone for ICMP and ICMPv6 tuples. */
+       if (outin) {
+               new_tuple.src.u3 = cp->daddr;
+               if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+                   new_tuple.dst.protonum != IPPROTO_ICMPV6)
+                       new_tuple.src.u.tcp.port = cp->dport;
+       } else {
+               new_tuple.dst.u3 = cp->vaddr;
+               if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+                   new_tuple.dst.protonum != IPPROTO_ICMPV6)
+                       new_tuple.dst.u.tcp.port = cp->vport;
+       }
+       IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+                 "ctinfo=%d, old reply=" FMT_TUPLE
+                 ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+                 __func__, ct, ct->status, ctinfo,
+                 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+                 ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+       nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+       return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+       struct nf_conntrack_expect *exp)
+{
+       struct nf_conntrack_tuple *orig, new_reply;
+       struct ip_vs_conn *cp;
+       struct ip_vs_conn_param p;
+
+       if (exp->tuple.src.l3num != PF_INET)
+               return;
+
+       /*
+        * We assume that no NF locks are held before this callback.
+        * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+        * expectations even if they use wildcard values, now we provide the
+        * actual values from the newly created original conntrack direction.
+        * The conntrack is confirmed when packet reaches IPVS hooks.
+        */
+
+       /* RS->CLIENT */
+       orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+       ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+                             &orig->src.u3, orig->src.u.tcp.port,
+                             &orig->dst.u3, orig->dst.u.tcp.port, &p);
+       cp = ip_vs_conn_out_get(&p);
+       if (cp) {
+               /* Change reply CLIENT->RS to CLIENT->VS */
+               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+                         __func__, ct, ct->status,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               new_reply.dst.u3 = cp->vaddr;
+               new_reply.dst.u.tcp.port = cp->vport;
+               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+                         ", inout cp=" FMT_CONN "\n",
+                         __func__, ct,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               goto alter;
+       }
+
+       /* CLIENT->VS */
+       cp = ip_vs_conn_in_get(&p);
+       if (cp) {
+               /* Change reply VS->CLIENT to RS->CLIENT */
+               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+                         __func__, ct, ct->status,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               new_reply.src.u3 = cp->daddr;
+               new_reply.src.u.tcp.port = cp->dport;
+               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+                         __func__, ct,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               goto alter;
+       }
+
+       IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+                 " - unknown expect\n",
+                 __func__, ct, ct->status, ARG_TUPLE(orig));
+       return;
+
+alter:
+       /* Never alter conntrack for non-NAT conns */
+       if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+               nf_conntrack_alter_reply(ct, &new_reply);
+       ip_vs_conn_put(cp);
+       return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+                              struct ip_vs_conn *cp, u_int8_t proto,
+                              const __be16 port, int from_rs)
+{
+       struct nf_conntrack_expect *exp;
+
+       if (ct == NULL || nf_ct_is_untracked(ct))
+               return;
+
+       exp = nf_ct_expect_alloc(ct);
+       if (!exp)
+               return;
+
+       nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+                       from_rs ? &cp->daddr : &cp->caddr,
+                       from_rs ? &cp->caddr : &cp->vaddr,
+                       proto, port ? &port : NULL,
+                       from_rs ? &cp->cport : &cp->vport);
+
+       exp->expectfn = ip_vs_nfct_expect_callback;
+
+       IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+               __func__, ct, ARG_TUPLE(&exp->tuple));
+       nf_ct_expect_related(exp);
+       nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+       struct nf_conntrack_tuple_hash *h;
+       struct nf_conn *ct;
+       struct nf_conntrack_tuple tuple;
+
+       if (!cp->cport)
+               return;
+
+       tuple = (struct nf_conntrack_tuple) {
+               .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+       tuple.src.u3 = cp->caddr;
+       tuple.src.u.all = cp->cport;
+       tuple.src.l3num = cp->af;
+       tuple.dst.u3 = cp->vaddr;
+       tuple.dst.u.all = cp->vport;
+
+       IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+               " for conn " FMT_CONN "\n",
+               __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+       h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+       if (h) {
+               ct = nf_ct_tuplehash_to_ctrack(h);
+               /* Show what happens instead of calling nf_ct_kill() */
+               if (del_timer(&ct->timeout)) {
+                       IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+                               FMT_TUPLE "\n",
+                               __func__, ct, ARG_TUPLE(&tuple));
+                       if (ct->timeout.function)
+                               ct->timeout.function(ct->timeout.data);
+               } else {
+                       IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+                               FMT_TUPLE "\n",
+                               __func__, ct, ARG_TUPLE(&tuple));
+               }
+               nf_ct_put(ct);
+       } else {
+               IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+                       __func__, ARG_TUPLE(&tuple));
+       }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644 (file)
index 0000000..3414af7
--- /dev/null
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock protecting the pe list */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+       svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+       svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+       struct ip_vs_pe *pe;
+
+       IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+                 pe_name);
+
+       spin_lock_bh(&ip_vs_pe_lock);
+
+       list_for_each_entry(pe, &ip_vs_pe, n_list) {
+               /* Test and get the modules atomically */
+               if (pe->module &&
+                   !try_module_get(pe->module)) {
+                       /* This pe is just deleted */
+                       continue;
+               }
+               if (strcmp(pe_name, pe->name)==0) {
+                       /* HIT */
+                       spin_unlock_bh(&ip_vs_pe_lock);
+                       return pe;
+               }
+               if (pe->module)
+                       module_put(pe->module);
+       }
+
+       spin_unlock_bh(&ip_vs_pe_lock);
+       return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+       struct ip_vs_pe *pe;
+
+       /* Search for the pe by name */
+       pe = ip_vs_pe_getbyname(name);
+
+       /* If pe not found, load the module and search again */
+       if (!pe) {
+               request_module("ip_vs_pe_%s", name);
+               pe = ip_vs_pe_getbyname(name);
+       }
+
+       return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+       if (pe && pe->module)
+               module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+       struct ip_vs_pe *tmp;
+
+       /* increase the module use count */
+       ip_vs_use_count_inc();
+
+       spin_lock_bh(&ip_vs_pe_lock);
+
+       if (!list_empty(&pe->n_list)) {
+               spin_unlock_bh(&ip_vs_pe_lock);
+               ip_vs_use_count_dec();
+               pr_err("%s(): [%s] pe already linked\n",
+                      __func__, pe->name);
+               return -EINVAL;
+       }
+
+       /* Make sure that the pe with this name doesn't exist
+        * in the pe list.
+        */
+       list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+               if (strcmp(tmp->name, pe->name) == 0) {
+                       spin_unlock_bh(&ip_vs_pe_lock);
+                       ip_vs_use_count_dec();
+                       pr_err("%s(): [%s] pe already existed "
+                              "in the system\n", __func__, pe->name);
+                       return -EINVAL;
+               }
+       }
+       /* Add it into the d-linked pe list */
+       list_add(&pe->n_list, &ip_vs_pe);
+       spin_unlock_bh(&ip_vs_pe_lock);
+
+       pr_info("[%s] pe registered.\n", pe->name);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+       spin_lock_bh(&ip_vs_pe_lock);
+       if (list_empty(&pe->n_list)) {
+               spin_unlock_bh(&ip_vs_pe_lock);
+               pr_err("%s(): [%s] pe is not in the list. failed\n",
+                      __func__, pe->name);
+               return -EINVAL;
+       }
+
+       /* Remove it from the d-linked pe list */
+       list_del(&pe->n_list);
+       spin_unlock_bh(&ip_vs_pe_lock);
+
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+
+       pr_info("[%s] pe unregistered.\n", pe->name);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644 (file)
index 0000000..b8b4e96
--- /dev/null
@@ -0,0 +1,169 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+                                   const char *callid, size_t callid_len,
+                                   int *idx)
+{
+       size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+       memcpy(buf + *idx, callid, len);
+       buf[*idx+len] = '\0';
+       *idx += len + 1;
+       return buf + *idx - len - 1;    /* *idx now sits past the NUL */
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len)                                        \
+       ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),          \
+                        callid, len, &ip_vs_dbg_idx)
+#endif
+
+/* Find the SIP Call-ID; 0 with matchoff/matchlen filled in, else -EINVAL */
+static int get_callid(const char *dptr, unsigned int dataoff,
+                     unsigned int datalen,
+                     unsigned int *matchoff, unsigned int *matchlen)
+{
+       /* Find callid */
+       while (1) {
+               int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+                                           SIP_HDR_CALL_ID, matchoff,
+                                           matchlen);
+               if (ret > 0)
+                       break;
+               if (!ret)
+                       return -EINVAL; /* no Call-ID header at all */
+               dataoff += *matchoff;
+       }
+
+       /* Empty callid is useless */
+       if (!*matchlen)
+               return -EINVAL;
+       /* Too large is useless */
+       if (*matchlen > IP_VS_PEDATA_MAXLEN)
+               return -EINVAL;
+
+       /* SIP headers are always followed by a line terminator */
+       if (*matchoff + *matchlen == datalen)
+               return -EINVAL;
+
+       /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+        * RFC 3261 allows only CRLF, we support both. */
+       if (*(dptr + *matchoff + *matchlen) != '\r' &&
+           *(dptr + *matchoff + *matchlen) != '\n')
+               return -EINVAL;
+
+       IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+                     IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+                     *matchlen);
+       return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+       struct ip_vs_iphdr iph;
+       unsigned int dataoff, datalen, matchoff, matchlen;
+       const char *dptr;
+
+       ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+       /* Only useful with UDP */
+       if (iph.protocol != IPPROTO_UDP)
+               return -EINVAL;
+
+       /* No Data ? */
+       dataoff = iph.len + sizeof(struct udphdr);
+       if (dataoff >= skb->len)
+               return -EINVAL;
+
+       dptr = skb->data + dataoff;
+       datalen = skb->len - dataoff;
+       /* dptr already points at the SIP payload: search it from offset 0 */
+       if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
+               return -EINVAL;
+
+       p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+       if (!p->pe_data)
+               return -ENOMEM;
+
+       /* N.B: pe_data is only set on success,
+        * this allows fallback to the default persistence logic on failure
+        */
+       memcpy(p->pe_data, dptr + matchoff, matchlen);
+       p->pe_data_len = matchlen;
+
+       return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+                                 struct ip_vs_conn *ct)
+
+{
+       bool ret = 0;
+
+       if (ct->af == p->af &&
+           ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+           /* protocol should only be IPPROTO_IP if
+            * d_addr is a fwmark */
+           ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+                            p->vaddr, &ct->vaddr) &&
+           ct->vport == p->vport &&
+           ct->flags & IP_VS_CONN_F_TEMPLATE &&
+           ct->protocol == p->protocol &&
+           ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+           !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+               ret = 1;
+
+       IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+                     ip_vs_proto_name(p->protocol),
+                     IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+                     IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+                     ret ? "hit" : "not hit");
+
+       return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+                                u32 initval, bool inverse)
+{
+       return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+       memcpy(buf, cp->pe_data, cp->pe_data_len);
+       return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+       .name =                 "sip",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+       .fill_param =           ip_vs_sip_fill_param,
+       .ct_match =             ip_vs_sip_ct_match,
+       .hashkey_raw =          ip_vs_sip_hashkey_raw,
+       .show_pe_data =         ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+       return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+       unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
index 027f654799feb969fe5a64de8b262e37287e8f48..c539983908771ead7df2e14b7d4eb9c259369064 100644 (file)
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
        else if (ih->frag_off & htons(IP_OFFSET))
                sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
        else {
-               __be16 _ports[2], *pptr
-;
+               __be16 _ports[2], *pptr;
+
                pptr = skb_header_pointer(skb, offset + ih->ihl*4,
                                          sizeof(_ports), _ports);
                if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
                          const struct sk_buff *skb,
                          int offset,
                          const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
+       if (af == AF_INET6)
                ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
        else
 #endif
index 1892dfc12fdd96ffa1169d5922243cfd0010e7e5..3a0461117d3fad6216747b60b5ec24856ca9d7fc 100644 (file)
@@ -40,6 +40,19 @@ struct isakmp_hdr {
 
 #define PORT_ISAKMP    500
 
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+                            int inverse, struct ip_vs_conn_param *p)
+{
+       if (likely(!inverse))
+               ip_vs_conn_fill_param(af, IPPROTO_UDP,
+                                     &iph->saddr, htons(PORT_ISAKMP),
+                                     &iph->daddr, htons(PORT_ISAKMP), p);
+       else
+               ip_vs_conn_fill_param(af, IPPROTO_UDP,
+                                     &iph->daddr, htons(PORT_ISAKMP),
+                                     &iph->saddr, htons(PORT_ISAKMP), p);
+}
 
 static struct ip_vs_conn *
 ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
                   int inverse)
 {
        struct ip_vs_conn *cp;
+       struct ip_vs_conn_param p;
 
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-                                      &iph->saddr,
-                                      htons(PORT_ISAKMP),
-                                      &iph->daddr,
-                                      htons(PORT_ISAKMP));
-       } else {
-               cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-                                      &iph->daddr,
-                                      htons(PORT_ISAKMP),
-                                      &iph->saddr,
-                                      htons(PORT_ISAKMP));
-       }
-
+       ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+       cp = ip_vs_conn_in_get(&p);
        if (!cp) {
                /*
                 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
                    int inverse)
 {
        struct ip_vs_conn *cp;
+       struct ip_vs_conn_param p;
 
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-                                       &iph->saddr,
-                                       htons(PORT_ISAKMP),
-                                       &iph->daddr,
-                                       htons(PORT_ISAKMP));
-       } else {
-               cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-                                       &iph->daddr,
-                                       htons(PORT_ISAKMP),
-                                       &iph->saddr,
-                                       htons(PORT_ISAKMP));
-       }
-
+       ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+       cp = ip_vs_conn_out_get(&p);
        if (!cp) {
                IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
                              "%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        return 0;
 }
 
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct iphdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "TRUNCATED");
-       else
-               sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
-       pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct ipv6hdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "TRUNCATED");
-       else
-               sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
-       pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                   int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
-               ah_esp_debug_packet_v6(pp, skb, offset, msg);
-       else
-#endif
-               ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
        /* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
        .register_app =         NULL,
        .unregister_app =       NULL,
        .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
        .timeout_change =       NULL,           /* ISAKMP */
        .set_state_timeout =    NULL,
 };
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
        .register_app =         NULL,
        .unregister_app =       NULL,
        .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
        .timeout_change =       NULL,           /* ISAKMP */
 };
 #endif
index 2f982a4c47709fe42e061a5d65565041d8b97cbf..1ea96bcd342b8fc81eb437a6826fc6652c2be4c2 100644 (file)
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        if ((sch->type == SCTP_CID_INIT) &&
            (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
                                     &iph.daddr, sh->dest))) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -175,7 +177,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 
        if (val != cmp) {
                /* CRC failure, dump it. */
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                "Failed checksum for");
                return 0;
        }
index 282d24de8592e659466657533b10ad6eadf4bd5c..f6c5200e214663fe915b2136532c54d03861e5eb 100644 (file)
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                return 0;
        }
 
+       /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
        if (th->syn &&
            (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
                                     th->dest))) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                tcph->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                       ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
                                         ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
+                                               csum_unfold(tcph->check))));
        else
 #endif
        tcph->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+               ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
                                ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
+                                               csum_unfold(tcph->check))));
 }
 
 
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
        struct tcphdr *tcph;
        unsigned int tcphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
 
                /* Call application helper if needed */
-               if (!ip_vs_app_pkt_out(cp, skb))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - tcphoff;
+               else
+                       payload_csum = 1;
        }
 
        tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
                                        htons(oldlen),
                                        htons(skb->len - tcphoff));
-       } else if (!cp->app) {
+       } else if (!payload_csum) {
                /* Only port and addr are changed, do fast csum update */
                tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
                                     cp->dport, cp->vport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
                                                        skb->len - tcphoff,
                                                        cp->protocol,
                                                        skb->csum);
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
 
                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
                          pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
        struct tcphdr *tcph;
        unsigned int tcphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn and iph ack_seq stuff
                 */
-               if (!ip_vs_app_pkt_in(cp, skb))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - tcphoff;
+               else
+                       payload_csum = 1;
        }
 
        tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
         *      Adjust TCP checksums
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+               tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
                                        htons(oldlen),
                                        htons(skb->len - tcphoff));
-       } else if (!cp->app) {
+       } else if (!payload_csum) {
                /* Only port and addr are changed, do fast csum update */
                tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
                                     cp->vport, cp->dport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                            skb->len - tcphoff,
                                            ipv6_hdr(skb)->nexthdr,
                                            skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                 "Failed checksum for");
                                return 0;
                        }
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                              skb->len - tcphoff,
                                              ip_hdr(skb)->protocol,
                                              skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                               IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                 "Failed checksum for");
                                return 0;
                        }
index 8553231b5d412ca557f8699ee998e05351152213..9d106a06bb0a46376252b32f2d30882d921b8b16 100644 (file)
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        svc = ip_vs_service_get(af, skb->mark, iph.protocol,
                                &iph.daddr, uh->dest);
        if (svc) {
+               int ignored;
+
                if (ip_vs_todrop()) {
                        /*
                         * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                 * Let the virtual server select a real server for the
                 * incoming connection, and create a connection entry.
                 */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
+               *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+               if (!*cpp && !ignored) {
                        *verdict = ip_vs_leave(svc, skb, pp);
                        return 0;
                }
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                uhdr->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                       ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
                                         ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
+                                               csum_unfold(uhdr->check))));
        else
 #endif
        uhdr->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+               ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
                                ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
+                                               csum_unfold(uhdr->check))));
 }
 
 
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
        struct udphdr *udph;
        unsigned int udphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
                /*
                 *      Call application helper if needed
                 */
-               if (!ip_vs_app_pkt_out(cp, skb))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - udphoff;
+               else
+                       payload_csum = 1;
        }
 
        udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
                udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
                                        htons(oldlen),
                                        htons(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
+       } else if (!payload_csum && (udph->check != 0)) {
                /* Only port and addr are changed, do fast csum update */
                udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
                                     cp->dport, cp->vport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
                                                        skb->csum);
                if (udph->check == 0)
                        udph->check = CSUM_MANGLED_0;
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
                          pp->name, udph->check,
                          (char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
        struct udphdr *udph;
        unsigned int udphoff;
        int oldlen;
+       int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
        if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
                return 0;
 
        if (unlikely(cp->app != NULL)) {
+               int ret;
+
                /* Some checks before mangling */
                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
                        return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn
                 */
-               if (!ip_vs_app_pkt_in(cp, skb))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
                        return 0;
+               /* ret=2: csum update is needed after payload mangling */
+               if (ret == 1)
+                       oldlen = skb->len - udphoff;
+               else
+                       payload_csum = 1;
        }
 
        udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
         *      Adjust UDP checksums
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+               udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
                                        htons(oldlen),
                                        htons(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
+       } else if (!payload_csum && (udph->check != 0)) {
                /* Only port and addr are changed, do fast csum update */
                udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
                                     cp->vport, cp->dport);
                if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->ip_summed = (cp->app && pp->csum_check) ?
+                                        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                                    skb->len - udphoff,
                                                    ipv6_hdr(skb)->nexthdr,
                                                    skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                       IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                         "Failed checksum for");
                                        return 0;
                                }
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
                                                      skb->len - udphoff,
                                                      ip_hdr(skb)->protocol,
                                                      skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                       IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
                                                         "Failed checksum for");
                                        return 0;
                                }
index 727e45b669531338538b123649e696f0be20e775..076ebe00435deef930f428fbb02414c550c14b98 100644 (file)
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
        int ret;
 
-       if (svc == NULL) {
-               pr_err("%s(): svc arg NULL\n", __func__);
-               return -EINVAL;
-       }
-       if (scheduler == NULL) {
-               pr_err("%s(): scheduler arg NULL\n", __func__);
-               return -EINVAL;
-       }
-
        svc->scheduler = scheduler;
 
        if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
  */
 int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-       struct ip_vs_scheduler *sched;
+       struct ip_vs_scheduler *sched = svc->scheduler;
 
-       if (svc == NULL) {
-               pr_err("%s(): svc arg NULL\n", __func__);
-               return -EINVAL;
-       }
-
-       sched = svc->scheduler;
-       if (sched == NULL) {
-               pr_err("%s(): svc isn't bound\n", __func__);
-               return -EINVAL;
-       }
+       if (!sched)
+               return 0;
 
        if (sched->done_service) {
                if (sched->done_service(svc) != 0) {
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
 
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 {
-       if (scheduler->module)
+       if (scheduler && scheduler->module)
                module_put(scheduler->module);
 }
 
index 7ba06939829f3774ca34d813a41e58e8725cc6a8..ab85aedea17eea6100eb1aefe48b028d371f29d2 100644 (file)
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
                ip_vs_sync_conn(cp->control);
 }
 
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+                          const union nf_inet_addr *caddr, __be16 cport,
+                          const union nf_inet_addr *vaddr, __be16 vport,
+                          struct ip_vs_conn_param *p)
+{
+       /* XXX: Need to take into account persistence engine */
+       ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+       return 0;
+}
 
 /*
  *      Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
        struct ip_vs_conn *cp;
        struct ip_vs_protocol *pp;
        struct ip_vs_dest *dest;
+       struct ip_vs_conn_param param;
        char *p;
        int i;
 
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
                        }
                }
 
-               if (!(flags & IP_VS_CONN_F_TEMPLATE))
-                       cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-                                              (union nf_inet_addr *)&s->caddr,
-                                              s->cport,
-                                              (union nf_inet_addr *)&s->vaddr,
-                                              s->vport);
-               else
-                       cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-                                            (union nf_inet_addr *)&s->caddr,
-                                            s->cport,
-                                            (union nf_inet_addr *)&s->vaddr,
-                                            s->vport);
+               {
+                       if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+                                             (union nf_inet_addr *)&s->caddr,
+                                             s->cport,
+                                             (union nf_inet_addr *)&s->vaddr,
+                                             s->vport, &param)) {
+                               pr_err("ip_vs_conn_fill_param_sync failed");
+                               return;
+                       }
+                       if (!(flags & IP_VS_CONN_F_TEMPLATE))
+                               cp = ip_vs_conn_in_get(&param);
+                       else
+                               cp = ip_vs_ct_in_get(&param);
+               }
                if (!cp) {
                        /*
                         * Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
                                else
                                        flags &= ~IP_VS_CONN_F_INACTIVE;
                        }
-                       cp = ip_vs_conn_new(AF_INET, s->protocol,
-                                           (union nf_inet_addr *)&s->caddr,
-                                           s->cport,
-                                           (union nf_inet_addr *)&s->vaddr,
-                                           s->vport,
+                       cp = ip_vs_conn_new(&param,
                                            (union nf_inet_addr *)&s->daddr,
-                                           s->dport,
-                                           flags, dest);
+                                           s->dport, flags, dest);
                        if (dest)
                                atomic_dec(&dest->refcnt);
                        if (!cp) {
index 49df6bea6a2ddaec391ce077cf9f72a72efc4a7f..de04ea39cde8990025bdb5f63ff408fcc948fb0a 100644 (file)
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -26,9 +36,9 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip_vs.h>
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+               u32 dst_cookie)
 {
        struct dst_entry *old_dst;
 
        old_dst = dest->dst_cache;
        dest->dst_cache = dst;
        dest->dst_rtos = rtos;
+       dest->dst_cookie = dst_cookie;
        dst_release(old_dst);
 }
 
 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
        struct dst_entry *dst = dest->dst_cache;
 
        if (!dst)
                return NULL;
-       if ((dst->obsolete
-            || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-           dst->ops->check(dst, cookie) == NULL) {
+       if ((dst->obsolete || rtos != dest->dst_rtos) &&
+           dst->ops->check(dst, dest->dst_cookie) == NULL) {
                dest->dst_cache = NULL;
                dst_release(dst);
                return NULL;
@@ -66,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
        return dst;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *         &4=Allow redirect from remote daddr to local
+ */
 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+                  __be32 daddr, u32 rtos, int rt_mode)
 {
+       struct net *net = dev_net(skb_dst(skb)->dev);
        struct rtable *rt;                      /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
+       struct rtable *ort;                     /* Original route */
+       int local;
 
        if (dest) {
                spin_lock(&dest->dst_lock);
                if (!(rt = (struct rtable *)
-                     __ip_vs_dst_check(dest, rtos, 0))) {
+                     __ip_vs_dst_check(dest, rtos))) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
@@ -85,13 +104,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
                                                .tos = rtos, } },
                        };
 
-                       if (ip_route_output_key(&init_net, &rt, &fl)) {
+                       if (ip_route_output_key(net, &rt, &fl)) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
                                             &dest->addr.ip);
                                return NULL;
                        }
-                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
                        IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
                                  &dest->addr.ip,
                                  atomic_read(&rt->dst.__refcnt), rtos);
@@ -102,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
                        .oif = 0,
                        .nl_u = {
                                .ip4_u = {
-                                       .daddr = cp->daddr.ip,
+                                       .daddr = daddr,
                                        .saddr = 0,
                                        .tos = rtos, } },
                };
 
-               if (ip_route_output_key(&init_net, &rt, &fl)) {
+               if (ip_route_output_key(net, &rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
-                                    &cp->daddr.ip);
+                                    &daddr);
                        return NULL;
                }
        }
 
+       local = rt->rt_flags & RTCF_LOCAL;
+       if (!((local ? 1 : 2) & rt_mode)) {
+               IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+                            (rt->rt_flags & RTCF_LOCAL) ?
+                            "local":"non-local", &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+       if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+                                        ort->rt_flags & RTCF_LOCAL)) {
+               IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+                            "requires NAT method, dest: %pI4\n",
+                            &ip_hdr(skb)->daddr, &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+       if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+               IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+                            "to non-local address, dest: %pI4\n",
+                            &ip_hdr(skb)->saddr, &rt->rt_dst);
+               ip_rt_put(rt);
+               return NULL;
+       }
+
        return rt;
 }
 
+/*
+ * Reroute packet to local IPv4 stack after DNAT: redo ip_route_input()
+ * when the current route has fl.iif set, else look up an output route
+ * and accept it only if RTCF_LOCAL.  Returns 1 on success, 0 on failure.
+ */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+       struct rtable *rt = skb_rtable(skb);
+       struct net *net = dev_net(rt->dst.dev);
+       struct iphdr *iph = ip_hdr(skb);
+
+       if (rt->fl.iif) {
+               unsigned long orefdst = skb->_skb_refdst;
+
+               if (ip_route_input(skb, iph->daddr, iph->saddr,
+                                  iph->tos, skb->dev))
+                       return 0;
+               refdst_drop(orefdst);
+       } else {
+               struct flowi fl = {
+                       .oif = 0,
+                       .nl_u = {
+                               .ip4_u = {
+                                       .daddr = iph->daddr,
+                                       .saddr = iph->saddr,
+                                       .tos = RT_TOS(iph->tos), } },
+                       .mark = skb->mark,
+               };
+
+               if (ip_route_output_key(net, &rt, &fl))
+                       return 0;
+               if (!(rt->rt_flags & RTCF_LOCAL)) {
+                       ip_rt_put(rt);
+                       return 0;
+               }
+               /* New route is local: replace the old one held by skb */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       }
+       return 1;
+}
+
 #ifdef CONFIG_IP_VS_IPV6
+/* Returns 1 if the route's device (rt6i_dev) has IFF_LOOPBACK set, else 0 */
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+       return rt->rt6i_dev ? !!(rt->rt6i_dev->flags & IFF_LOOPBACK) : 0;
+}
+
+/*
+ * Resolve an IPv6 output route to daddr.  With ret_saddr set, an unset
+ * flow source is filled by ipv6_dev_get_saddr(), xfrm_lookup() is applied
+ * if do_xfrm, and the source is copied out.  Returns NULL on any error.
+ */
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+                       struct in6_addr *ret_saddr, int do_xfrm)
+{
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = { .ip6_u = { .daddr = *daddr, }, },
+       };
+       struct dst_entry *dst = ip6_route_output(net, NULL, &fl);
+
+       if (dst->error)
+               goto fail;
+       if (ret_saddr) {
+               if (ipv6_addr_any(&fl.fl6_src) &&
+                   ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+                                      &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+                       goto fail;
+               if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+                       goto fail;
+               ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+       }
+       return dst;
+
+fail:
+       dst_release(dst);
+       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+       return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *         &4=Allow redirect from remote daddr to local
+ */
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+                     struct in6_addr *daddr, struct in6_addr *ret_saddr,
+                     int do_xfrm, int rt_mode)
 {
+       struct net *net = dev_net(skb_dst(skb)->dev);
        struct rt6_info *rt;                    /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
+       struct rt6_info *ort;                   /* Original route */
+       struct dst_entry *dst;
+       int local;
 
        if (dest) {
                spin_lock(&dest->dst_lock);
-               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
                if (!rt) {
-                       struct flowi fl = {
-                               .oif = 0,
-                               .nl_u = {
-                                       .ip6_u = {
-                                               .daddr = dest->addr.in6,
-                                               .saddr = {
-                                                       .s6_addr32 =
-                                                               { 0, 0, 0, 0 },
-                                               },
-                                       },
-                               },
-                       };
+                       u32 cookie;
 
-                       rt = (struct rt6_info *)ip6_route_output(&init_net,
-                                                                NULL, &fl);
-                       if (!rt) {
+                       dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+                                                     &dest->dst_saddr,
+                                                     do_xfrm);
+                       if (!dst) {
                                spin_unlock(&dest->dst_lock);
-                               IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-                                            &dest->addr.in6);
                                return NULL;
                        }
-                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-                       IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-                                 &dest->addr.in6,
+                       rt = (struct rt6_info *) dst;
+                       cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+                       IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+                                 &dest->addr.in6, &dest->dst_saddr,
                                  atomic_read(&rt->dst.__refcnt));
                }
+               if (ret_saddr)
+                       ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
                spin_unlock(&dest->dst_lock);
        } else {
-               struct flowi fl = {
-                       .oif = 0,
-                       .nl_u = {
-                               .ip6_u = {
-                                       .daddr = cp->daddr.in6,
-                                       .saddr = {
-                                               .s6_addr32 = { 0, 0, 0, 0 },
-                                       },
-                               },
-                       },
-               };
-
-               rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-               if (!rt) {
-                       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-                                    &cp->daddr.in6);
+               dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+               if (!dst)
                        return NULL;
-               }
+               rt = (struct rt6_info *) dst;
+       }
+
+       local = __ip_vs_is_local_route6(rt);
+       if (!((local ? 1 : 2) & rt_mode)) {
+               IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+                            local ? "local":"non-local", daddr);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+       if (local && !(rt_mode & 4) &&
+           !((ort = (struct rt6_info *) skb_dst(skb)) &&
+             __ip_vs_is_local_route6(ort))) {
+               IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+                            "requires NAT method, dest: %pI6\n",
+                            &ipv6_hdr(skb)->daddr, daddr);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+       if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+                    ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+                                   IPV6_ADDR_LOOPBACK)) {
+               IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+                            "to non-local address, dest: %pI6\n",
+                            &ipv6_hdr(skb)->saddr, daddr);
+               dst_release(&rt->dst);
+               return NULL;
        }
 
        return rt;
@@ -194,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
        dst_release(old_dst);
 }
 
-#define IP_VS_XMIT(pf, skb, rt)                                \
+#define IP_VS_XMIT_TUNNEL(skb, cp)                             \
+({                                                             \
+       /* Yields NF_ACCEPT or ip_vs_confirm_conntrack()'s verdict */ \
+       int __verdict;                                          \
+       (skb)->ipvs_property = 1;                               \
+       __verdict = unlikely((cp)->flags & IP_VS_CONN_F_NFCT) ? \
+               ip_vs_confirm_conntrack(skb, cp) : NF_ACCEPT;   \
+       if (__verdict == NF_ACCEPT) {                           \
+               nf_reset(skb);                                  \
+               skb_forward_csum(skb);                          \
+       }                                                       \
+       __verdict;                                              \
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)             \
+do {                                                   \
+       (skb)->ipvs_property = 1;                       \
+       if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))  \
+               ip_vs_update_conntrack(skb, cp, 1);     \
+       else                                            \
+               ip_vs_notrack(skb);                     \
+       if (local)                                      \
+               return NF_ACCEPT; /* returns from the caller */ \
+       skb_forward_csum(skb);                          \
+       NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
+               skb_dst(skb)->dev, dst_output);         \
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp, local)                 \
 do {                                                   \
        (skb)->ipvs_property = 1;                       \
+       if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+               ip_vs_notrack(skb);                     \
+       if (local)                                      \
+               return NF_ACCEPT; /* returns from the caller */ \
        skb_forward_csum(skb);                          \
        NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
-               (rt)->dst.dev, dst_output);             \
+               skb_dst(skb)->dev, dst_output);         \
 } while (0)
 
 
@@ -211,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp)
 {
        /* we do not touch skb and do not need pskb ptr */
-       return NF_ACCEPT;
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -226,24 +398,13 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
-       u8     tos = iph->tos;
        int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip4_u = {
-                               .daddr = iph->daddr,
-                               .saddr = 0,
-                               .tos = RT_TOS(tos), } },
-       };
 
        EnterFunction(10);
 
-       if (ip_route_output_key(&init_net, &rt, &fl)) {
-               IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
-                            __func__, &iph->daddr);
+       if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+                                     RT_TOS(iph->tos), 2)))
                goto tx_error_icmp;
-       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
@@ -271,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -292,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rt6_info *rt;                    /* Route to the other host */
        struct ipv6hdr  *iph = ipv6_hdr(skb);
        int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip6_u = {
-                               .daddr = iph->daddr,
-                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-       };
 
        EnterFunction(10);
 
-       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-       if (!rt) {
-               IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-                            __func__, &iph->daddr);
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
                goto tx_error_icmp;
-       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }
@@ -335,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -349,36 +504,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 }
 #endif
 
-void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
-{
-       struct nf_conn *ct = (struct nf_conn *)skb->nfct;
-       struct nf_conntrack_tuple new_tuple;
-
-       if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
-               return;
-
-       /*
-        * The connection is not yet in the hashtable, so we update it.
-        * CIP->VIP will remain the same, so leave the tuple in
-        * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
-        * real-server we will see RIP->DIP.
-        */
-       new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-       if (outin)
-               new_tuple.src.u3 = cp->daddr;
-       else
-               new_tuple.dst.u3 = cp->vaddr;
-       /*
-        * This will also take care of UDP and other protocols.
-        */
-       if (outin)
-               new_tuple.src.u.tcp.port = cp->dport;
-       else
-               new_tuple.dst.u.tcp.port = cp->vport;
-       nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
 /*
  *      NAT transmitter (only for outside-to-inside nat forwarding)
  *      Not used for related ICMP
@@ -390,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rtable *rt;              /* Route to the other host */
        int mtu;
        struct iphdr *iph = ip_hdr(skb);
+       int local;
 
        EnterFunction(10);
 
@@ -403,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }
 
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(iph->tos), 1|2|4)))
                goto tx_error_icmp;
+       local = rt->rt_flags & RTCF_LOCAL;
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+                                        "ip_vs_nat_xmit(): "
+                                        "stopping DNAT to local address");
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+               IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+                                "stopping DNAT to loopback address");
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-               goto tx_error;
+               IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+                                "ip_vs_nat_xmit(): frag needed for");
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -422,19 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop old route */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-               goto tx_error;
+               goto tx_error_put;
        ip_hdr(skb)->daddr = cp->daddr.ip;
        ip_send_check(ip_hdr(skb));
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+       if (!local) {
+               /* drop old route */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               ip_rt_put(rt);
+               /*
+                * Some IPv4 replies get local address from routes,
+                * not from iph, so while we DNAT after routing
+                * we need this second input/output route.
+                */
+               if (!__ip_vs_reroute_locally(skb))
+                       goto tx_error;
+       }
 
-       ip_vs_update_conntrack(skb, cp, 1);
+       IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
        /* FIXME: when application helper enlarges the packet and the length
           is larger than the MTU of outgoing device, there will be still
@@ -443,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -451,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
   tx_error_icmp:
        dst_link_failure(skb);
   tx_error:
-       LeaveFunction(10);
        kfree_skb(skb);
+       LeaveFunction(10);
        return NF_STOLEN;
   tx_error_put:
        ip_rt_put(rt);
@@ -466,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
        struct rt6_info *rt;            /* Route to the other host */
        int mtu;
+       int local;
 
        EnterFunction(10);
 
@@ -480,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }
 
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2|4)))
                goto tx_error_icmp;
+       local = __ip_vs_is_local_route6(rt);
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+                                        "ip_vs_nat_xmit_v6(): "
+                                        "stopping DNAT to local address");
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+           ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+               IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+                                "ip_vs_nat_xmit_v6(): "
+                                "stopping DNAT to loopback address");
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+               IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
                                 "ip_vs_nat_xmit_v6(): frag needed for");
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -501,18 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop old route */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
                goto tx_error;
-       ipv6_hdr(skb)->daddr = cp->daddr.in6;
+       ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
 
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+       if (!local || !skb->dev) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               /* destined to loopback, do we need to change route? */
+               dst_release(&rt->dst);
+       }
 
-       ip_vs_update_conntrack(skb, cp, 1);
+       IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
        /* FIXME: when application helper enlarges the packet and the length
           is larger than the MTU of outgoing device, there will be still
@@ -521,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -567,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct iphdr  *old_iph = ip_hdr(skb);
        u8     tos = old_iph->tos;
        __be16 df = old_iph->frag_off;
-       sk_buff_data_t old_transport_header = skb->transport_header;
        struct iphdr  *iph;                     /* Our new IP header */
        unsigned int max_headroom;              /* The extra header space needed */
        int    mtu;
+       int ret;
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IP)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IP: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IP), skb->protocol);
-               goto tx_error;
-       }
-
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(tos), 1|2)))
                goto tx_error_icmp;
+       if (rt->rt_flags & RTCF_LOCAL) {
+               ip_rt_put(rt);
+               IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+       }
 
        tdev = rt->dst.dev;
 
        mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
        if (mtu < 68) {
-               ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -600,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if ((old_iph->frag_off & htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               ip_rt_put(rt);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /*
@@ -625,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                old_iph = ip_hdr(skb);
        }
 
-       skb->transport_header = old_transport_header;
+       skb->transport_header = skb->network_header;
 
        /* fix old IP header checksum */
        ip_send_check(old_iph);
@@ -655,7 +847,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       ip_local_out(skb);
+       ret = IP_VS_XMIT_TUNNEL(skb, cp);
+       if (ret == NF_ACCEPT)
+               ip_local_out(skb);
+       else if (ret == NF_DROP)
+               kfree_skb(skb);
 
        LeaveFunction(10);
 
@@ -667,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
+tx_error_put:
+       ip_rt_put(rt);
+       goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -675,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                     struct ip_vs_protocol *pp)
 {
        struct rt6_info *rt;            /* Route to the other host */
+       struct in6_addr saddr;          /* Source for tunnel */
        struct net_device *tdev;        /* Device to other host */
        struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-       sk_buff_data_t old_transport_header = skb->transport_header;
        struct ipv6hdr  *iph;           /* Our new IP header */
        unsigned int max_headroom;      /* The extra header space needed */
        int    mtu;
+       int ret;
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IPV6)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IPV6: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IPV6), skb->protocol);
-               goto tx_error;
-       }
-
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+                                        &saddr, 1, 1|2)))
                goto tx_error_icmp;
+       if (__ip_vs_is_local_route6(rt)) {
+               dst_release(&rt->dst);
+               IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+       }
 
        tdev = rt->dst.dev;
 
        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-       /* TODO IPv6: do we need this check in IPv6? */
-       if (mtu < 1280) {
-               dst_release(&rt->dst);
-               IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
-               goto tx_error;
+       if (mtu < IPV6_MIN_MTU) {
+               IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+                            IPV6_MIN_MTU);
+               goto tx_error_put;
        }
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
        if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /*
@@ -734,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                old_iph = ipv6_hdr(skb);
        }
 
-       skb->transport_header = old_transport_header;
+       skb->transport_header = skb->network_header;
 
        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
@@ -754,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
        iph->priority           =       old_iph->priority;
        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-       iph->daddr              =       rt->rt6i_dst.addr;
-       iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
+       ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+       ipv6_addr_copy(&iph->saddr, &saddr);
        iph->hop_limit          =       old_iph->hop_limit;
 
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       ip6_local_out(skb);
+       ret = IP_VS_XMIT_TUNNEL(skb, cp);
+       if (ret == NF_ACCEPT)
+               ip6_local_out(skb);
+       else if (ret == NF_DROP)
+               kfree_skb(skb);
 
        LeaveFunction(10);
 
@@ -773,6 +977,9 @@ tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
+tx_error_put:
+       dst_release(&rt->dst);
+       goto tx_error;
 }
 #endif
 
@@ -791,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(iph->tos), 1|2)))
                goto tx_error_icmp;
+       if (rt->rt_flags & RTCF_LOCAL) {
+               ip_rt_put(rt);
+               IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
@@ -820,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -843,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2)))
                goto tx_error_icmp;
+       if (__ip_vs_is_local_route6(rt)) {
+               dst_release(&rt->dst);
+               IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -873,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
        LeaveFunction(10);
        return NF_STOLEN;
@@ -899,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rtable   *rt;    /* Route to the other host */
        int mtu;
        int rc;
+       int local;
 
        EnterFunction(10);
 
@@ -919,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         * mangle and send the packet here (only for VS/NAT)
         */
 
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+                                     RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
                goto tx_error_icmp;
+       local = rt->rt_flags & RTCF_LOCAL;
+
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG(10, "%s(): "
+                                 "stopping DNAT to local address %pI4\n",
+                                 __func__, &cp->daddr.ip);
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+               IP_VS_DBG(1, "%s(): "
+                         "stopping DNAT to loopback %pI4\n",
+                         __func__, &cp->daddr.ip);
+               goto tx_error_put;
+       }
 
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -938,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop the old route when skb is not shared */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        ip_vs_nat_icmp(skb, pp, cp, 0);
 
+       if (!local) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               ip_rt_put(rt);
+               /*
+                * Some IPv4 replies get local address from routes,
+                * not from iph, so while we DNAT after routing
+                * we need this second input/output route.
+                */
+               if (!__ip_vs_reroute_locally(skb))
+                       goto tx_error;
+       }
+
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
        rc = NF_STOLEN;
        goto out;
@@ -973,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct rt6_info *rt;    /* Route to the other host */
        int mtu;
        int rc;
+       int local;
 
        EnterFunction(10);
 
@@ -993,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         * mangle and send the packet here (only for VS/NAT)
         */
 
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
+       if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+                                        0, 1|2|4)))
                goto tx_error_icmp;
 
+       local = __ip_vs_is_local_route6(rt);
+       /*
+        * Avoid duplicate tuple in reply direction for NAT traffic
+        * to local address when connection is sync-ed
+        */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+       if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+               enum ip_conntrack_info ctinfo;
+               struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+               if (ct && !nf_ct_is_untracked(ct)) {
+                       IP_VS_DBG(10, "%s(): "
+                                 "stopping DNAT to local address %pI6\n",
+                                 __func__, &cp->daddr.in6);
+                       goto tx_error_put;
+               }
+       }
+#endif
+
+       /* From world but DNAT to loopback address? */
+       if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+           ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+               IP_VS_DBG(1, "%s(): "
+                         "stopping DNAT to loopback %pI6\n",
+                         __func__, &cp->daddr.in6);
+               goto tx_error_put;
+       }
+
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-               goto tx_error;
+               goto tx_error_put;
        }
 
        /* copy-on-write the packet before mangling it */
@@ -1013,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        if (skb_cow(skb, rt->dst.dev->hard_header_len))
                goto tx_error_put;
 
-       /* drop the old route when skb is not shared */
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
        ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
+       if (!local || !skb->dev) {
+               /* drop the old route when skb is not shared */
+               skb_dst_drop(skb);
+               skb_dst_set(skb, &rt->dst);
+       } else {
+               /* destined to loopback, do we need to change route? */
+               dst_release(&rt->dst);
+       }
+
        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
 
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
        rc = NF_STOLEN;
        goto out;
index df3eedb142ff809b9ce8f9dd49c41782f91aba42..1eacf8d9966aa292f7f051964f822a00c0ab6605 100644 (file)
@@ -65,32 +65,42 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-                                 u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
 {
        unsigned int n;
-       u_int32_t h;
 
        /* The direction must be ignored, so we hash everything up to the
         * destination ports (which is a multiple of 4) and treat the last
         * three bytes manually.
         */
        n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-       h = jhash2((u32 *)tuple, n,
-                  zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
-                                tuple->dst.protonum));
+       return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+                     (((__force __u16)tuple->dst.u.all << 16) |
+                     tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+       return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+       return __hash_bucket(hash, net->ct.htable_size);
+}
 
-       return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+                                 u16 zone, unsigned int size)
+{
+       return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
 }
 
 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
                                       const struct nf_conntrack_tuple *tuple)
 {
-       return __hash_conntrack(tuple, zone, net->ct.htable_size,
-                               nf_conntrack_hash_rnd);
+       return __hash_conntrack(tuple, zone, net->ct.htable_size);
 }
 
 bool
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
  * OR
  * - Caller must lock nf_conntrack_lock before calling this function
  */
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
-                   const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+                     const struct nf_conntrack_tuple *tuple, u32 hash)
 {
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
-       unsigned int hash = hash_conntrack(net, zone, tuple);
+       unsigned int bucket = hash_bucket(hash, net);
 
        /* Disable BHs the entire time since we normally need to disable them
         * at least once for the stats anyway.
         */
        local_bh_disable();
 begin:
-       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
                if (nf_ct_tuple_equal(tuple, &h->tuple) &&
                    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
                        NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
-       if (get_nulls_value(n) != hash) {
+       if (get_nulls_value(n) != bucket) {
                NF_CT_STAT_INC(net, search_restart);
                goto begin;
        }
@@ -327,19 +337,27 @@ begin:
 
        return NULL;
 }
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+                   const struct nf_conntrack_tuple *tuple)
+{
+       return ____nf_conntrack_find(net, zone, tuple,
+                                    hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
-                     const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+                       const struct nf_conntrack_tuple *tuple, u32 hash)
 {
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
 
        rcu_read_lock();
 begin:
-       h = __nf_conntrack_find(net, zone, tuple);
+       h = ____nf_conntrack_find(net, zone, tuple, hash);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
 
        return h;
 }
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+                     const struct nf_conntrack_tuple *tuple)
+{
+       return __nf_conntrack_find_get(net, zone, tuple,
+                                      hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
                return NF_ACCEPT;
 
        zone = nf_ct_zone(ct);
-       hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-       repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+       /* reuse the hash saved before */
+       hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+       hash = hash_bucket(hash, net);
+       repl_hash = hash_conntrack(net, zone,
+                                  &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
        return dropped;
 }
 
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
-                                  const struct nf_conntrack_tuple *orig,
-                                  const struct nf_conntrack_tuple *repl,
-                                  gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+                    const struct nf_conntrack_tuple *orig,
+                    const struct nf_conntrack_tuple *repl,
+                    gfp_t gfp, u32 hash)
 {
        struct nf_conn *ct;
 
-       if (unlikely(!nf_conntrack_hash_rnd_initted)) {
-               get_random_bytes(&nf_conntrack_hash_rnd,
-                               sizeof(nf_conntrack_hash_rnd));
-               nf_conntrack_hash_rnd_initted = 1;
+       if (unlikely(!nf_conntrack_hash_rnd)) {
+               unsigned int rand;
+
+               /*
+                * Why not initialize nf_conntrack_rnd in a "init()" function ?
+                * Because there isn't enough entropy when system initializing,
+                * and we initialize it as late as possible.
+                */
+               do {
+                       get_random_bytes(&rand, sizeof(rand));
+               } while (!rand);
+               cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+               /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+               hash = hash_conntrack_raw(orig, zone);
        }
 
        /* We don't want any race condition at early drop stage */
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 
        if (nf_conntrack_max &&
            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-               unsigned int hash = hash_conntrack(net, zone, orig);
-               if (!early_drop(net, hash)) {
+               if (!early_drop(net, hash_bucket(hash, net))) {
                        atomic_dec(&net->ct.count);
                        if (net_ratelimit())
                                printk(KERN_WARNING
@@ -616,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
-       ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+       /* save hash for reusing when confirming */
+       *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
        /* Don't set timer yet: wait for confirmation */
        setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
        write_pnet(&ct->ct_net, net);
@@ -643,6 +684,14 @@ out_free:
        return ERR_PTR(-ENOMEM);
 #endif
 }
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+                                  const struct nf_conntrack_tuple *orig,
+                                  const struct nf_conntrack_tuple *repl,
+                                  gfp_t gfp)
+{
+       return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 
 void nf_conntrack_free(struct nf_conn *ct)
@@ -664,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
               struct nf_conntrack_l3proto *l3proto,
               struct nf_conntrack_l4proto *l4proto,
               struct sk_buff *skb,
-              unsigned int dataoff)
+              unsigned int dataoff, u32 hash)
 {
        struct nf_conn *ct;
        struct nf_conn_help *help;
@@ -678,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
                return NULL;
        }
 
-       ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+       ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+                                 hash);
        if (IS_ERR(ct)) {
                pr_debug("Can't allocate conntrack.\n");
                return (struct nf_conntrack_tuple_hash *)ct;
@@ -755,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+       u32 hash;
 
        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
                             dataoff, l3num, protonum, &tuple, l3proto,
@@ -764,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
        }
 
        /* look for tuple match */
-       h = nf_conntrack_find_get(net, zone, &tuple);
+       hash = hash_conntrack_raw(&tuple, zone);
+       h = __nf_conntrack_find_get(net, zone, &tuple, hash);
        if (!h) {
                h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
-                                  skb, dataoff);
+                                  skb, dataoff, hash);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
@@ -1307,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        hlist_nulls_del_rcu(&h->hnnode);
                        bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
-                                                 hashsize,
-                                                 nf_conntrack_hash_rnd);
+                                                 hashsize);
                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
                }
        }
index acb29ccaa41fd46624754adb794b5d9389a45c3c..46e8966912b1d9db01fc2997ce0574f0c6094062 100644 (file)
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
 /* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+                               u32 pid, int report)
 {
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);
 
-       NF_CT_ASSERT(master_help);
        NF_CT_ASSERT(!timer_pending(&exp->timeout));
 
        hlist_del_rcu(&exp->hnode);
        net->ct.expect_count--;
 
        hlist_del(&exp->lnode);
-       master_help->expecting[exp->class]--;
+       if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+               master_help->expecting[exp->class]--;
+
+       nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
        nf_ct_expect_put(exp);
 
        NF_CT_STAT_INC(net, expect_delete);
 }
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
 static void nf_ct_expectation_timed_out(unsigned long ul_expect)
 {
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 
        atomic_inc(&exp->use);
 
-       hlist_add_head(&exp->lnode, &master_help->expectations);
-       master_help->expecting[exp->class]++;
+       if (master_help) {
+               hlist_add_head(&exp->lnode, &master_help->expectations);
+               master_help->expecting[exp->class]++;
+       } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+               hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
 
        hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
        net->ct.expect_count++;
 
        setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
                    (unsigned long)exp);
-       p = &master_help->helper->expect_policy[exp->class];
-       exp->timeout.expires = jiffies + p->timeout * HZ;
+       if (master_help) {
+               p = &master_help->helper->expect_policy[exp->class];
+               exp->timeout.expires = jiffies + p->timeout * HZ;
+       }
        add_timer(&exp->timeout);
 
        atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
        unsigned int h;
        int ret = 1;
 
-       if (!master_help->helper) {
+       /* Don't allow expectations created from kernel-space with no helper */
+       if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+           (!master_help || (master_help && !master_help->helper))) {
                ret = -ESHUTDOWN;
                goto out;
        }
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
                }
        }
        /* Will be over limit? */
-       p = &master_help->helper->expect_policy[expect->class];
-       if (p->max_expected &&
-           master_help->expecting[expect->class] >= p->max_expected) {
-               evict_oldest_expect(master, expect);
-               if (master_help->expecting[expect->class] >= p->max_expected) {
-                       ret = -EMFILE;
-                       goto out;
+       if (master_help) {
+               p = &master_help->helper->expect_policy[expect->class];
+               if (p->max_expected &&
+                   master_help->expecting[expect->class] >= p->max_expected) {
+                       evict_oldest_expect(master, expect);
+                       if (master_help->expecting[expect->class]
+                                               >= p->max_expected) {
+                               ret = -EMFILE;
+                               goto out;
+                       }
                }
        }
 
@@ -439,6 +454,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
 
+void nf_ct_remove_userspace_expectations(void)
+{
+       struct nf_conntrack_expect *exp;
+       struct hlist_node *n, *next;
+
+       hlist_for_each_entry_safe(exp, n, next,
+                                 &nf_ct_userspace_expect_list, lnode) {
+               if (del_timer(&exp->timeout)) {
+                       nf_ct_unlink_expect(exp);
+                       nf_ct_expect_put(exp);
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
 #ifdef CONFIG_PROC_FS
 struct ct_expect_iter_state {
        struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
                seq_printf(s, "PERMANENT");
                delim = ",";
        }
-       if (expect->flags & NF_CT_EXPECT_INACTIVE)
+       if (expect->flags & NF_CT_EXPECT_INACTIVE) {
                seq_printf(s, "%sINACTIVE", delim);
+               delim = ",";
+       }
+       if (expect->flags & NF_CT_EXPECT_USERSPACE)
+               seq_printf(s, "%sUSERSPACE", delim);
 
        helper = rcu_dereference(nfct_help(expect->master)->helper);
        if (helper) {
index 5bae1cd15eea93ee3f74cb51dab972c10c96d33c..62bad229106b8db50ccf3b192e24dbfcc6280e07 100644 (file)
@@ -1560,8 +1560,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
                          const struct nf_conntrack_expect *exp)
 {
        struct nf_conn *master = exp->master;
-       struct nf_conntrack_helper *helper;
        long timeout = (exp->timeout.expires - jiffies) / HZ;
+       struct nf_conn_help *help;
 
        if (timeout < 0)
                timeout = 0;
@@ -1577,9 +1577,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 
        NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
        NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
-       helper = rcu_dereference(nfct_help(master)->helper);
-       if (helper)
-               NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+       NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+       help = nfct_help(master);
+       if (help) {
+               struct nf_conntrack_helper *helper;
+
+               helper = rcu_dereference(help->helper);
+               if (helper)
+                       NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+       }
 
        return 0;
 
@@ -1626,17 +1632,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
        struct nlmsghdr *nlh;
        struct nfgenmsg *nfmsg;
        struct sk_buff *skb;
-       unsigned int type;
+       unsigned int type, group;
        int flags = 0;
 
-       if (events & (1 << IPEXP_NEW)) {
+       if (events & (1 << IPEXP_DESTROY)) {
+               type = IPCTNL_MSG_EXP_DELETE;
+               group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+       } else if (events & (1 << IPEXP_NEW)) {
                type = IPCTNL_MSG_EXP_NEW;
                flags = NLM_F_CREATE|NLM_F_EXCL;
+               group = NFNLGRP_CONNTRACK_EXP_NEW;
        } else
                return 0;
 
-       if (!item->report &&
-           !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+       if (!item->report && !nfnetlink_has_listeners(net, group))
                return 0;
 
        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1659,8 +1668,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
        rcu_read_unlock();
 
        nlmsg_end(skb, nlh);
-       nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
-                      item->report, GFP_ATOMIC);
+       nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
        return 0;
 
 nla_put_failure:
@@ -1733,6 +1741,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
        [CTA_EXPECT_TIMEOUT]    = { .type = NLA_U32 },
        [CTA_EXPECT_ID]         = { .type = NLA_U32 },
        [CTA_EXPECT_HELP_NAME]  = { .type = NLA_NUL_STRING },
+       [CTA_EXPECT_ZONE]       = { .type = NLA_U16 },
+       [CTA_EXPECT_FLAGS]      = { .type = NLA_U32 },
 };
 
 static int
@@ -1841,7 +1851,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                }
 
                /* after list removal, usage count == 1 */
-               nf_ct_unexpect_related(exp);
+               spin_lock_bh(&nf_conntrack_lock);
+               if (del_timer(&exp->timeout)) {
+                       nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+                                                  nlmsg_report(nlh));
+                       nf_ct_expect_put(exp);
+               }
+               spin_unlock_bh(&nf_conntrack_lock);
                /* have to put what we 'get' above.
                 * after this line usage count == 0 */
                nf_ct_expect_put(exp);
@@ -1858,7 +1874,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                                m_help = nfct_help(exp->master);
                                if (!strcmp(m_help->helper->name, name) &&
                                    del_timer(&exp->timeout)) {
-                                       nf_ct_unlink_expect(exp);
+                                       nf_ct_unlink_expect_report(exp,
+                                                       NETLINK_CB(skb).pid,
+                                                       nlmsg_report(nlh));
                                        nf_ct_expect_put(exp);
                                }
                        }
@@ -1872,7 +1890,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                                                  &net->ct.expect_hash[i],
                                                  hnode) {
                                if (del_timer(&exp->timeout)) {
-                                       nf_ct_unlink_expect(exp);
+                                       nf_ct_unlink_expect_report(exp,
+                                                       NETLINK_CB(skb).pid,
+                                                       nlmsg_report(nlh));
                                        nf_ct_expect_put(exp);
                                }
                        }
@@ -1918,23 +1938,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
        if (!h)
                return -ENOENT;
        ct = nf_ct_tuplehash_to_ctrack(h);
-       help = nfct_help(ct);
-
-       if (!help || !help->helper) {
-               /* such conntrack hasn't got any helper, abort */
-               err = -EOPNOTSUPP;
-               goto out;
-       }
-
        exp = nf_ct_expect_alloc(ct);
        if (!exp) {
                err = -ENOMEM;
                goto out;
        }
+       help = nfct_help(ct);
+       if (!help) {
+               if (!cda[CTA_EXPECT_TIMEOUT]) {
+                       err = -EINVAL;
+                       goto out;
+               }
+               exp->timeout.expires =
+                 jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+               exp->flags = NF_CT_EXPECT_USERSPACE;
+               if (cda[CTA_EXPECT_FLAGS]) {
+                       exp->flags |=
+                               ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+               }
+       } else {
+               if (cda[CTA_EXPECT_FLAGS]) {
+                       exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+                       exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+               } else
+                       exp->flags = 0;
+       }
 
        exp->class = 0;
        exp->expectfn = NULL;
-       exp->flags = 0;
        exp->master = ct;
        exp->helper = NULL;
        memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2102,6 +2134,7 @@ static void __exit ctnetlink_exit(void)
 {
        pr_info("ctnetlink: unregistering from nfnetlink.\n");
 
+       nf_ct_remove_userspace_expectations();
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
        nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
        nf_conntrack_unregister_notifier(&ctnl_notifier);
index f64de95448669242cf23ac5d8a38987c2f8cd344..bcf47eb518effb4645440a6f64726089984eaeb7 100644 (file)
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
        return len;
 }
 
+/*
+ * Return 1 if @c may appear in a SIP "word" token, 0 otherwise.
+ *
+ * RFC 3261, section 25.1:
+ *   word = 1*(alphanum / "-" / "." / "!" / "%" / "*" / "_" / "+" / "`" /
+ *             "'" / "~" / "(" / ")" / "<" / ">" / ":" / "\" / DQUOTE /
+ *             "/" / "[" / "]" / "?" / "{" / "}")
+ *
+ * The '(' .. '/' span is split in two so that ',' (which is not a word
+ * character) is rejected, and the apostrophe is accepted explicitly.
+ */
+static int iswordc(const char c)
+{
+       if (isalnum(c) || c == '!' || c == '"' || c == '%' || c == '\'' ||
+           (c >= '(' && c <= '+') || (c >= '-' && c <= '/') ||
+           c == ':' || c == '<' || c == '>' || c == '?' ||
+           (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+           c == '{' || c == '}' || c == '~')
+               return 1;
+       return 0;
+}
+
+/* Count the SIP word characters at the start of [dptr, limit). */
+static int word_len(const char *dptr, const char *limit)
+{
+       const char *p;
+
+       for (p = dptr; p < limit; p++) {
+               if (!iswordc(*p))
+                       break;
+       }
+       return (int)(p - dptr);
+}
+
+/*
+ * Length of a Call-ID value of the form word [ "@" word ] that starts at
+ * @dptr and does not extend past @limit.
+ *
+ * Returns the length of the leading word when no '@' follows it, the
+ * combined length of "word@word" when both parts are present, and 0 when
+ * there is no leading word or the '@' is not followed by a word.
+ * @ct and @shift are not used here.
+ */
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+                     const char *limit, int *shift)
+{
+       const int local_len = word_len(dptr, limit);
+       const char *at = dptr + local_len;
+       int host_len;
+
+       /* Nothing parsed, or the buffer ends right after the first word. */
+       if (local_len == 0 || at == limit)
+               return local_len;
+       if (*at != '@')
+               return local_len;
+
+       /* Skip the '@' and require a non-empty second word. */
+       host_len = word_len(at + 1, limit);
+       return host_len ? local_len + 1 + host_len : 0;
+}
+
 /* get media type + port length */
 static int media_len(const struct nf_conn *ct, const char *dptr,
                     const char *limit, int *shift)
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
        const char *end;
        int ret = 0;
 
+       if (!ct)
+               return 0;
+
        memset(addr, 0, sizeof(*addr));
        switch (nf_ct_l3num(ct)) {
        case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
        [SIP_HDR_VIA_TCP]               = SIP_HDR("Via", "v", "TCP ", epaddr_len),
        [SIP_HDR_EXPIRES]               = SIP_HDR("Expires", NULL, NULL, digits_len),
        [SIP_HDR_CONTENT_LENGTH]        = SIP_HDR("Content-Length", "l", NULL, digits_len),
+       [SIP_HDR_CALL_ID]               = SIP_HDR("Call-Id", "i", NULL, callid_len),
 };
 
 static const char *sip_follow_continuation(const char *dptr, const char *limit)
index daab8c4a903ca20103c1c5d6ebe997fdb657f581..4d87befb04c04c793a54360de809e5eb64ee44c2 100644 (file)
 #include <net/udp.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
-                     const __be32 saddr, const __be32 daddr,
-                     const __be16 sport, const __be16 dport,
-                     const struct net_device *in, bool listening_only)
-{
-       struct sock *sk;
-
-       /* look up socket */
-       switch (protocol) {
-       case IPPROTO_TCP:
-               if (listening_only)
-                       sk = __inet_lookup_listener(net, &tcp_hashinfo,
-                                                   daddr, ntohs(dport),
-                                                   in->ifindex);
-               else
-                       sk = __inet_lookup(net, &tcp_hashinfo,
-                                          saddr, sport, daddr, dport,
-                                          in->ifindex);
-               break;
-       case IPPROTO_UDP:
-               sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-                                    in->ifindex);
-               break;
-       default:
-               WARN_ON(1);
-               sk = NULL;
-       }
-
-       pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
-                protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
-
-       return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
 
 static void
 nf_tproxy_destructor(struct sk_buff *skb)
index e34622fa000357c5e24eeddd2aad835d616fd8ce..80463507420edffe34544074be18b891928c7dc5 100644 (file)
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
 void
 xt_unregister_targets(struct xt_target *target, unsigned int n)
 {
-       unsigned int i;
-
-       for (i = 0; i < n; i++)
-               xt_unregister_target(&target[i]);
+       while (n-- > 0)
+               xt_unregister_target(&target[n]);
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
 void
 xt_unregister_matches(struct xt_match *match, unsigned int n)
 {
-       unsigned int i;
-
-       for (i = 0; i < n; i++)
-               xt_unregister_match(&match[i]);
+       while (n-- > 0)
+               xt_unregister_match(&match[n]);
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
index c61294d85fdafbcb4696b704a6754f9f1c9476c1..19c482caf30b7f1587f41cd7b3c26a3d598bc084 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Transparent proxy support for Linux/iptables
  *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
  * Author: Balazs Scheidler, Krisztian Kovacs
  *
  * This program is free software; you can redistribute it and/or modify
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+/*
+ * Choose the IPv4 address a packet is redirected to.
+ *
+ * A non-zero @user_laddr is used as-is.  Otherwise the first address
+ * produced by for_primary_ifa() on skb->dev is used; if the device has no
+ * IPv4 state attached or no such address, @daddr is returned.
+ */
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+       struct in_device *indev;
+       __be32 laddr;
+
+       if (user_laddr)
+               return user_laddr;
+
+       laddr = 0;
+       rcu_read_lock();
+       indev = __in_dev_get_rcu(skb->dev);
+       /* Guard the walk the same way tproxy_laddr6() does: the device may
+        * have no in_device attached. */
+       if (indev) {
+               for_primary_ifa(indev) {
+                       laddr = ifa->ifa_local;
+                       break;
+               } endfor_ifa(indev);
+       }
+       rcu_read_unlock();
+
+       return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb:       The skb being processed.
+ * @laddr:     IPv4 address to redirect to or zero.
+ * @lport:     TCP port to redirect to or zero.
+ * @sk:                The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * Only a pure SYN (syn set; rst, ack and fin all clear) triggers the
+ * listener lookup; any other segment leaves @sk untouched.  When @laddr
+ * or @lport is zero, the packet's own destination address or port is
+ * used for the lookup instead.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+                       struct sock *sk)
+{
+       const struct iphdr *iph = ip_hdr(skb);
+       struct tcphdr _hdr, *hp;
+
+       hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+       if (hp == NULL) {
+               inet_twsk_put(inet_twsk(sk));
+               return NULL;
+       }
+
+       if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+               /* SYN to a TIME_WAIT socket, we'd rather redirect it
+                * to a listener socket if there's one; when one is found
+                * the TIME_WAIT socket is descheduled and released, and
+                * the listener is returned in its place */
+               struct sock *sk2;
+
+               sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+                                           iph->saddr, laddr ? laddr : iph->daddr,
+                                           hp->source, lport ? lport : hp->dest,
+                                           skb->dev, NFT_LOOKUP_LISTENER);
+               if (sk2) {
+                       inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+                       inet_twsk_put(inet_twsk(sk));
+                       sk = sk2;
+               }
+       }
+
+       return sk;
+}
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+          u_int32_t mark_mask, u_int32_t mark_value)
 {
        const struct iphdr *iph = ip_hdr(skb);
-       const struct xt_tproxy_target_info *tgi = par->targinfo;
        struct udphdr _hdr, *hp;
        struct sock *sk;
 
@@ -36,12 +113,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
        if (hp == NULL)
                return NF_DROP;
 
+       /* check if there's an ongoing connection on the packet
+        * addresses, this happens if the redirect already happened
+        * and the current packet belongs to an already established
+        * connection */
        sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-                                  iph->saddr,
-                                  tgi->laddr ? tgi->laddr : iph->daddr,
-                                  hp->source,
-                                  tgi->lport ? tgi->lport : hp->dest,
-                                  par->in, true);
+                                  iph->saddr, iph->daddr,
+                                  hp->source, hp->dest,
+                                  skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+       laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+       if (!lport)
+               lport = hp->dest;
+
+       /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+       if (sk && sk->sk_state == TCP_TIME_WAIT)
+               /* reopening a TIME_WAIT connection needs special handling */
+               sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+       else if (!sk)
+               /* no, there's no established connection, check if
+                * there's a listener on the redirected addr/port */
+               sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+                                          iph->saddr, laddr,
+                                          hp->source, lport,
+                                          skb->dev, NFT_LOOKUP_LISTENER);
+
+       /* NOTE: assign_sock consumes our sk reference */
+       if (sk && nf_tproxy_assign_sock(skb, sk)) {
+               /* This should be in a separate target, but we don't do multiple
+                  targets on the same rule yet */
+               skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+               pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+                        iph->protocol, &iph->daddr, ntohs(hp->dest),
+                        &laddr, ntohs(lport), skb->mark);
+               return NF_ACCEPT;
+       }
+
+       pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+                iph->protocol, &iph->saddr, ntohs(hp->source),
+                &iph->daddr, ntohs(hp->dest), skb->mark);
+       return NF_DROP;
+}
+
+/* Revision 0 entry point: unpack struct xt_tproxy_target_info for tproxy_tg4(). */
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       const struct xt_tproxy_target_info *info = par->targinfo;
+
+       return tproxy_tg4(skb, info->laddr, info->lport,
+                         info->mark_mask, info->mark_value);
+}
+
+/*
+ * Revision 1 entry point: unpack struct xt_tproxy_target_info_v1, taking the
+ * IPv4 member of its address union, and hand over to tproxy_tg4().
+ */
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       const struct xt_tproxy_target_info_v1 *info = par->targinfo;
+
+       return tproxy_tg4(skb, info->laddr.ip, info->lport,
+                         info->mark_mask, info->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+/*
+ * Choose the IPv6 address a packet is redirected to.
+ *
+ * A @user_laddr other than the unspecified address is used as-is.
+ * Otherwise the address list of skb->dev is searched for the first entry
+ * that is neither tentative nor deprecated; if the device has no IPv6
+ * state or no such address, @daddr is returned.
+ */
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+             const struct in6_addr *daddr)
+{
+       const struct in6_addr *chosen = daddr;
+       struct inet6_ifaddr *ifa;
+       struct inet6_dev *idev;
+
+       if (!ipv6_addr_any(user_laddr))
+               return user_laddr;
+
+       rcu_read_lock();
+       idev = __in6_dev_get(skb->dev);
+       if (idev) {
+               list_for_each_entry(ifa, &idev->addr_list, if_list) {
+                       if (!(ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))) {
+                               chosen = &ifa->addr;
+                               break;
+                       }
+               }
+       }
+       rcu_read_unlock();
+
+       return chosen;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:       The skb being processed.
+ * @tproto:    Transport protocol.
+ * @thoff:     Transport protocol header offset.
+ * @par:       Iptables target parameters.
+ * @sk:                The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * Only a pure SYN (syn set; rst, ack and fin all clear) triggers the
+ * listener lookup.  The address looked up comes from tproxy_laddr6()
+ * applied to the address configured in the target info, and the port
+ * falls back to the packet's destination port when the configured port
+ * is zero.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+                        const struct xt_action_param *par,
+                        struct sock *sk)
+{
+       const struct ipv6hdr *iph = ipv6_hdr(skb);
+       struct tcphdr _hdr, *hp;
+       const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+       hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+       if (hp == NULL) {
+               inet_twsk_put(inet_twsk(sk));
+               return NULL;
+       }
+
+       if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+               /* SYN to a TIME_WAIT socket, we'd rather redirect it
+                * to a listener socket if there's one; when one is found
+                * the TIME_WAIT socket is descheduled and released, and
+                * the listener is returned in its place */
+               struct sock *sk2;
+
+               sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+                                           &iph->saddr,
+                                           tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+                                           hp->source,
+                                           tgi->lport ? tgi->lport : hp->dest,
+                                           skb->dev, NFT_LOOKUP_LISTENER);
+               if (sk2) {
+                       inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+                       inet_twsk_put(inet_twsk(sk));
+                       sk = sk2;
+               }
+       }
+
+       return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       const struct ipv6hdr *iph = ipv6_hdr(skb);
+       const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+       struct udphdr _hdr, *hp;
+       struct sock *sk;
+       const struct in6_addr *laddr;
+       __be16 lport;
+       int thoff;
+       int tproto;
+
+       tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+       if (tproto < 0) {
+               pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+               return NF_DROP;
+       }
+
+       hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+       if (hp == NULL) {
+               pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+               return NF_DROP;
+       }
+
+       /* check if there's an ongoing connection on the packet
+        * addresses, this happens if the redirect already happened
+        * and the current packet belongs to an already established
+        * connection */
+       sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+                                  &iph->saddr, &iph->daddr,
+                                  hp->source, hp->dest,
+                                  par->in, NFT_LOOKUP_ESTABLISHED);
+
+       laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+       lport = tgi->lport ? tgi->lport : hp->dest;
+
+       /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+       if (sk && sk->sk_state == TCP_TIME_WAIT)
+               /* reopening a TIME_WAIT connection needs special handling */
+               sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+       else if (!sk)
+               /* no, there's no established connection, check if
+                * there's a listener on the redirected addr/port */
+               sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+                                          &iph->saddr, laddr,
+                                          hp->source, lport,
+                                          par->in, NFT_LOOKUP_LISTENER);
 
        /* NOTE: assign_sock consumes our sk reference */
        if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -49,19 +309,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
                   targets on the same rule yet */
                skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
-               pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-                        iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-                        ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+               pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+                        tproto, &iph->saddr, ntohs(hp->source),
+                        laddr, ntohs(lport), skb->mark);
                return NF_ACCEPT;
        }
 
-       pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-                iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-                ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+       pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+                tproto, &iph->saddr, ntohs(hp->source),
+                &iph->daddr, ntohs(hp->dest), skb->mark);
+
        return NF_DROP;
 }
 
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+/*
+ * Rule sanity check for the IPv6 TPROXY target.
+ *
+ * Accepts the rule (returns 0) only when it matches on TCP or UDP and that
+ * protocol match is not inverted; otherwise logs a hint and returns -EINVAL.
+ */
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+       const struct ip6t_ip6 *i = par->entryinfo;
+
+       /* IP6T_INV_PROTO is an inversion bit and therefore lives in
+        * invflags, not flags; testing flags would let "! -p tcp" through. */
+       if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+           && !(i->invflags & IP6T_INV_PROTO))
+               return 0;
+
+       pr_info("Can be used only in combination with "
+               "either -p tcp or -p udp\n");
+       return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
 {
        const struct ipt_ip *i = par->entryinfo;
 
@@ -74,31 +349,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
        return -EINVAL;
 }
 
-static struct xt_target tproxy_tg_reg __read_mostly = {
-       .name           = "TPROXY",
-       .family         = AF_INET,
-       .table          = "mangle",
-       .target         = tproxy_tg,
-       .targetsize     = sizeof(struct xt_tproxy_target_info),
-       .checkentry     = tproxy_tg_check,
-       .hooks          = 1 << NF_INET_PRE_ROUTING,
-       .me             = THIS_MODULE,
+/*
+ * TPROXY target registrations.  Every entry is limited to the "mangle"
+ * table and the PRE_ROUTING hook; the entries differ in address family,
+ * revision and the targinfo structure they accept.
+ */
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+       {
+               /* IPv4, revision 0: struct xt_tproxy_target_info */
+               .name           = "TPROXY",
+               .family         = NFPROTO_IPV4,
+               .table          = "mangle",
+               .target         = tproxy_tg4_v0,
+               .revision       = 0,
+               .targetsize     = sizeof(struct xt_tproxy_target_info),
+               .checkentry     = tproxy_tg4_check,
+               .hooks          = 1 << NF_INET_PRE_ROUTING,
+               .me             = THIS_MODULE,
+       },
+       {
+               /* IPv4, revision 1: struct xt_tproxy_target_info_v1 */
+               .name           = "TPROXY",
+               .family         = NFPROTO_IPV4,
+               .table          = "mangle",
+               .target         = tproxy_tg4_v1,
+               .revision       = 1,
+               .targetsize     = sizeof(struct xt_tproxy_target_info_v1),
+               .checkentry     = tproxy_tg4_check,
+               .hooks          = 1 << NF_INET_PRE_ROUTING,
+               .me             = THIS_MODULE,
+       },
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       {
+               /* IPv6, revision 1 only: struct xt_tproxy_target_info_v1 */
+               .name           = "TPROXY",
+               .family         = NFPROTO_IPV6,
+               .table          = "mangle",
+               .target         = tproxy_tg6_v1,
+               .revision       = 1,
+               .targetsize     = sizeof(struct xt_tproxy_target_info_v1),
+               .checkentry     = tproxy_tg6_check,
+               .hooks          = 1 << NF_INET_PRE_ROUTING,
+               .me             = THIS_MODULE,
+       },
+#endif
+
 };
 
 static int __init tproxy_tg_init(void)
 {
        nf_defrag_ipv4_enable();
-       return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       nf_defrag_ipv6_enable();
+#endif
+
+       return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 static void __exit tproxy_tg_exit(void)
 {
-       xt_unregister_target(&tproxy_tg_reg);
+       xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 module_init(tproxy_tg_init);
 module_exit(tproxy_tg_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
 MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
 MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
index 7a4d66db95aed4bade12679308045c942c155ba3..9127a3d8aa355d5ff94612bae755d69cb89fa402 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/ip_vs.h>
 #include <linux/types.h>
 #include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_ipvs.h>
 #include <net/netfilter/nf_conntrack.h>
 
index 1ca89908cbad84d2dc45093aa474cca056f239e3..2dbd4c857735abdddde389dd5154bbba0e5f2aef 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/icmp.h>
@@ -21,6 +22,7 @@
 #include <net/inet_sock.h>
 #include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 #include <linux/netfilter/xt_socket.h>
 
@@ -30,7 +32,7 @@
 #endif
 
 static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
                    u8 *protocol,
                    __be32 *raddr,
                    __be32 *laddr,
@@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb,
        return 0;
 }
 
-
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
             const struct xt_socket_mtinfo1 *info)
@@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                dport = hp->dest;
 
        } else if (iph->protocol == IPPROTO_ICMP) {
-               if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+               if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
                                        &sport, &dport))
                        return false;
        } else {
@@ -142,7 +143,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 #endif
 
        sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-                                  saddr, daddr, sport, dport, par->in, false);
+                                  saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
        if (sk != NULL) {
                bool wildcard;
                bool transparent = true;
@@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                        sk = NULL;
        }
 
-       pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
-                protocol, ntohl(saddr), ntohs(sport),
-                ntohl(daddr), ntohs(dport),
-                ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+       pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+                protocol, &saddr, ntohs(sport),
+                &daddr, ntohs(dport),
+                &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
 
        return (sk != NULL);
 }
 
 static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
        return socket_match(skb, par, NULL);
 }
 
 static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
        return socket_match(skb, par, par->matchinfo);
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+/*
+ * Extract the connection identifiers quoted inside an ICMPv6 error message.
+ *
+ * @skb:            packet being matched
+ * @outside_hdrlen: offset of the ICMPv6 header within @skb
+ * @protocol:       out: transport protocol of the quoted packet
+ * @raddr/@laddr:   out: remote and local address of the quoted packet
+ * @rport/@lport:   out: remote and local port of the quoted packet
+ * @ipv6_var:       caller-provided storage for the quoted IPv6 header
+ *
+ * The addresses handed back through @raddr and @laddr point into the quoted
+ * IPv6 header, which skb_header_pointer() may have copied into @ipv6_var.
+ * That storage therefore has to be owned by the caller so the pointers stay
+ * valid after this function returns.
+ *
+ * Returns 0 on success, 1 if the packet is an informational ICMPv6 message,
+ * is truncated, or does not quote a TCP or UDP packet.
+ */
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+                    unsigned int outside_hdrlen,
+                    u8 *protocol,
+                    struct in6_addr **raddr,
+                    struct in6_addr **laddr,
+                    __be16 *rport,
+                    __be16 *lport,
+                    struct ipv6hdr *ipv6_var)
+{
+       struct ipv6hdr *inside_iph;
+       struct icmp6hdr *icmph, _icmph;
+       __be16 *ports, _ports[2];
+       u8 inside_nexthdr;
+       int inside_hdrlen;
+
+       icmph = skb_header_pointer(skb, outside_hdrlen,
+                                  sizeof(_icmph), &_icmph);
+       if (icmph == NULL)
+               return 1;
+
+       if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+               return 1;
+
+       inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+                                       sizeof(*ipv6_var), ipv6_var);
+       if (inside_iph == NULL)
+               return 1;
+       inside_nexthdr = inside_iph->nexthdr;
+
+       inside_hdrlen = ipv6_skip_exthdr(skb,
+                                        outside_hdrlen + sizeof(_icmph) +
+                                        sizeof(*ipv6_var),
+                                        &inside_nexthdr);
+       if (inside_hdrlen < 0)
+               return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+       if (inside_nexthdr != IPPROTO_TCP &&
+           inside_nexthdr != IPPROTO_UDP)
+               return 1;
+
+       ports = skb_header_pointer(skb, inside_hdrlen,
+                                  sizeof(_ports), &_ports);
+       if (ports == NULL)
+               return 1;
+
+       /* the inside IP packet is the one quoted from our side, thus
+        * its saddr is the local address */
+       *protocol = inside_nexthdr;
+       *laddr = &inside_iph->saddr;
+       *lport = ports[0];
+       *raddr = &inside_iph->daddr;
+       *rport = ports[1];
+
+       return 0;
+}
+
+/*
+ * IPv6 "socket" match, revision 1.
+ *
+ * Looks up a socket for the packet's addresses and ports (for ICMPv6 errors,
+ * those of the quoted packet).  Returns true when a socket is found that is
+ * not bound to the unspecified address and, if XT_SOCKET_TRANSPARENT is
+ * requested, is marked transparent.  Returns false otherwise, including when
+ * no transport header can be located.
+ */
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+       /* ipv6_var backs the addresses extract_icmp6_fields() may return */
+       struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
+       struct udphdr _hdr, *hp = NULL;
+       struct sock *sk;
+       struct in6_addr *daddr, *saddr;
+       __be16 dport, sport;
+       int thoff;
+       int ret;
+       u8 tproto;
+       const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+       /* Keep the result in an int: a negative error stored straight into
+        * the u8 could never compare below zero. */
+       ret = ipv6_find_hdr(skb, &thoff, -1, NULL);
+       if (ret < 0) {
+               pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+               return false;
+       }
+       tproto = ret;
+
+       if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+               hp = skb_header_pointer(skb, thoff,
+                                       sizeof(_hdr), &_hdr);
+               if (hp == NULL)
+                       return false;
+
+               saddr = &iph->saddr;
+               sport = hp->source;
+               daddr = &iph->daddr;
+               dport = hp->dest;
+
+       } else if (tproto == IPPROTO_ICMPV6) {
+               if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+                                        &sport, &dport, &ipv6_var))
+                       return false;
+       } else {
+               return false;
+       }
+
+       sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+                                  saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+       if (sk != NULL) {
+               bool wildcard;
+               bool transparent = true;
+
+               /* Ignore sockets listening on the unspecified address */
+               wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+                           ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+               /* Ignore non-transparent sockets,
+                  if XT_SOCKET_TRANSPARENT is used */
+               if (info && info->flags & XT_SOCKET_TRANSPARENT)
+                       transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+                                       inet_sk(sk)->transparent) ||
+                                      (sk->sk_state == TCP_TIME_WAIT &&
+                                       inet_twsk(sk)->tw_transparent));
+
+               nf_tproxy_put_sock(sk);
+
+               if (wildcard || !transparent)
+                       sk = NULL;
+       }
+
+       pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu "
+                "(orig %pI6:%hu) sock %p\n",
+                tproto, saddr, ntohs(sport),
+                daddr, ntohs(dport),
+                &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+       return (sk != NULL);
+}
+#endif
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
        {
                .name           = "socket",
                .revision       = 0,
                .family         = NFPROTO_IPV4,
-               .match          = socket_mt_v0,
+               .match          = socket_mt4_v0,
                .hooks          = (1 << NF_INET_PRE_ROUTING) |
                                  (1 << NF_INET_LOCAL_IN),
                .me             = THIS_MODULE,
@@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
                .name           = "socket",
                .revision       = 1,
                .family         = NFPROTO_IPV4,
-               .match          = socket_mt_v1,
+               .match          = socket_mt4_v1,
                .matchsize      = sizeof(struct xt_socket_mtinfo1),
                .hooks          = (1 << NF_INET_PRE_ROUTING) |
                                  (1 << NF_INET_LOCAL_IN),
                .me             = THIS_MODULE,
        },
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       {
+               .name           = "socket",
+               .revision       = 1,
+               .family         = NFPROTO_IPV6,
+               .match          = socket_mt6_v1,
+               .matchsize      = sizeof(struct xt_socket_mtinfo1),
+               .hooks          = (1 << NF_INET_PRE_ROUTING) |
+                                 (1 << NF_INET_LOCAL_IN),
+               .me             = THIS_MODULE,
+       },
+#endif
 };
 
 static int __init socket_mt_init(void)
 {
        nf_defrag_ipv4_enable();
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       nf_defrag_ipv6_enable();
+#endif
+
        return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
 }
 
@@ -225,3 +367,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
 MODULE_DESCRIPTION("x_tables socket match module");
 MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
index c7e59e6ec34938a3762fead555fc6748ab5d0e09..8daef9632255349f135867672941caa8181c6b54 100644 (file)
@@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = {
        .lock   =       &ipt_lock,
 };
 
-static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
        struct xt_tgchk_param par;
        struct xt_target *target;
@@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
        return 0;
 }
 
-static void ipt_destroy_target(struct ipt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t)
 {
        struct xt_tgdtor_param par = {
                .target   = t->u.kernel.target,
@@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
        [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
        [TCA_IPT_HOOK]  = { .type = NLA_U32 },
        [TCA_IPT_INDEX] = { .type = NLA_U32 },
-       [TCA_IPT_TARG]  = { .len = sizeof(struct ipt_entry_target) },
+       [TCA_IPT_TARG]  = { .len = sizeof(struct xt_entry_target) },
 };
 
 static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
@@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
        struct nlattr *tb[TCA_IPT_MAX + 1];
        struct tcf_ipt *ipt;
        struct tcf_common *pc;
-       struct ipt_entry_target *td, *t;
+       struct xt_entry_target *td, *t;
        char *tname;
        int ret = 0, err;
        u32 hook = 0;
@@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
        if (tb[TCA_IPT_TARG] == NULL)
                return -EINVAL;
 
-       td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+       td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
        if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
                return -EINVAL;
 
@@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
                result = TC_ACT_SHOT;
                ipt->tcf_qstats.drops++;
                break;
-       case IPT_CONTINUE:
+       case XT_CONTINUE:
                result = TC_ACT_PIPE;
                break;
        default:
@@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 {
        unsigned char *b = skb_tail_pointer(skb);
        struct tcf_ipt *ipt = a->priv;
-       struct ipt_entry_target *t;
+       struct xt_entry_target *t;
        struct tcf_t tm;
        struct tc_cnt c;