Merge branch 'for-davem' of git://oss.oracle.com/git/agrover/linux-2.6
author     David S. Miller <davem@davemloft.net>
           Thu, 9 Sep 2010 21:58:11 +0000 (14:58 -0700)
committer  David S. Miller <davem@davemloft.net>
           Thu, 9 Sep 2010 21:58:11 +0000 (14:58 -0700)
42 files changed:
include/linux/Kbuild
include/linux/rds.h
net/rds/af_rds.c
net/rds/bind.c
net/rds/cong.c
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_recv.c
net/rds/ib_send.c
net/rds/ib_stats.c
net/rds/ib_sysctl.c
net/rds/info.c
net/rds/iw.c
net/rds/iw.h
net/rds/iw_cm.c
net/rds/iw_rdma.c
net/rds/iw_recv.c
net/rds/iw_send.c
net/rds/iw_sysctl.c
net/rds/loop.c
net/rds/message.c
net/rds/page.c
net/rds/rdma.c
net/rds/rdma.h [deleted file]
net/rds/rdma_transport.c
net/rds/rds.h
net/rds/recv.c
net/rds/send.c
net/rds/stats.c
net/rds/sysctl.c
net/rds/tcp.c
net/rds/tcp.h
net/rds/tcp_connect.c
net/rds/tcp_listen.c
net/rds/tcp_recv.c
net/rds/tcp_send.c
net/rds/threads.c
net/rds/transport.c
net/rds/xlist.h [new file with mode: 0644]

index 626b629429ff2fc3f30fcf63b25fb52211f184c0..c7fbf298ad6899a6884db700cc47c43cb194619d 100644 (file)
@@ -302,6 +302,7 @@ header-y += quota.h
 header-y += radeonfb.h
 header-y += random.h
 header-y += raw.h
+header-y += rds.h
 header-y += reboot.h
 header-y += reiserfs_fs.h
 header-y += reiserfs_xattr.h
index 7f3971d9fc5cf32ab63ad665201c9fee1d692fdf..91950950aa598060a8e0e370f82654cd9a75e7d6 100644 (file)
 #define RDS_CMSG_RDMA_MAP              3
 #define RDS_CMSG_RDMA_STATUS           4
 #define RDS_CMSG_CONG_UPDATE           5
+#define RDS_CMSG_ATOMIC_FADD           6
+#define RDS_CMSG_ATOMIC_CSWP           7
+#define RDS_CMSG_MASKED_ATOMIC_FADD    8
+#define RDS_CMSG_MASKED_ATOMIC_CSWP    9
 
 #define RDS_INFO_FIRST                 10000
 #define RDS_INFO_COUNTERS              10000
@@ -89,9 +93,9 @@
 #define RDS_INFO_LAST                  10010
 
 struct rds_info_counter {
-       u_int8_t        name[32];
-       u_int64_t       value;
-} __packed;
+       uint8_t name[32];
+       uint64_t        value;
+} __attribute__((packed));
 
 #define RDS_INFO_CONNECTION_FLAG_SENDING       0x01
 #define RDS_INFO_CONNECTION_FLAG_CONNECTING    0x02
@@ -100,56 +104,48 @@ struct rds_info_counter {
 #define TRANSNAMSIZ    16
 
 struct rds_info_connection {
-       u_int64_t       next_tx_seq;
-       u_int64_t       next_rx_seq;
+       uint64_t        next_tx_seq;
+       uint64_t        next_rx_seq;
        __be32          laddr;
        __be32          faddr;
-       u_int8_t        transport[TRANSNAMSIZ];         /* null term ascii */
-       u_int8_t        flags;
-} __packed;
-
-struct rds_info_flow {
-       __be32          laddr;
-       __be32          faddr;
-       u_int32_t       bytes;
-       __be16          lport;
-       __be16          fport;
-} __packed;
+       uint8_t transport[TRANSNAMSIZ];         /* null term ascii */
+       uint8_t flags;
+} __attribute__((packed));
 
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
 
 struct rds_info_message {
-       u_int64_t       seq;
-       u_int32_t       len;
+       uint64_t        seq;
+       uint32_t        len;
        __be32          laddr;
        __be32          faddr;
        __be16          lport;
        __be16          fport;
-       u_int8_t        flags;
-} __packed;
+       uint8_t flags;
+} __attribute__((packed));
 
 struct rds_info_socket {
-       u_int32_t       sndbuf;
+       uint32_t        sndbuf;
        __be32          bound_addr;
        __be32          connected_addr;
        __be16          bound_port;
        __be16          connected_port;
-       u_int32_t       rcvbuf;
-       u_int64_t       inum;
-} __packed;
+       uint32_t        rcvbuf;
+       uint64_t        inum;
+} __attribute__((packed));
 
 struct rds_info_tcp_socket {
        __be32          local_addr;
        __be16          local_port;
        __be32          peer_addr;
        __be16          peer_port;
-       u_int64_t       hdr_rem;
-       u_int64_t       data_rem;
-       u_int32_t       last_sent_nxt;
-       u_int32_t       last_expected_una;
-       u_int32_t       last_seen_una;
-} __packed;
+       uint64_t        hdr_rem;
+       uint64_t        data_rem;
+       uint32_t        last_sent_nxt;
+       uint32_t        last_expected_una;
+       uint32_t        last_seen_una;
+} __attribute__((packed));
 
 #define RDS_IB_GID_LEN 16
 struct rds_info_rdma_connection {
@@ -203,42 +199,69 @@ struct rds_info_rdma_connection {
  * (so that the application does not have to worry about
  * alignment).
  */
-typedef u_int64_t      rds_rdma_cookie_t;
+typedef uint64_t       rds_rdma_cookie_t;
 
 struct rds_iovec {
-       u_int64_t       addr;
-       u_int64_t       bytes;
+       uint64_t        addr;
+       uint64_t        bytes;
 };
 
 struct rds_get_mr_args {
        struct rds_iovec vec;
-       u_int64_t       cookie_addr;
+       uint64_t        cookie_addr;
        uint64_t        flags;
 };
 
 struct rds_get_mr_for_dest_args {
        struct sockaddr_storage dest_addr;
        struct rds_iovec        vec;
-       u_int64_t               cookie_addr;
+       uint64_t                cookie_addr;
        uint64_t                flags;
 };
 
 struct rds_free_mr_args {
        rds_rdma_cookie_t cookie;
-       u_int64_t       flags;
+       uint64_t        flags;
 };
 
 struct rds_rdma_args {
        rds_rdma_cookie_t cookie;
        struct rds_iovec remote_vec;
-       u_int64_t       local_vec_addr;
-       u_int64_t       nr_local;
-       u_int64_t       flags;
-       u_int64_t       user_token;
+       uint64_t        local_vec_addr;
+       uint64_t        nr_local;
+       uint64_t        flags;
+       uint64_t        user_token;
+};
+
+struct rds_atomic_args {
+       rds_rdma_cookie_t cookie;
+       uint64_t        local_addr;
+       uint64_t        remote_addr;
+       union {
+               struct {
+                       uint64_t        compare;
+                       uint64_t        swap;
+               } cswp;
+               struct {
+                       uint64_t        add;
+               } fadd;
+               struct {
+                       uint64_t        compare;
+                       uint64_t        swap;
+                       uint64_t        compare_mask;
+                       uint64_t        swap_mask;
+               } m_cswp;
+               struct {
+                       uint64_t        add;
+                       uint64_t        nocarry_mask;
+               } m_fadd;
+       };
+       uint64_t        flags;
+       uint64_t        user_token;
 };
 
 struct rds_rdma_notify {
-       u_int64_t       user_token;
+       uint64_t        user_token;
        int32_t         status;
 };
 
@@ -257,5 +280,6 @@ struct rds_rdma_notify {
 #define RDS_RDMA_USE_ONCE      0x0008  /* free MR after use */
 #define RDS_RDMA_DONTWAIT      0x0010  /* Don't wait in SET_BARRIER */
 #define RDS_RDMA_NOTIFY_ME     0x0020  /* Notify when operation completes */
+#define RDS_RDMA_SILENT        0x0040  /* Do not interrupt remote */
 
 #endif /* IB_RDS_H */
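
The new control-message types above only define the ABI; a minimal userspace sketch (not part of this commit) of driving a remote fetch-and-add through sendmsg() control data follows. The SOL_RDS socket level, the MR cookie from a prior RDS_GET_MR, and the helper name are illustrative assumptions.

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

static ssize_t rds_remote_fadd(int fd, struct sockaddr_in *dest,
                               rds_rdma_cookie_t cookie,
                               uint64_t local_addr, uint64_t remote_addr,
                               uint64_t add)
{
        struct rds_atomic_args args;
        struct msghdr msg;
        char ctl[CMSG_SPACE(sizeof(args))];
        struct cmsghdr *cmsg;

        memset(&args, 0, sizeof(args));
        args.cookie = cookie;            /* MR from a prior RDS_GET_MR */
        args.local_addr = local_addr;    /* old value is written back here */
        args.remote_addr = remote_addr;
        args.fadd.add = add;
        args.flags = RDS_RDMA_NOTIFY_ME; /* completion via rds_rdma_notify */
        args.user_token = 1;             /* echoed back in the notification */

        memset(&msg, 0, sizeof(msg));
        msg.msg_name = dest;
        msg.msg_namelen = sizeof(*dest);
        msg.msg_control = ctl;
        msg.msg_controllen = sizeof(ctl);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_RDS;
        cmsg->cmsg_type = RDS_CMSG_ATOMIC_FADD;
        cmsg->cmsg_len = CMSG_LEN(sizeof(args));
        memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

        return sendmsg(fd, &msg, 0);
}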
index aebfecbdb8417cfa3bd3196e2c22f18433a7b08b..bb6ad81b671d055b89fbe74287ed328a834aac03 100644 (file)
 #include <net/sock.h>
 
 #include "rds.h"
-#include "rdma.h"
+
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+       if ((index < elements) && array[index])
+               return array[index];
+       else
+               return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
 
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
        struct rds_sock *rs;
        unsigned long flags;
 
-       if (sk == NULL)
+       if (!sk)
                goto out;
 
        rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
         * with the socket. */
        rds_clear_recv_queue(rs);
        rds_cong_remove_socket(rs);
+
+       /*
+        * the binding lookup hash uses RCU, we need to
+        * make sure we synchronize_rcu() before we free our
+        * entry
+        */
        rds_remove_bound(rs);
+       synchronize_rcu();
+
        rds_send_drop_to(rs, NULL);
        rds_rdma_drop_keys(rs);
        rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
        rds_sock_count--;
        spin_unlock_irqrestore(&rds_sock_lock, flags);
 
+       rds_trans_put(rs->rs_transport);
+
        sock->sk = NULL;
        sock_put(sk);
 out:
@@ -514,7 +532,7 @@ out:
        spin_unlock_irqrestore(&rds_sock_lock, flags);
 }
 
-static void __exit rds_exit(void)
+static void rds_exit(void)
 {
        sock_unregister(rds_family_ops.family);
        proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
 }
 module_exit(rds_exit);
 
-static int __init rds_init(void)
+static int rds_init(void)
 {
        int ret;
 
index 5d95fc007f1aa6244d500561ccc0324063cda221..2f6b3fcc79f81b9985ca5774a7fa3e070a105d11 100644 (file)
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/if_arp.h>
+#include <linux/jhash.h>
 #include "rds.h"
 
-/*
- * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
- * particularly zippy.
- *
- * This is now called for every incoming frame so we arguably care much more
- * about it than we used to.
- */
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
 static DEFINE_SPINLOCK(rds_bind_lock);
-static struct rb_root rds_bind_tree = RB_ROOT;
 
-static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
-                                          struct rds_sock *insert)
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+       return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+                                 (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+                                       struct rds_sock *insert)
 {
-       struct rb_node **p = &rds_bind_tree.rb_node;
-       struct rb_node *parent = NULL;
        struct rds_sock *rs;
+       struct hlist_node *node;
+       struct hlist_head *head = hash_to_bucket(addr, port);
        u64 cmp;
        u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
 
-       while (*p) {
-               parent = *p;
-               rs = rb_entry(parent, struct rds_sock, rs_bound_node);
-
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
                cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
                      be16_to_cpu(rs->rs_bound_port);
 
-               if (needle < cmp)
-                       p = &(*p)->rb_left;
-               else if (needle > cmp)
-                       p = &(*p)->rb_right;
-               else
+               if (cmp == needle) {
+                       rcu_read_unlock();
                        return rs;
+               }
        }
+       rcu_read_unlock();
 
        if (insert) {
-               rb_link_node(&insert->rs_bound_node, parent, p);
-               rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+               /*
+                * make sure our addr and port are set before
+                * we are added to the list; other RCU readers
+                * will find us as soon as the
+                * hlist_add_head_rcu is done
+                */
+               insert->rs_bound_addr = addr;
+               insert->rs_bound_port = port;
+               rds_sock_addref(insert);
+
+               hlist_add_head_rcu(&insert->rs_bound_node, head);
        }
        return NULL;
 }
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
 struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 {
        struct rds_sock *rs;
-       unsigned long flags;
 
-       spin_lock_irqsave(&rds_bind_lock, flags);
-       rs = rds_bind_tree_walk(addr, port, NULL);
+       rs = rds_bind_lookup(addr, port, NULL);
+
        if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
                rds_sock_addref(rs);
        else
                rs = NULL;
-       spin_unlock_irqrestore(&rds_bind_lock, flags);
 
        rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
                ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
        do {
                if (rover == 0)
                        rover++;
-               if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
-                       *port = cpu_to_be16(rover);
+               if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+                       *port = rs->rs_bound_port;
                        ret = 0;
+                       rdsdebug("rs %p binding to %pI4:%d\n",
+                         rs, &addr, (int)ntohs(*port));
                        break;
                }
        } while (rover++ != last);
 
-       if (ret == 0)  {
-               rs->rs_bound_addr = addr;
-               rs->rs_bound_port = *port;
-               rds_sock_addref(rs);
-
-               rdsdebug("rs %p binding to %pI4:%d\n",
-                 rs, &addr, (int)ntohs(*port));
-       }
-
        spin_unlock_irqrestore(&rds_bind_lock, flags);
 
        return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
                  rs, &rs->rs_bound_addr,
                  ntohs(rs->rs_bound_port));
 
-               rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+               hlist_del_init_rcu(&rs->rs_bound_node);
                rds_sock_put(rs);
                rs->rs_bound_addr = 0;
        }
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                goto out;
 
        trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
-       if (trans == NULL) {
+       if (!trans) {
                ret = -EADDRNOTAVAIL;
                rds_remove_bound(rs);
                if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 out:
        release_sock(sk);
+
+       /* we might have called rds_remove_bound on error */
+       if (ret)
+               synchronize_rcu();
        return ret;
 }
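
The bind table is now walked without locks, so removal is a two-step retire: unpublish the entry, then let a grace period elapse before the memory may be reused. This is exactly why rds_release() in the af_rds.c hunk above calls synchronize_rcu() after rds_remove_bound(); a hedged sketch of the ordering:

static void unbind_and_retire_sketch(struct rds_sock *rs)
{
        rds_remove_bound(rs);   /* hlist_del_init_rcu() under rds_bind_lock */
        synchronize_rcu();      /* wait out lockless rds_find_bound() walkers */
        /* no RCU reader can still hold a stale pointer to rs now,
         * so the socket may safely be torn down */
}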
index 0871a29f078000ee79370fd20e6d94ca46ba3efd..75ea686f27d5aeedfd220dfbfa514295d6cc8ecc 100644 (file)
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
        unsigned long flags;
 
        map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
-       if (map == NULL)
+       if (!map)
                return NULL;
 
        map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
        ret = rds_cong_tree_walk(addr, map);
        spin_unlock_irqrestore(&rds_cong_lock, flags);
 
-       if (ret == NULL) {
+       if (!ret) {
                ret = map;
                map = NULL;
        }
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
        conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
        conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
 
-       if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+       if (!(conn->c_lcong && conn->c_fcong))
                return -ENOMEM;
 
        return 0;
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
        list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
                if (!test_and_set_bit(0, &conn->c_map_queued)) {
                        rds_stats_inc(s_cong_update_queued);
-                       queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+                       rds_send_xmit(conn);
                }
        }
 
index 7619b671ca2829f0e197a93f6abd06df2f108229..870992e08cae2e86681d569b3ac27d83b6dffb89 100644 (file)
@@ -37,7 +37,6 @@
 
 #include "rds.h"
 #include "loop.h"
-#include "rdma.h"
 
 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
                var |= RDS_INFO_CONNECTION_FLAG_##suffix;       \
 } while (0)
 
-static inline int rds_conn_is_sending(struct rds_connection *conn)
-{
-       int ret = 0;
-
-       if (!mutex_trylock(&conn->c_send_lock))
-               ret = 1;
-       else
-               mutex_unlock(&conn->c_send_lock);
-
-       return ret;
-}
-
+/* the rcu read lock or the connection spinlock must be held */
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
                                              __be32 laddr, __be32 faddr,
                                              struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
        struct rds_connection *conn, *ret = NULL;
        struct hlist_node *pos;
 
-       hlist_for_each_entry(conn, pos, head, c_hash_node) {
+       hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
                if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
                                conn->c_trans == trans) {
                        ret = conn;
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 {
        struct rds_connection *conn, *parent = NULL;
        struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+       struct rds_transport *loop_trans;
        unsigned long flags;
        int ret;
 
-       spin_lock_irqsave(&rds_conn_lock, flags);
+       rcu_read_lock();
        conn = rds_conn_lookup(head, laddr, faddr, trans);
        if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
            !is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
                parent = conn;
                conn = parent->c_passive;
        }
-       spin_unlock_irqrestore(&rds_conn_lock, flags);
+       rcu_read_unlock();
        if (conn)
                goto out;
 
        conn = kmem_cache_zalloc(rds_conn_slab, gfp);
-       if (conn == NULL) {
+       if (!conn) {
                conn = ERR_PTR(-ENOMEM);
                goto out;
        }
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
        spin_lock_init(&conn->c_lock);
        conn->c_next_tx_seq = 1;
 
-       mutex_init(&conn->c_send_lock);
+       init_waitqueue_head(&conn->c_waitq);
        INIT_LIST_HEAD(&conn->c_send_queue);
        INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
         * can bind to the destination address then we'd rather the messages
         * flow through loopback rather than either transport.
         */
-       if (rds_trans_get_preferred(faddr)) {
+       loop_trans = rds_trans_get_preferred(faddr);
+       if (loop_trans) {
+               rds_trans_put(loop_trans);
                conn->c_loopback = 1;
                if (is_outgoing && trans->t_prefer_loopback) {
                        /* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
                        kmem_cache_free(rds_conn_slab, conn);
                        conn = found;
                } else {
-                       hlist_add_head(&conn->c_hash_node, head);
+                       hlist_add_head_rcu(&conn->c_hash_node, head);
                        rds_cong_add_conn(conn);
                        rds_conn_count++;
                }
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+       /* shut it down unless it's down already */
+       if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+               /*
+                * Quiesce the connection mgmt handlers before we start tearing
+                * things down. We don't hold the mutex for the entire
+                * duration of the shutdown operation, else we may be
+                * deadlocking with the CM handler. Instead, the CM event
+                * handler is supposed to check for state DISCONNECTING
+                */
+               mutex_lock(&conn->c_cm_lock);
+               if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+                && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+                       rds_conn_error(conn, "shutdown called in state %d\n",
+                                       atomic_read(&conn->c_state));
+                       mutex_unlock(&conn->c_cm_lock);
+                       return;
+               }
+               mutex_unlock(&conn->c_cm_lock);
+
+               wait_event(conn->c_waitq,
+                          !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+               conn->c_trans->conn_shutdown(conn);
+               rds_conn_reset(conn);
+
+               if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+                       /* This can happen - e.g. when we're in the middle of tearing
+                        * down the connection, and someone unloads the rds module.
+                        * Quite reproducible with loopback connections.
+                        * Mostly harmless.
+                        */
+                       rds_conn_error(conn,
+                               "%s: failed to transition to state DOWN, "
+                               "current state is %d\n",
+                               __func__,
+                               atomic_read(&conn->c_state));
+                       return;
+               }
+       }
+
+       /* Then reconnect if it's still live.
+        * The passive side of an IB loopback connection is never added
+        * to the conn hash, so we never trigger a reconnect on this
+        * conn - the reconnect is always triggered by the active peer. */
+       cancel_delayed_work_sync(&conn->c_conn_w);
+       rcu_read_lock();
+       if (!hlist_unhashed(&conn->c_hash_node)) {
+               rcu_read_unlock();
+               rds_queue_reconnect(conn);
+       } else {
+               rcu_read_unlock();
+       }
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
 void rds_conn_destroy(struct rds_connection *conn)
 {
        struct rds_message *rm, *rtmp;
+       unsigned long flags;
 
        rdsdebug("freeing conn %p for %pI4 -> "
                 "%pI4\n", conn, &conn->c_laddr,
                 &conn->c_faddr);
 
-       hlist_del_init(&conn->c_hash_node);
+       /* Ensure conn will not be scheduled for reconnect */
+       spin_lock_irq(&rds_conn_lock);
+       hlist_del_init_rcu(&conn->c_hash_node);
+       spin_unlock_irq(&rds_conn_lock);
+       synchronize_rcu();
 
-       /* wait for the rds thread to shut it down */
-       atomic_set(&conn->c_state, RDS_CONN_ERROR);
-       cancel_delayed_work(&conn->c_conn_w);
-       queue_work(rds_wq, &conn->c_down_w);
-       flush_workqueue(rds_wq);
+       /* shut the connection down */
+       rds_conn_drop(conn);
+       flush_work(&conn->c_down_w);
+
+       /* make sure lingering queued work won't try to ref the conn */
+       cancel_delayed_work_sync(&conn->c_send_w);
+       cancel_delayed_work_sync(&conn->c_recv_w);
 
        /* tear down queued messages */
        list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
        BUG_ON(!list_empty(&conn->c_retrans));
        kmem_cache_free(rds_conn_slab, conn);
 
+       spin_lock_irqsave(&rds_conn_lock, flags);
        rds_conn_count--;
+       spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
        struct list_head *list;
        struct rds_connection *conn;
        struct rds_message *rm;
-       unsigned long flags;
        unsigned int total = 0;
+       unsigned long flags;
        size_t i;
 
        len /= sizeof(struct rds_info_message);
 
-       spin_lock_irqsave(&rds_conn_lock, flags);
+       rcu_read_lock();
 
        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
-               hlist_for_each_entry(conn, pos, head, c_hash_node) {
+               hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
                        if (want_send)
                                list = &conn->c_send_queue;
                        else
                                list = &conn->c_retrans;
 
-                       spin_lock(&conn->c_lock);
+                       spin_lock_irqsave(&conn->c_lock, flags);
 
                        /* XXX too lazy to maintain counts.. */
                        list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                                          conn->c_faddr, 0);
                        }
 
-                       spin_unlock(&conn->c_lock);
+                       spin_unlock_irqrestore(&conn->c_lock, flags);
                }
        }
-
-       spin_unlock_irqrestore(&rds_conn_lock, flags);
+       rcu_read_unlock();
 
        lens->nr = total;
        lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
        uint64_t buffer[(item_len + 7) / 8];
        struct hlist_head *head;
        struct hlist_node *pos;
-       struct hlist_node *tmp;
        struct rds_connection *conn;
-       unsigned long flags;
        size_t i;
 
-       spin_lock_irqsave(&rds_conn_lock, flags);
+       rcu_read_lock();
 
        lens->nr = 0;
        lens->each = item_len;
 
        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
-               hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+               hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 
                        /* XXX no c_lock usage.. */
                        if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                        lens->nr++;
                }
        }
-
-       spin_unlock_irqrestore(&rds_conn_lock, flags);
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
                sizeof(cinfo->transport));
        cinfo->flags = 0;
 
-       rds_conn_info_set(cinfo->flags,
-                         rds_conn_is_sending(conn), SENDING);
+       rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+                         SENDING);
        /* XXX Future: return the state rather than these funky bits */
        rds_conn_info_set(cinfo->flags,
                          atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_connection));
 }
 
-int __init rds_conn_init(void)
+int rds_conn_init(void)
 {
        rds_conn_slab = kmem_cache_create("rds_connection",
                                          sizeof(struct rds_connection),
                                          0, 0, NULL);
-       if (rds_conn_slab == NULL)
+       if (!rds_conn_slab)
                return -ENOMEM;
 
        rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -486,6 +545,18 @@ void rds_conn_drop(struct rds_connection *conn)
 }
 EXPORT_SYMBOL_GPL(rds_conn_drop);
 
+/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+       if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+           !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
 /*
  * An error occurred on the connection
  */
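
rds_conn_shutdown() above blocks on c_waitq until the RDS_IN_XMIT bit clears. The other half of that handshake lives in the send path (net/rds/send.c, not quoted on this page); a hedged sketch of what the pairing is expected to look like, with illustrative helper names:

static int acquire_in_xmit_sketch(struct rds_connection *conn)
{
        return !test_and_set_bit(RDS_IN_XMIT, &conn->c_flags);
}

static void release_in_xmit_sketch(struct rds_connection *conn)
{
        clear_bit(RDS_IN_XMIT, &conn->c_flags);
        smp_mb__after_clear_bit();      /* order the clear vs. the wakeup */
        if (waitqueue_active(&conn->c_waitq))
                wake_up_all(&conn->c_waitq);
}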
index 8f2d6dd7700a8a0e20192b521c30dc7bc2df553b..b12a3951167dc2f264c465ee4923a98ef98510be 100644 (file)
@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
+/*
+ * we have a clumsy combination of RCU and an rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;
 
 /* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
+void rds_ib_nodev_connect(void)
+{
+       struct rds_ib_connection *ic;
+
+       spin_lock(&ib_nodev_conns_lock);
+       list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+               rds_conn_connect_if_down(ic->conn);
+       spin_unlock(&ib_nodev_conns_lock);
+}
+
+void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+       struct rds_ib_connection *ic;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+       list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+               rds_conn_drop(ic->conn);
+       spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and MRs drop references
+ * from interrupt context so we push freeing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+       struct rds_ib_ipaddr *i_ipaddr, *i_next;
+       struct rds_ib_device *rds_ibdev = container_of(work,
+                                       struct rds_ib_device, free_work);
+
+       if (rds_ibdev->mr_pool)
+               rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+       if (rds_ibdev->mr)
+               ib_dereg_mr(rds_ibdev->mr);
+       if (rds_ibdev->pd)
+               ib_dealloc_pd(rds_ibdev->pd);
+
+       list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+               list_del(&i_ipaddr->list);
+               kfree(i_ipaddr);
+       }
+
+       kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+       BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+       if (atomic_dec_and_test(&rds_ibdev->refcount))
+               queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
 void rds_ib_add_one(struct ib_device *device)
 {
        struct rds_ib_device *rds_ibdev;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
                goto free_attr;
        }
 
-       rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+       rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+                                ibdev_to_node(device));
        if (!rds_ibdev)
                goto free_attr;
 
        spin_lock_init(&rds_ibdev->spinlock);
+       atomic_set(&rds_ibdev->refcount, 1);
+       INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
        rds_ibdev->max_wrs = dev_attr->max_qp_wr;
        rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
                        min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
                        fmr_pool_size;
 
+       rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+       rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
        rds_ibdev->dev = device;
        rds_ibdev->pd = ib_alloc_pd(device);
-       if (IS_ERR(rds_ibdev->pd))
-               goto free_dev;
+       if (IS_ERR(rds_ibdev->pd)) {
+               rds_ibdev->pd = NULL;
+               goto put_dev;
+       }
 
-       rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-                                     IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(rds_ibdev->mr))
-               goto err_pd;
+       rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+       if (IS_ERR(rds_ibdev->mr)) {
+               rds_ibdev->mr = NULL;
+               goto put_dev;
+       }
 
        rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
        if (IS_ERR(rds_ibdev->mr_pool)) {
                rds_ibdev->mr_pool = NULL;
-               goto err_mr;
+               goto put_dev;
        }
 
        INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
        INIT_LIST_HEAD(&rds_ibdev->conn_list);
-       list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+       down_write(&rds_ib_devices_lock);
+       list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+       up_write(&rds_ib_devices_lock);
+       atomic_inc(&rds_ibdev->refcount);
 
        ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+       atomic_inc(&rds_ibdev->refcount);
 
-       goto free_attr;
+       rds_ib_nodev_connect();
 
-err_mr:
-       ib_dereg_mr(rds_ibdev->mr);
-err_pd:
-       ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
-       kfree(rds_ibdev);
+put_dev:
+       rds_ib_dev_put(rds_ibdev);
 free_attr:
        kfree(dev_attr);
 }
 
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if it's arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+       struct rds_ib_device *rds_ibdev;
+
+       rcu_read_lock();
+       rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+       if (rds_ibdev)
+               atomic_inc(&rds_ibdev->refcount);
+       rcu_read_unlock();
+       return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
 void rds_ib_remove_one(struct ib_device *device)
 {
        struct rds_ib_device *rds_ibdev;
-       struct rds_ib_ipaddr *i_ipaddr, *i_next;
 
        rds_ibdev = ib_get_client_data(device, &rds_ib_client);
        if (!rds_ibdev)
                return;
 
-       list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-               list_del(&i_ipaddr->list);
-               kfree(i_ipaddr);
-       }
+       rds_ib_dev_shutdown(rds_ibdev);
 
-       rds_ib_destroy_conns(rds_ibdev);
+       /* stop connection attempts from getting a reference to this device. */
+       ib_set_client_data(device, &rds_ib_client, NULL);
 
-       if (rds_ibdev->mr_pool)
-               rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-
-       ib_dereg_mr(rds_ibdev->mr);
-
-       while (ib_dealloc_pd(rds_ibdev->pd)) {
-               rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
-               msleep(1);
-       }
+       down_write(&rds_ib_devices_lock);
+       list_del_rcu(&rds_ibdev->list);
+       up_write(&rds_ib_devices_lock);
 
-       list_del(&rds_ibdev->list);
-       kfree(rds_ibdev);
+       /*
+        * This synchronize_rcu() is waiting for readers of both the ib
+        * client data and the devices list to finish before we drop
+        * both of those references.
+        */
+       synchronize_rcu();
+       rds_ib_dev_put(rds_ibdev);
+       rds_ib_dev_put(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
                rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
                rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
-               rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+               rds_ibdev = ic->rds_ibdev;
                iinfo->max_send_wr = ic->i_send_ring.w_nr;
                iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
                iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
        return ret;
 }
 
+static void rds_ib_unregister_client(void)
+{
+       ib_unregister_client(&rds_ib_client);
+       /* wait for rds_ib_dev_free() to complete */
+       flush_workqueue(rds_wq);
+}
+
 void rds_ib_exit(void)
 {
        rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+       rds_ib_unregister_client();
        rds_ib_destroy_nodev_conns();
-       ib_unregister_client(&rds_ib_client);
        rds_ib_sysctl_exit();
        rds_ib_recv_exit();
        rds_trans_unregister(&rds_ib_transport);
+       rds_ib_fmr_exit();
 }
 
 struct rds_transport rds_ib_transport = {
        .laddr_check            = rds_ib_laddr_check,
        .xmit_complete          = rds_ib_xmit_complete,
        .xmit                   = rds_ib_xmit,
-       .xmit_cong_map          = NULL,
        .xmit_rdma              = rds_ib_xmit_rdma,
+       .xmit_atomic            = rds_ib_xmit_atomic,
        .recv                   = rds_ib_recv,
        .conn_alloc             = rds_ib_conn_alloc,
        .conn_free              = rds_ib_conn_free,
        .conn_connect           = rds_ib_conn_connect,
        .conn_shutdown          = rds_ib_conn_shutdown,
        .inc_copy_to_user       = rds_ib_inc_copy_to_user,
-       .inc_purge              = rds_ib_inc_purge,
        .inc_free               = rds_ib_inc_free,
        .cm_initiate_connect    = rds_ib_cm_initiate_connect,
        .cm_handle_connect      = rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
        .t_type                 = RDS_TRANS_IB
 };
 
-int __init rds_ib_init(void)
+int rds_ib_init(void)
 {
        int ret;
 
        INIT_LIST_HEAD(&rds_ib_devices);
 
-       ret = ib_register_client(&rds_ib_client);
+       ret = rds_ib_fmr_init();
        if (ret)
                goto out;
 
+       ret = ib_register_client(&rds_ib_client);
+       if (ret)
+               goto out_fmr_exit;
+
        ret = rds_ib_sysctl_init();
        if (ret)
                goto out_ibreg;
@@ -317,7 +429,9 @@ out_recv:
 out_sysctl:
        rds_ib_sysctl_exit();
 out_ibreg:
-       ib_unregister_client(&rds_ib_client);
+       rds_ib_unregister_client();
+out_fmr_exit:
+       rds_ib_fmr_exit();
 out:
        return ret;
 }
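
struct rds_ib_device is now reference counted: rds_ib_get_client_data() hands out references under RCU, and the last rds_ib_dev_put() queues the deferred free. Callers pair the two; the rds_ib_setup_qp() hunk below follows this pattern on its exit path. A hedged usage sketch:

static int with_ibdev_sketch(struct ib_device *device)
{
        struct rds_ib_device *rds_ibdev = rds_ib_get_client_data(device);

        if (!rds_ibdev)
                return -EOPNOTSUPP;     /* device is being removed */

        /* ... use rds_ibdev safely; the reference pins it ... */

        rds_ib_dev_put(rds_ibdev);      /* may queue rds_ib_dev_free() */
        return 0;
}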
index 64df4e79b29f27ffcb8e8242df4b344284734e74..7ad3d57e06a556563d89513bc86bb170b6cfa815 100644 (file)
@@ -3,11 +3,13 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
 #include "rds.h"
 #include "rdma_transport.h"
 
 #define RDS_FMR_SIZE                   256
-#define RDS_FMR_POOL_SIZE              4096
+#define RDS_FMR_POOL_SIZE              8192
 
 #define RDS_IB_MAX_SGE                 8
 #define RDS_IB_RECV_SGE                2
@@ -19,6 +21,9 @@
 
 #define RDS_IB_SUPPORTED_PROTOCOLS     0x00000003      /* minor versions supported */
 
+#define RDS_IB_RECYCLE_BATCH_COUNT     32
+
+extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
 
 /*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
  * try and minimize the amount of memory tied up both the device and
  * socket receive queues.
  */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
 struct rds_page_frag {
        struct list_head        f_item;
-       struct page             *f_page;
-       unsigned long           f_offset;
-       dma_addr_t              f_mapped;
+       struct list_head        f_cache_entry;
+       struct scatterlist      f_sg;
 };
 
 struct rds_ib_incoming {
        struct list_head        ii_frags;
+       struct list_head        ii_cache_entry;
        struct rds_incoming     ii_inc;
 };
 
+struct rds_ib_cache_head {
+       struct list_head *first;
+       unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+       struct rds_ib_cache_head *percpu;
+       struct list_head         *xfer;
+       struct list_head         *ready;
+};
+
 struct rds_ib_connect_private {
        /* Add new fields at the end, and don't permute existing fields. */
        __be32                  dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
 };
 
 struct rds_ib_send_work {
-       struct rds_message      *s_rm;
-       struct rds_rdma_op      *s_op;
+       void                    *s_op;
        struct ib_send_wr       s_wr;
        struct ib_sge           s_sge[RDS_IB_MAX_SGE];
        unsigned long           s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
 
        /* tx */
        struct rds_ib_work_ring i_send_ring;
-       struct rds_message      *i_rm;
+       struct rm_data_op       *i_data_op;
        struct rds_header       *i_send_hdrs;
        u64                     i_send_hdrs_dma;
        struct rds_ib_send_work *i_sends;
+       atomic_t                i_signaled_sends;
 
        /* rx */
        struct tasklet_struct   i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
        struct rds_header       *i_recv_hdrs;
        u64                     i_recv_hdrs_dma;
        struct rds_ib_recv_work *i_recvs;
-       struct rds_page_frag    i_frag;
        u64                     i_ack_recv;     /* last ACK received */
+       struct rds_ib_refill_cache i_cache_incs;
+       struct rds_ib_refill_cache i_cache_frags;
 
        /* sending acks */
        unsigned long           i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
 
        /* Batched completions */
        unsigned int            i_unsignaled_wrs;
-       long                    i_unsignaled_bytes;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
        unsigned int            max_fmrs;
        int                     max_sge;
        unsigned int            max_wrs;
+       unsigned int            max_initiator_depth;
+       unsigned int            max_responder_resources;
        spinlock_t              spinlock;       /* protect the above */
+       atomic_t                refcount;
+       struct work_struct      free_work;
 };
 
+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT       0
 #define IB_ACK_REQUESTED       1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
        uint64_t        s_ib_rdma_mr_pool_flush;
        uint64_t        s_ib_rdma_mr_pool_wait;
        uint64_t        s_ib_rdma_mr_pool_depleted;
+       uint64_t        s_ib_atomic_cswp;
+       uint64_t        s_ib_atomic_fadd;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 extern struct rds_transport rds_ib_transport;
 extern void rds_ib_add_one(struct ib_device *device);
 extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
 extern unsigned int fmr_pool_size;
@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
-       __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
-       __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-                      gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
                             size_t size);
@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;
 
 /* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
                             u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
                                    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern ctl_table rds_ib_sysctl_table[];
 
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-       if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-               return &sge[0];
-       else
-               return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-       if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-               return &sge[1];
-       else
-               return &sge[0];
-}
-
 #endif
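
rds_ib_refill_cache above only declares the per-cpu recycling structure; the logic lives in net/rds/ib_recv.c, which this page does not quote. A hedged sketch of how such a cache might be initialized:

static int refill_cache_init_sketch(struct rds_ib_refill_cache *cache)
{
        int cpu;

        cache->percpu = alloc_percpu(struct rds_ib_cache_head);
        if (!cache->percpu)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct rds_ib_cache_head *head =
                        per_cpu_ptr(cache->percpu, cpu);
                head->first = NULL;
                head->count = 0;
        }
        cache->xfer = NULL;
        cache->ready = NULL;
        return 0;
}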
index f68832798db224d6abffcc08f4ad494fa2b3bc17..bc3dbc1ba61f38cb68a1e5cef1a466254bd52414 100644 (file)
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+               [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+       RDS_IB_EVENT_STRING(CQ_ERR),
+       RDS_IB_EVENT_STRING(QP_FATAL),
+       RDS_IB_EVENT_STRING(QP_REQ_ERR),
+       RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+       RDS_IB_EVENT_STRING(COMM_EST),
+       RDS_IB_EVENT_STRING(SQ_DRAINED),
+       RDS_IB_EVENT_STRING(PATH_MIG),
+       RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+       RDS_IB_EVENT_STRING(DEVICE_FATAL),
+       RDS_IB_EVENT_STRING(PORT_ACTIVE),
+       RDS_IB_EVENT_STRING(PORT_ERR),
+       RDS_IB_EVENT_STRING(LID_CHANGE),
+       RDS_IB_EVENT_STRING(PKEY_CHANGE),
+       RDS_IB_EVENT_STRING(SM_CHANGE),
+       RDS_IB_EVENT_STRING(SRQ_ERR),
+       RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+       RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+       RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+       return rds_str_array(rds_ib_event_type_strings,
+                            ARRAY_SIZE(rds_ib_event_type_strings), type);
+}
+
 /*
  * Set the selected protocol version
  */
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
        const struct rds_ib_connect_private *dp = NULL;
        struct rds_ib_connection *ic = conn->c_transport_data;
-       struct rds_ib_device *rds_ibdev;
        struct ib_qp_attr qp_attr;
        int err;
 
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                }
        }
 
-       printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-                       &conn->c_faddr,
-                       RDS_PROTOCOL_MAJOR(conn->c_version),
-                       RDS_PROTOCOL_MINOR(conn->c_version),
-                       ic->i_flowctl ? ", flow control" : "");
+       if (conn->c_version < RDS_PROTOCOL(3,1)) {
+               printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+                      " no longer supported\n",
+                      &conn->c_faddr,
+                      RDS_PROTOCOL_MAJOR(conn->c_version),
+                      RDS_PROTOCOL_MINOR(conn->c_version));
+               rds_conn_destroy(conn);
+               return;
+       } else {
+               printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+                      &conn->c_faddr,
+                      RDS_PROTOCOL_MAJOR(conn->c_version),
+                      RDS_PROTOCOL_MINOR(conn->c_version),
+                      ic->i_flowctl ? ", flow control" : "");
+       }
 
        /*
         * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        rds_ib_recv_init_ring(ic);
        /* Post receive buffers - as a side effect, this will update
         * the posted credit count. */
-       rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+       rds_ib_recv_refill(conn, 1);
 
        /* Tune RNR behavior */
        rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        if (err)
                printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-       /* update ib_device with this local ipaddr & conn */
-       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-       err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+       /* update ib_device with this local ipaddr */
+       err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
        if (err)
-               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-       rds_ib_add_conn(rds_ibdev, conn);
+               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+                       err);
 
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                        struct rdma_conn_param *conn_param,
                        struct rds_ib_connect_private *dp,
-                       u32 protocol_version)
+                       u32 protocol_version,
+                       u32 max_responder_resources,
+                       u32 max_initiator_depth)
 {
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
        memset(conn_param, 0, sizeof(struct rdma_conn_param));
-       /* XXX tune these? */
-       conn_param->responder_resources = 1;
-       conn_param->initiator_depth = 1;
+
+       conn_param->responder_resources =
+               min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+       conn_param->initiator_depth =
+               min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
        conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
        conn_param->rnr_retry_count = 7;
 
        if (dp) {
-               struct rds_ib_connection *ic = conn->c_transport_data;
-
                memset(dp, 0, sizeof(*dp));
                dp->dp_saddr = conn->c_laddr;
                dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-       rdsdebug("event %u data %p\n", event->event, data);
+       rdsdebug("event %u (%s) data %p\n",
+                event->event, rds_ib_event_str(event->event), data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
        struct rds_connection *conn = data;
        struct rds_ib_connection *ic = conn->c_transport_data;
 
-       rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+       rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+                rds_ib_event_str(event->event));
 
        switch (event->event) {
        case IB_EVENT_COMM_EST:
                rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
                break;
        default:
-               rdsdebug("Fatal QP Event %u "
+               rdsdebug("Fatal QP Event %u (%s) "
                        "- connection %pI4->%pI4, reconnecting\n",
-                       event->event, &conn->c_laddr, &conn->c_faddr);
+                       event->event, rds_ib_event_str(event->event),
+                       &conn->c_laddr, &conn->c_faddr);
                rds_conn_drop(conn);
                break;
        }
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
        struct rds_ib_device *rds_ibdev;
        int ret;
 
-       /* rds_ib_add_one creates a rds_ib_device object per IB device,
-        * and allocates a protection domain, memory range and FMR pool
-        * for each.  If that fails for any reason, it will not register
-        * the rds_ibdev at all.
+       /*
+        * It's normal to see a null device if an incoming connection races
+        * with device removal, so we don't print a warning.
         */
-       rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-       if (rds_ibdev == NULL) {
-               if (printk_ratelimit())
-                       printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-                                       dev->name);
+       rds_ibdev = rds_ib_get_client_data(dev);
+       if (!rds_ibdev)
                return -EOPNOTSUPP;
-       }
+
+       /* add the conn now so that connection establishment has the dev */
+       rds_ib_add_conn(rds_ibdev, conn);
 
        if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
                rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_send_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
-       if (ic->i_send_hdrs == NULL) {
+       if (!ic->i_send_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent send failed\n");
                goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_recv_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_recv_hdrs_dma, GFP_KERNEL);
-       if (ic->i_recv_hdrs == NULL) {
+       if (!ic->i_recv_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent recv failed\n");
                goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
        ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
                                       &ic->i_ack_dma, GFP_KERNEL);
-       if (ic->i_ack == NULL) {
+       if (!ic->i_ack) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
-       ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-       if (ic->i_sends == NULL) {
+       ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+                                  ibdev_to_node(dev));
+       if (!ic->i_sends) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto out;
        }
        memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
-       ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-       if (ic->i_recvs == NULL) {
+       ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+                                  ibdev_to_node(dev));
+       if (!ic->i_recvs) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                 ic->i_send_cq, ic->i_recv_cq);
 
 out:
+       rds_ib_dev_put(rds_ibdev);
        return ret;
 }
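
rds_ib_get_client_data() and rds_ib_dev_put() are the new struct rds_ib_device refcounting helpers, defined in net/rds/ib.c outside this diff. A plausible sketch of the put side, assuming (as the atomic-context callers here suggest) that final teardown is deferred to a work item:

	/* Sketch only: free_work and its handler are assumptions. */
	void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
	{
		BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
		if (atomic_dec_and_test(&rds_ibdev->refcount))
			queue_work(rds_wq, &rds_ibdev->free_work);
	}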
 
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        struct rds_ib_connection *ic = NULL;
        struct rdma_conn_param conn_param;
        u32 version;
-       int err, destroy = 1;
+       int err = 1, destroy = 1;
 
        /* Check whether the remote protocol version matches ours. */
        version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                        /* Wait and see - our connect may still be succeeding */
                        rds_ib_stats_inc(s_ib_connect_raced);
                }
-               mutex_unlock(&conn->c_cm_lock);
                goto out;
        }
 
@@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+               event->param.conn.responder_resources,
+               event->param.conn.initiator_depth);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        err = rdma_accept(cm_id, &conn_param);
-       mutex_unlock(&conn->c_cm_lock);
-       if (err) {
+       if (err)
                rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-               goto out;
-       }
-
-       return 0;
 
 out:
-       rdma_reject(cm_id, NULL, 0);
+       if (conn)
+               mutex_unlock(&conn->c_cm_lock);
+       if (err)
+               rdma_reject(cm_id, NULL, 0);
        return destroy;
 }
 
@@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+               UINT_MAX, UINT_MAX);
        ret = rdma_connect(cm_id, &conn_param);
        if (ret)
                rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
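
Taken together with the rds_ib_cm_handle_connect() hunk above, the negotiation is now symmetric: the active side advertises no preference (UINT_MAX), so only its local device limits apply, while the passive side additionally clamps against what the peer offered in the connect request. The per-device limits themselves would be cached at device-add time from the HCA attributes; a sketch, assuming dev_attr is the struct ib_device_attr queried in rds_ib_add_one():

	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;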
@@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
                                ic->i_cm_id, err);
                }
 
+               /*
+                * We want to wait for tx and rx completion to finish
+                * before we tear down the connection, but we have to be
+                * careful not to get stuck waiting on a send ring that
+                * only has unsignaled sends in it.  We've shutdown new
+                * sends before getting here so by waiting for signaled
+                * sends to complete we're ensured that there will be no
+                * more tx processing.
+                */
                wait_event(rds_ib_ring_empty_wait,
-                       rds_ib_ring_empty(&ic->i_send_ring) &&
-                       rds_ib_ring_empty(&ic->i_recv_ring));
+                          rds_ib_ring_empty(&ic->i_recv_ring) &&
+                          (atomic_read(&ic->i_signaled_sends) == 0));
+               tasklet_kill(&ic->i_recv_tasklet);
 
                if (ic->i_send_hdrs)
                        ib_dma_free_coherent(dev,
@@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
        BUG_ON(ic->rds_ibdev);
 
        /* Clear pending transmit */
-       if (ic->i_rm) {
-               rds_message_put(ic->i_rm);
-               ic->i_rm = NULL;
+       if (ic->i_data_op) {
+               struct rds_message *rm;
+
+               rm = container_of(ic->i_data_op, struct rds_message, data);
+               rds_message_put(rm);
+               ic->i_data_op = NULL;
        }
 
        /* Clear the ACK state */
@@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
        struct rds_ib_connection *ic;
        unsigned long flags;
+       int ret;
 
        /* XXX too lazy? */
        ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-       if (ic == NULL)
+       if (!ic)
                return -ENOMEM;
 
+       ret = rds_ib_recv_alloc_caches(ic);
+       if (ret) {
+               kfree(ic);
+               return ret;
+       }
+
        INIT_LIST_HEAD(&ic->ib_node);
        tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
                     (unsigned long) ic);
@@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
        spin_lock_init(&ic->i_ack_lock);
 #endif
+       atomic_set(&ic->i_signaled_sends, 0);
 
        /*
         * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg)
        list_del(&ic->ib_node);
        spin_unlock_irq(lock_ptr);
 
+       rds_ib_recv_free_caches(ic);
+
        kfree(ic);
 }
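
The i_signaled_sends counter that rds_ib_conn_shutdown() waits on above is presumably incremented when a signaled send WR is posted and decremented from the send completion path; a sketch of that completion-side pairing, under those assumptions:

	/* Sketch: every signaled WR that completes undoes one increment. */
	static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
	{
		if (atomic_sub_and_test(nr, &ic->i_signaled_sends) &&
		    waitqueue_active(&rds_ib_ring_empty_wait))
			wake_up(&rds_ib_ring_empty_wait);
		BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
	}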
 
index a54cd63f9e35bd0f3e33a80d9ac06801fb3e0122..8f6e221c9f7836c1db5e459752e7ff577273835d 100644 (file)
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
+#include "xlist.h"
 
+struct workqueue_struct *rds_ib_fmr_wq;
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
 
 /*
  * This is stored as mr->r_trans_private.
@@ -45,7 +50,11 @@ struct rds_ib_mr {
        struct rds_ib_device    *device;
        struct rds_ib_mr_pool   *pool;
        struct ib_fmr           *fmr;
-       struct list_head        list;
+
+       struct xlist_head       xlist;
+
+       /* unmap_list is for freeing */
+       struct list_head        unmap_list;
        unsigned int            remap_count;
 
        struct scatterlist      *sg;
@@ -59,14 +68,16 @@ struct rds_ib_mr {
  */
 struct rds_ib_mr_pool {
        struct mutex            flush_lock;             /* serialize fmr invalidate */
-       struct work_struct      flush_worker;           /* flush worker */
+       struct delayed_work     flush_worker;           /* flush worker */
 
-       spinlock_t              list_lock;              /* protect variables below */
        atomic_t                item_count;             /* total # of MRs */
        atomic_t                dirty_count;            /* # dirty of MRs */
-       struct list_head        drop_list;              /* MRs that have reached their max_maps limit */
-       struct list_head        free_list;              /* unused MRs */
-       struct list_head        clean_list;             /* unused & unamapped MRs */
+
+       struct xlist_head       drop_list;              /* MRs that have reached their max_maps limit */
+       struct xlist_head       free_list;              /* unused MRs */
+       struct xlist_head       clean_list;             /* global unused & unmapped MRs */
+       wait_queue_head_t       flush_wait;
+
        atomic_t                free_pinned;            /* memory pinned by free MRs */
        unsigned long           max_items;
        unsigned long           max_items_soft;
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
        struct ib_fmr_attr      fmr_attr;
 };
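
The xlist_head members above come from the new net/rds/xlist.h added by this merge but not shown in this diff. The idea is a cmpxchg-based singly linked stack, so producers and consumers need no spinlock; a rough sketch of the push primitive, with the signature taken from the callers below:

	struct xlist_head {
		struct xlist_head *next;
	};

	/* Sketch: push the chain new..tail onto head, retrying if
	 * another CPU changed head->next underneath us. */
	static inline void xlist_add(struct xlist_head *new,
				     struct xlist_head *tail,
				     struct xlist_head *head)
	{
		struct xlist_head *cur, *check;

		do {
			cur = head->next;
			tail->next = cur;
			check = cmpxchg(&head->next, cur, new);
		} while (check != cur);
	}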
 
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
 static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_ipaddr *i_ipaddr;
 
-       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-               spin_lock_irq(&rds_ibdev->spinlock);
-               list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+               list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                        if (i_ipaddr->ipaddr == ipaddr) {
-                               spin_unlock_irq(&rds_ibdev->spinlock);
+                               atomic_inc(&rds_ibdev->refcount);
+                               rcu_read_unlock();
                                return rds_ibdev;
                        }
                }
-               spin_unlock_irq(&rds_ibdev->spinlock);
        }
+       rcu_read_unlock();
 
        return NULL;
 }
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
        i_ipaddr->ipaddr = ipaddr;
 
        spin_lock_irq(&rds_ibdev->spinlock);
-       list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+       list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
        spin_unlock_irq(&rds_ibdev->spinlock);
 
        return 0;
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
-       struct rds_ib_ipaddr *i_ipaddr, *next;
+       struct rds_ib_ipaddr *i_ipaddr;
+       struct rds_ib_ipaddr *to_free = NULL;
 
        spin_lock_irq(&rds_ibdev->spinlock);
-       list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+       list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                if (i_ipaddr->ipaddr == ipaddr) {
-                       list_del(&i_ipaddr->list);
-                       kfree(i_ipaddr);
+                       list_del_rcu(&i_ipaddr->list);
+                       to_free = i_ipaddr;
                        break;
                }
        }
        spin_unlock_irq(&rds_ibdev->spinlock);
+
+       if (to_free) {
+               synchronize_rcu();
+               kfree(to_free);
+       }
 }
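
This is the classic RCU list pattern: lookups walk the list under rcu_read_lock() and take a device reference before returning, while removal unlinks with list_del_rcu() and waits out a grace period before freeing. Every caller of rds_ib_get_device() therefore owns a reference, as rds_ib_update_ipaddr() just below shows:

	/* Usage sketch: a returned device must be dropped with rds_ib_dev_put(). */
	rds_ibdev = rds_ib_get_device(ipaddr);
	if (rds_ibdev) {
		/* ... use rds_ibdev ... */
		rds_ib_dev_put(rds_ibdev);
	}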
 
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
        struct rds_ib_device *rds_ibdev_old;
 
        rds_ibdev_old = rds_ib_get_device(ipaddr);
-       if (rds_ibdev_old)
+       if (rds_ibdev_old) {
                rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+               rds_ib_dev_put(rds_ibdev_old);
+       }
 
        return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
@@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
        spin_unlock_irq(&ib_nodev_conns_lock);
 
        ic->rds_ibdev = rds_ibdev;
+       atomic_inc(&rds_ibdev->refcount);
 }
 
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
        spin_unlock(&ib_nodev_conns_lock);
 
        ic->rds_ibdev = NULL;
+       rds_ib_dev_put(rds_ibdev);
 }
 
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+void rds_ib_destroy_nodev_conns(void)
 {
        struct rds_ib_connection *ic, *_ic;
        LIST_HEAD(tmp_list);
 
        /* avoid calling conn_destroy with irqs off */
-       spin_lock_irq(list_lock);
-       list_splice(list, &tmp_list);
-       INIT_LIST_HEAD(list);
-       spin_unlock_irq(list_lock);
+       spin_lock_irq(&ib_nodev_conns_lock);
+       list_splice(&ib_nodev_conns, &tmp_list);
+       spin_unlock_irq(&ib_nodev_conns_lock);
 
        list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
                rds_conn_destroy(ic->conn);
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
        if (!pool)
                return ERR_PTR(-ENOMEM);
 
-       INIT_LIST_HEAD(&pool->free_list);
-       INIT_LIST_HEAD(&pool->drop_list);
-       INIT_LIST_HEAD(&pool->clean_list);
+       INIT_XLIST_HEAD(&pool->free_list);
+       INIT_XLIST_HEAD(&pool->drop_list);
+       INIT_XLIST_HEAD(&pool->clean_list);
        mutex_init(&pool->flush_lock);
-       spin_lock_init(&pool->list_lock);
-       INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+       init_waitqueue_head(&pool->flush_wait);
+       INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
        pool->fmr_attr.max_pages = fmr_message_size;
        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-       flush_workqueue(rds_wq);
-       rds_ib_flush_mr_pool(pool, 1);
+       cancel_delayed_work_sync(&pool->flush_worker);
+       rds_ib_flush_mr_pool(pool, 1, NULL);
        WARN_ON(atomic_read(&pool->item_count));
        WARN_ON(atomic_read(&pool->free_pinned));
        kfree(pool);
 }
 
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+                        struct rds_ib_mr **ibmr_ret)
+{
+       struct xlist_head *ibmr_xl;
+       ibmr_xl = xlist_del_head_fast(xl);
+       *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
+}
+
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
        struct rds_ib_mr *ibmr = NULL;
-       unsigned long flags;
+       struct xlist_head *ret;
+       unsigned long *flag;
 
-       spin_lock_irqsave(&pool->list_lock, flags);
-       if (!list_empty(&pool->clean_list)) {
-               ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
-               list_del_init(&ibmr->list);
-       }
-       spin_unlock_irqrestore(&pool->list_lock, flags);
+       preempt_disable();
+       flag = &__get_cpu_var(clean_list_grace);
+       set_bit(CLEAN_LIST_BUSY_BIT, flag);
+       ret = xlist_del_head(&pool->clean_list);
+       if (ret)
+               ibmr = list_entry(ret, struct rds_ib_mr, xlist);
 
+       clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+       preempt_enable();
        return ibmr;
 }
 
+static inline void wait_clean_list_grace(void)
+{
+       int cpu;
+       unsigned long *flag;
+
+       for_each_online_cpu(cpu) {
+               flag = &per_cpu(clean_list_grace, cpu);
+               while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+                       cpu_relax();
+       }
+}
+
 static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 {
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
        struct rds_ib_mr *ibmr = NULL;
        int err = 0, iter = 0;
 
+       if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+               queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
        while (1) {
                ibmr = rds_ib_reuse_fmr(pool);
                if (ibmr)
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 
                /* We do have some empty MRs. Flush them out. */
                rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
-               rds_ib_flush_mr_pool(pool, 0);
+               rds_ib_flush_mr_pool(pool, 0, &ibmr);
+               if (ibmr)
+                       return ibmr;
        }
 
-       ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+       ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
        if (!ibmr) {
                err = -ENOMEM;
                goto out_no_cigar;
        }
 
        ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
                        (IB_ACCESS_LOCAL_WRITE |
                         IB_ACCESS_REMOTE_READ |
-                        IB_ACCESS_REMOTE_WRITE),
+                        IB_ACCESS_REMOTE_WRITE |
+                        IB_ACCESS_REMOTE_ATOMIC),
                        &pool->fmr_attr);
        if (IS_ERR(ibmr->fmr)) {
                err = PTR_ERR(ibmr->fmr);
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
        if (page_cnt > fmr_message_size)
                return -EINVAL;
 
-       dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+       dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+                                rdsibdev_to_node(rds_ibdev));
        if (!dma_pages)
                return -ENOMEM;
 
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
                        /* FIXME we need a way to tell a r/w MR
                         * from a r/o MR */
-                       BUG_ON(in_interrupt());
+                       BUG_ON(irqs_disabled());
                        set_page_dirty(page);
                        put_page(page);
                }
@@ -476,34 +530,110 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
        return 0;
 }
 
+/*
+ * given an xlist of mrs, put them all into the list_head for more processing
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+       struct rds_ib_mr *ibmr;
+       struct xlist_head splice;
+       struct xlist_head *cur;
+       struct xlist_head *next;
+
+       splice.next = NULL;
+       xlist_splice(xlist, &splice);
+       cur = splice.next;
+       while (cur) {
+               next = cur->next;
+               ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+               list_add_tail(&ibmr->unmap_list, list);
+               cur = next;
+       }
+}
+
+/*
+ * this takes a list head of mrs and chains them all into a single xlist
+ * that is ready to be spliced onto the clean list for reuse.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+                               struct list_head *list, struct xlist_head *xlist,
+                               struct xlist_head **tail_ret)
+{
+       struct rds_ib_mr *ibmr;
+       struct xlist_head *cur_mr = xlist;
+       struct xlist_head *tail_mr = NULL;
+
+       list_for_each_entry(ibmr, list, unmap_list) {
+               tail_mr = &ibmr->xlist;
+               tail_mr->next = NULL;
+               cur_mr->next = tail_mr;
+               cur_mr = tail_mr;
+       }
+       *tail_ret = tail_mr;
+}
+
 /*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+                               int free_all, struct rds_ib_mr **ibmr_ret)
 {
        struct rds_ib_mr *ibmr, *next;
+       struct xlist_head clean_xlist;
+       struct xlist_head *clean_tail;
        LIST_HEAD(unmap_list);
        LIST_HEAD(fmr_list);
        unsigned long unpinned = 0;
-       unsigned long flags;
        unsigned int nfreed = 0, ncleaned = 0, free_goal;
        int ret = 0;
 
        rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
 
-       mutex_lock(&pool->flush_lock);
+       if (ibmr_ret) {
+               DEFINE_WAIT(wait);
+               while (!mutex_trylock(&pool->flush_lock)) {
+                       ibmr = rds_ib_reuse_fmr(pool);
+                       if (ibmr) {
+                               *ibmr_ret = ibmr;
+                               finish_wait(&pool->flush_wait, &wait);
+                               goto out_nolock;
+                       }
+
+                       prepare_to_wait(&pool->flush_wait, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       if (xlist_empty(&pool->clean_list))
+                               schedule();
+
+                       ibmr = rds_ib_reuse_fmr(pool);
+                       if (ibmr) {
+                               *ibmr_ret = ibmr;
+                               finish_wait(&pool->flush_wait, &wait);
+                               goto out_nolock;
+                       }
+               }
+               finish_wait(&pool->flush_wait, &wait);
+       } else
+               mutex_lock(&pool->flush_lock);
+
+       if (ibmr_ret) {
+               ibmr = rds_ib_reuse_fmr(pool);
+               if (ibmr) {
+                       *ibmr_ret = ibmr;
+                       goto out;
+               }
+       }
 
-       spin_lock_irqsave(&pool->list_lock, flags);
        /* Get the list of all MRs to be dropped. Ordering matters -
-        * we want to put drop_list ahead of free_list. */
-       list_splice_init(&pool->free_list, &unmap_list);
-       list_splice_init(&pool->drop_list, &unmap_list);
+        * we want to put drop_list ahead of free_list.
+        */
+       xlist_append_to_list(&pool->drop_list, &unmap_list);
+       xlist_append_to_list(&pool->free_list, &unmap_list);
        if (free_all)
-               list_splice_init(&pool->clean_list, &unmap_list);
-       spin_unlock_irqrestore(&pool->list_lock, flags);
+               xlist_append_to_list(&pool->clean_list, &unmap_list);
 
        free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
                goto out;
 
        /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-       list_for_each_entry(ibmr, &unmap_list, list)
+       list_for_each_entry(ibmr, &unmap_list, unmap_list)
                list_add(&ibmr->fmr->list, &fmr_list);
+
        ret = ib_unmap_fmr(&fmr_list);
        if (ret)
                printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
 
        /* Now we can destroy the DMA mapping and unpin any pages */
-       list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+       list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
                unpinned += ibmr->sg_len;
                __rds_ib_teardown_mr(ibmr);
                if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
                        rds_ib_stats_inc(s_ib_rdma_mr_free);
-                       list_del(&ibmr->list);
+                       list_del(&ibmr->unmap_list);
                        ib_dealloc_fmr(ibmr->fmr);
                        kfree(ibmr);
                        nfreed++;
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
                ncleaned++;
        }
 
-       spin_lock_irqsave(&pool->list_lock, flags);
-       list_splice(&unmap_list, &pool->clean_list);
-       spin_unlock_irqrestore(&pool->list_lock, flags);
+       if (!list_empty(&unmap_list)) {
+               /* we have to make sure that none of the things we're about
+                * to put on the clean list would race with other cpus trying
+                * to pull items off.  The xlist would explode if we managed to
+                * remove something from the clean list and then add it back again
+                * while another CPU was spinning on that same item in xlist_del_head.
+                *
+        * This is pretty unlikely, but just in case, wait for an xlist grace period
+                * here before adding anything back into the clean list.
+                */
+               wait_clean_list_grace();
+
+               list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+               if (ibmr_ret)
+                       refill_local(pool, &clean_xlist, ibmr_ret);
+
+               /* refill_local may have emptied our list */
+               if (!xlist_empty(&clean_xlist))
+                       xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+       }
 
        atomic_sub(unpinned, &pool->free_pinned);
        atomic_sub(ncleaned, &pool->dirty_count);
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 
 out:
        mutex_unlock(&pool->flush_lock);
+       if (waitqueue_active(&pool->flush_wait))
+               wake_up(&pool->flush_wait);
+out_nolock:
        return ret;
 }
 
+int rds_ib_fmr_init(void)
+{
+       rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+       if (!rds_ib_fmr_wq)
+               return -ENOMEM;
+       return 0;
+}
+
+/*
+ * By the time this is called all the IB devices should have been torn down and
+ * had their pools freed.  As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_fmr_exit(void)
+{
+       destroy_workqueue(rds_ib_fmr_wq);
+}
+
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
-       struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+       struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
 
-       rds_ib_flush_mr_pool(pool, 0);
+       rds_ib_flush_mr_pool(pool, 0, NULL);
 }
 
 void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
-       unsigned long flags;
 
        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
        /* Return it to the pool's free list */
-       spin_lock_irqsave(&pool->list_lock, flags);
        if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-               list_add(&ibmr->list, &pool->drop_list);
+               xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
        else
-               list_add(&ibmr->list, &pool->free_list);
+               xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
 
        atomic_add(ibmr->sg_len, &pool->free_pinned);
        atomic_inc(&pool->dirty_count);
-       spin_unlock_irqrestore(&pool->list_lock, flags);
 
        /* If we've pinned too many pages, request a flush */
        if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
            atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-               queue_work(rds_wq, &pool->flush_worker);
+               queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
        if (invalidate) {
                if (likely(!in_interrupt())) {
-                       rds_ib_flush_mr_pool(pool, 0);
+                       rds_ib_flush_mr_pool(pool, 0, NULL);
                } else {
                        /* We get here if the user created a MR marked
                         * as use_once and invalidate at the same time. */
-                       queue_work(rds_wq, &pool->flush_worker);
+                       queue_delayed_work(rds_ib_fmr_wq,
+                                          &pool->flush_worker, 10);
                }
        }
+
+       rds_ib_dev_put(rds_ibdev);
 }
 
 void rds_ib_flush_mrs(void)
 {
        struct rds_ib_device *rds_ibdev;
 
+       down_read(&rds_ib_devices_lock);
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
                if (pool)
-                       rds_ib_flush_mr_pool(pool, 0);
+                       rds_ib_flush_mr_pool(pool, 0, NULL);
        }
+       up_read(&rds_ib_devices_lock);
 }
 
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
        ibmr->device = rds_ibdev;
+       rds_ibdev = NULL;
 
  out:
        if (ret) {
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                        rds_ib_free_mr(ibmr, 0);
                ibmr = ERR_PTR(ret);
        }
+       if (rds_ibdev)
+               rds_ib_dev_put(rds_ibdev);
        return ibmr;
 }
+
index c74e9904a6b2c20872917c79bf06a905c1ecd403..e29e0ca32f740d978aeccf23a4fba5453a4e8aa4 100644 (file)
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t        rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-       rdsdebug("frag %p page %p\n", frag, frag->f_page);
-       __free_page(frag->f_page);
-       frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-       rdsdebug("frag %p page %p\n", frag, frag->f_page);
-       BUG_ON(frag->f_page != NULL);
-       kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-                                  struct rds_ib_recv_work *recv)
-{
-       struct rds_page_frag *frag = recv->r_frag;
-
-       rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-       if (frag->f_mapped)
-               ib_dma_unmap_page(ic->i_cm_id->device,
-                              frag->f_mapped,
-                              RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-       frag->f_mapped = 0;
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
        struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
                recv->r_wr.sg_list = recv->r_sge;
                recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-               sge = rds_ib_data_sge(ic, recv->r_sge);
+               sge = &recv->r_sge[0];
+               sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+               sge->length = sizeof(struct rds_header);
+               sge->lkey = ic->i_mr->lkey;
+
+               sge = &recv->r_sge[1];
                sge->addr = 0;
                sge->length = RDS_FRAG_SIZE;
                sge->lkey = ic->i_mr->lkey;
+       }
+}
 
-               sge = rds_ib_header_sge(ic, recv->r_sge);
-               sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-               sge->length = sizeof(struct rds_header);
-               sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+                                   struct list_head *to)
+{
+       struct list_head *from_last = from->prev;
+
+       list_splice_tail(from_last, to);
+       list_add_tail(from_last, to);
+}
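+/*
+ * Worked example: suppose the chain is A <-> B <-> C (circular), with A
+ * passed in as 'from'.  Then from->prev == C, so
+ *
+ *	list_splice_tail(C, to);	moves A and B onto 'to'
+ *	list_add_tail(C, to);		then moves C itself
+ *
+ * and all three nodes, including the anchor, end up on 'to'.  This is
+ * needed because these cache chains are anchored by a list_head embedded
+ * in their first element rather than by a standalone head.
+ */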
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+       struct list_head *tmp;
+
+       tmp = xchg(&cache->xfer, NULL);
+       if (tmp) {
+               if (cache->ready)
+                       list_splice_entire_tail(tmp, cache->ready);
+               else
+                       cache->ready = tmp;
+       }
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+       struct rds_ib_cache_head *head;
+       int cpu;
+
+       cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+       if (!cache->percpu)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu) {
+               head = per_cpu_ptr(cache->percpu, cpu);
+               head->first = NULL;
+               head->count = 0;
+       }
+       cache->xfer = NULL;
+       cache->ready = NULL;
+
+       return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+       int ret;
+
+       ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+       if (!ret) {
+               ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+               if (ret)
+                       free_percpu(ic->i_cache_incs.percpu);
        }
+
+       return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+                                         struct list_head *caller_list)
+{
+       struct rds_ib_cache_head *head;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               head = per_cpu_ptr(cache->percpu, cpu);
+               if (head->first) {
+                       list_splice_entire_tail(head->first, caller_list);
+                       head->first = NULL;
+               }
+       }
+
+       if (cache->ready) {
+               list_splice_entire_tail(cache->ready, caller_list);
+               cache->ready = NULL;
+       }
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+       struct rds_ib_incoming *inc;
+       struct rds_ib_incoming *inc_tmp;
+       struct rds_page_frag *frag;
+       struct rds_page_frag *frag_tmp;
+       LIST_HEAD(list);
+
+       rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+       rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+       free_percpu(ic->i_cache_incs.percpu);
+
+       list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+               list_del(&inc->ii_cache_entry);
+               WARN_ON(!list_empty(&inc->ii_frags));
+               kmem_cache_free(rds_ib_incoming_slab, inc);
+       }
+
+       rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+       rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+       free_percpu(ic->i_cache_frags.percpu);
+
+       list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+               list_del(&frag->f_cache_entry);
+               WARN_ON(!list_empty(&frag->f_item));
+               kmem_cache_free(rds_ib_frag_slab, frag);
+       }
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+                                 struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+                            struct rds_page_frag *frag)
+{
+       rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+       rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+       struct rds_ib_incoming *ibinc;
+       struct rds_page_frag *frag;
+       struct rds_page_frag *pos;
+       struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+       /* Free attached frags */
+       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+               list_del_init(&frag->f_item);
+               rds_ib_frag_free(ic, frag);
+       }
+       BUG_ON(!list_empty(&ibinc->ii_frags));
+
+       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+       rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
-               rds_ib_recv_unmap_page(ic, recv);
-               if (recv->r_frag->f_page)
-                       rds_ib_frag_drop_page(recv->r_frag);
-               rds_ib_frag_free(recv->r_frag);
+               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+               rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
        }
 }
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
        for (i = 0; i < ic->i_recv_ring.w_nr; i++)
                rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-       if (ic->i_frag.f_page)
-               rds_ib_frag_drop_page(&ic->i_frag);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-                                 struct rds_ib_recv_work *recv,
-                                 gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+                                                    gfp_t slab_mask)
 {
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       dma_addr_t dma_addr;
-       struct ib_sge *sge;
-       int ret = -ENOMEM;
+       struct rds_ib_incoming *ibinc;
+       struct list_head *cache_item;
+       int avail_allocs;
 
-       if (recv->r_ibinc == NULL) {
-               if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+       cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+       if (cache_item) {
+               ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+       } else {
+               avail_allocs = atomic_add_unless(&rds_ib_allocation,
+                                                1, rds_ib_sysctl_max_recv_allocation);
+               if (!avail_allocs) {
                        rds_ib_stats_inc(s_ib_rx_alloc_limit);
-                       goto out;
+                       return NULL;
                }
-               recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-                                                kptr_gfp);
-               if (recv->r_ibinc == NULL) {
+               ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+               if (!ibinc) {
                        atomic_dec(&rds_ib_allocation);
-                       goto out;
+                       return NULL;
                }
-               INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-               rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
        }
+       INIT_LIST_HEAD(&ibinc->ii_frags);
+       rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-       if (recv->r_frag == NULL) {
-               recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-               if (recv->r_frag == NULL)
-                       goto out;
-               INIT_LIST_HEAD(&recv->r_frag->f_item);
-               recv->r_frag->f_page = NULL;
+       return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+                                                   gfp_t slab_mask, gfp_t page_mask)
+{
+       struct rds_page_frag *frag;
+       struct list_head *cache_item;
+       int ret;
+
+       cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+       if (cache_item) {
+               frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+       } else {
+               frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+               if (!frag)
+                       return NULL;
+
+               sg_init_table(&frag->f_sg, 1);
+               ret = rds_page_remainder_alloc(&frag->f_sg,
+                                              RDS_FRAG_SIZE, page_mask);
+               if (ret) {
+                       kmem_cache_free(rds_ib_frag_slab, frag);
+                       return NULL;
+               }
        }
 
-       if (ic->i_frag.f_page == NULL) {
-               ic->i_frag.f_page = alloc_page(page_gfp);
-               if (ic->i_frag.f_page == NULL)
-                       goto out;
-               ic->i_frag.f_offset = 0;
+       INIT_LIST_HEAD(&frag->f_item);
+
+       return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+                                 struct rds_ib_recv_work *recv, int prefill)
+{
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       struct ib_sge *sge;
+       int ret = -ENOMEM;
+       gfp_t slab_mask = GFP_NOWAIT;
+       gfp_t page_mask = GFP_NOWAIT;
+
+       if (prefill) {
+               slab_mask = GFP_KERNEL;
+               page_mask = GFP_HIGHUSER;
        }
 
-       dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-                                 ic->i_frag.f_page,
-                                 ic->i_frag.f_offset,
-                                 RDS_FRAG_SIZE,
-                                 DMA_FROM_DEVICE);
-       if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-               goto out;
+       if (!ic->i_cache_incs.ready)
+               rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+       if (!ic->i_cache_frags.ready)
+               rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
 
        /*
-        * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-        * must be called on this recv.  This happens as completions hit
-        * in order or on connection shutdown.
+        * ibinc was taken from recv if recv contained the start of a message.
+        * recvs that were continuations will still have this allocated.
         */
-       recv->r_frag->f_page = ic->i_frag.f_page;
-       recv->r_frag->f_offset = ic->i_frag.f_offset;
-       recv->r_frag->f_mapped = dma_addr;
+       if (!recv->r_ibinc) {
+               recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+               if (!recv->r_ibinc)
+                       goto out;
+       }
 
-       sge = rds_ib_data_sge(ic, recv->r_sge);
-       sge->addr = dma_addr;
-       sge->length = RDS_FRAG_SIZE;
+       WARN_ON(recv->r_frag); /* leak! */
+       recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+       if (!recv->r_frag)
+               goto out;
+
+       ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+                           1, DMA_FROM_DEVICE);
+       WARN_ON(ret != 1);
 
-       sge = rds_ib_header_sge(ic, recv->r_sge);
+       sge = &recv->r_sge[0];
        sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
        sge->length = sizeof(struct rds_header);
 
-       get_page(recv->r_frag->f_page);
-
-       if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-               ic->i_frag.f_offset += RDS_FRAG_SIZE;
-       } else {
-               put_page(ic->i_frag.f_page);
-               ic->i_frag.f_page = NULL;
-               ic->i_frag.f_offset = 0;
-       }
+       sge = &recv->r_sge[1];
+       sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+       sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
        ret = 0;
 out:
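
With the gfp_t parameters gone, the calling context is now encoded entirely in 'prefill'. Roughly (a usage sketch; the prefill call presumably sits on the connection-establishment path in this series):

	/* Softirq (recv tasklet): opportunistic top-up, must not sleep. */
	rds_ib_recv_refill(conn, 0);

	/* Prefill: may sleep, fully populates the ring. */
	rds_ib_recv_refill(conn, 1);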
@@ -216,13 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-                      gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
                if (pos >= ic->i_recv_ring.w_nr) {
                        printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
                                        pos);
-                       ret = -EINVAL;
                        break;
                }
 
                recv = &ic->i_recvs[pos];
-               ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+               ret = rds_ib_recv_refill_one(conn, recv, prefill);
                if (ret) {
-                       ret = -1;
                        break;
                }
 
                /* XXX when can this fail? */
                ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
                rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-                        recv->r_ibinc, recv->r_frag->f_page,
-                        (long) recv->r_frag->f_mapped, ret);
+                        recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+                        (long) sg_dma_address(&recv->r_frag->f_sg), ret);
                if (ret) {
                        rds_ib_conn_error(conn, "recv post on "
                               "%pI4 returned %d, disconnecting and "
                               "reconnecting\n", &conn->c_faddr,
                               ret);
-                       ret = -1;
                        break;
                }
 
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
        if (ret)
                rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-       return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * we move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/cmpxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true even when one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+                                struct rds_ib_refill_cache *cache)
 {
-       struct rds_ib_incoming *ibinc;
-       struct rds_page_frag *frag;
-       struct rds_page_frag *pos;
+       unsigned long flags;
+       struct rds_ib_cache_head *chp;
+       struct list_head *old;
 
-       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-       rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+       local_irq_save(flags);
 
-       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-               list_del_init(&frag->f_item);
-               rds_ib_frag_drop_page(frag);
-               rds_ib_frag_free(frag);
-       }
+       chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+       if (!chp->first)
+               INIT_LIST_HEAD(new_item);
+       else /* put on front */
+               list_add_tail(new_item, chp->first);
+       chp->first = new_item;
+       chp->count++;
+
+       if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+               goto end;
+
+       /*
+        * Return our per-cpu first list to the cache's xfer by atomically
+        * grabbing the current xfer list, appending it to our per-cpu list,
+        * and then atomically returning that entire list back to the
+        * cache's xfer list as long as it's still empty.
+        */
+       do {
+               old = xchg(&cache->xfer, NULL);
+               if (old)
+                       list_splice_entire_tail(old, chp->first);
+               old = cmpxchg(&cache->xfer, NULL, chp->first);
+       } while (old);
+
+       chp->first = NULL;
+       chp->count = 0;
+end:
+       local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-       struct rds_ib_incoming *ibinc;
-
-       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+       struct list_head *head = cache->ready;
+
+       if (head) {
+               if (!list_empty(head)) {
+                       cache->ready = head->next;
+                       list_del_init(head);
+               } else
+                       cache->ready = NULL;
+       }
 
-       rds_ib_inc_purge(inc);
-       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-       BUG_ON(!list_empty(&ibinc->ii_frags));
-       kmem_cache_free(rds_ib_incoming_slab, ibinc);
-       atomic_dec(&rds_ib_allocation);
-       BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+       return head;
 }
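
Condensing the cache machinery: frees push onto the local CPU's chain in rds_ib_recv_cache_put(), batches of RDS_IB_RECYCLE_BATCH_COUNT migrate to cache->xfer with cmpxchg, and the refill path (serialized per connection, so 'ready' needs no locking) drains them. A sketch of the consumer side as the refill code above uses it:

	/* Sketch: pop one frag, promoting any published batch first. */
	static struct rds_page_frag *frag_cache_get(struct rds_ib_connection *ic)
	{
		struct list_head *item;

		if (!ic->i_cache_frags.ready)
			rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);

		item = rds_ib_recv_cache_get(&ic->i_cache_frags);
		if (!item)
			return NULL;
		return container_of(item, struct rds_page_frag, f_cache_entry);
	}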
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
                to_copy = min_t(unsigned long, to_copy, len - copied);
 
                rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-                        "[%p, %lu] + %lu\n",
+                        "[%p, %u] + %lu\n",
                         to_copy, iov->iov_base, iov->iov_len, iov_off,
-                        frag->f_page, frag->f_offset, frag_off);
+                        sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
                /* XXX needs + offset for multiple recvs per page */
-               ret = rds_page_copy_to_user(frag->f_page,
-                                           frag->f_offset + frag_off,
+               ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+                                           frag->f_sg.offset + frag_off,
                                            iov->iov_base + iov_off,
                                            to_copy);
                if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
        return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-                                           struct rds_ib_recv_work *recv,
-                                           u32 data_len)
-{
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-       void *addr;
-       u32 misplaced_hdr_bytes;
-
-       /*
-        * Support header at the front (RDS 3.1+) as well as header-at-end.
-        *
-        * Cases:
-        * 1) header all in header buff (great!)
-        * 2) header all in data page (copy all to header buff)
-        * 3) header split across hdr buf + data page
-        *    (move bit in hdr buff to end before copying other bit from data page)
-        */
-       if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-               return hdr_buff;
-
-       if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-               addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-               memcpy(hdr_buff,
-                      addr + recv->r_frag->f_offset + data_len,
-                      sizeof(struct rds_header));
-               kunmap_atomic(addr, KM_SOFTIRQ0);
-               return hdr_buff;
-       }
-
-       misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-       memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-       addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-       memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-              sizeof(struct rds_header) - misplaced_hdr_bytes);
-       kunmap_atomic(addr, KM_SOFTIRQ0);
-       return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
                to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
                BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-               addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+               addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
                src = addr + frag_off;
                dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
        }
        data_len -= sizeof(struct rds_header);
 
-       ihdr = rds_ib_get_header(conn, recv, data_len);
+       ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
        /* Validate the checksum. */
        if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                 * the inc is freed.  We don't go that route, so we have to drop the
                 * page ref ourselves.  We can't just leave the page on the recv
                 * because that confuses the dma mapping of pages and each recv's use
-                * of a partial page.  We can leave the frag, though, it will be
-                * reused.
+                * of a partial page.
                 *
                 * FIXME: Fold this into the code path below.
                 */
-               rds_ib_frag_drop_page(recv->r_frag);
+               rds_ib_frag_free(ic, recv->r_frag);
+               recv->r_frag = NULL;
                return;
        }
 
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
         * into the inc and save the inc so we can hang upcoming fragments
         * off its list.
         */
-       if (ibinc == NULL) {
+       if (!ibinc) {
                ibinc = recv->r_ibinc;
                recv->r_ibinc = NULL;
                ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
        struct rds_ib_recv_work *recv;
 
        while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+                        (unsigned long long)wc.wr_id, wc.status,
+                        rds_ib_wc_status_str(wc.status), wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
                rds_ib_stats_inc(s_ib_rx_cq_event);
 
                recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-               rds_ib_recv_unmap_page(ic, recv);
+               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
                /*
                 * Also process recvs in connecting state because it is possible
                 * to get a recv completion _before_ the rdmacm ESTABLISHED
                 * event is processed.
                 */
-               if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+               if (wc.status == IB_WC_SUCCESS) {
+                       rds_ib_process_recv(conn, recv, wc.byte_len, state);
+               } else {
                        /* We expect errors as the qp is drained during shutdown */
-                       if (wc.status == IB_WC_SUCCESS) {
-                               rds_ib_process_recv(conn, recv, wc.byte_len, state);
-                       } else {
-                               rds_ib_conn_error(conn, "recv completion on "
-                                      "%pI4 had status %u, disconnecting and "
-                                      "reconnecting\n", &conn->c_faddr,
-                                      wc.status);
-                       }
+                       if (rds_conn_up(conn) || rds_conn_connecting(conn))
+                               rds_ib_conn_error(conn, "recv completion on %pI4 had "
+                                                 "status %u (%s), disconnecting and "
+                                                 "reconnecting\n", &conn->c_faddr,
+                                                 wc.status,
+                                                 rds_ib_wc_status_str(wc.status));
                }
 
+               /*
+                * It's very important that we only free this ring entry if we've truly
+                * freed the resources allocated to the entry.  The refilling path can
+                * leak if we don't.
+                */
                rds_ib_ring_free(&ic->i_recv_ring, 1);
        }
 }
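
For context: a poll loop like this one is only safe if the CQ is re-armed after draining, since a completion that lands between the last ib_poll_cq() and the re-arm would otherwise be missed. A minimal sketch of the usual drain/re-arm cycle — handle_wc() is a hypothetical consumer, and the RDS tasklets are assumed to follow an equivalent poll, notify, re-poll sequence:

    static void handle_wc(struct ib_wc *wc);    /* hypothetical */

    static void drain_cq(struct ib_cq *cq)
    {
            struct ib_wc wc;

            do {
                    /* reap everything currently queued on the CQ */
                    while (ib_poll_cq(cq, 1, &wc) > 0)
                            handle_wc(&wc);
                    /*
                     * Re-arm; a positive return reports events that
                     * arrived while the CQ was unarmed, so loop and
                     * poll again instead of going back to sleep.
                     */
            } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
                                      IB_CQ_REPORT_MISSED_EVENTS) > 0);
    }
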
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
        if (rds_ib_ring_empty(&ic->i_recv_ring))
                rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-       /*
-        * If the ring is running low, then schedule the thread to refill.
-        */
        if (rds_ib_ring_low(&ic->i_recv_ring))
-               queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+               rds_ib_recv_refill(conn, 0);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
        int ret = 0;
 
        rdsdebug("conn %p\n", conn);
-
-       /*
-        * If we get a temporary posting failure in this context then
-        * we're really low and we want the caller to back off for a bit.
-        */
-       mutex_lock(&ic->i_recv_mutex);
-       if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
-               ret = -ENOMEM;
-       else
-               rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-       mutex_unlock(&ic->i_recv_mutex);
-
        if (rds_conn_up(conn))
                rds_ib_attempt_ack(ic);
 
        return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
        struct sysinfo si;
        int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
 
        rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
                                        sizeof(struct rds_ib_incoming),
-                                       0, 0, NULL);
-       if (rds_ib_incoming_slab == NULL)
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
+       if (!rds_ib_incoming_slab)
                goto out;
 
        rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
                                        sizeof(struct rds_page_frag),
-                                       0, 0, NULL);
-       if (rds_ib_frag_slab == NULL)
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
+       if (!rds_ib_frag_slab)
                kmem_cache_destroy(rds_ib_incoming_slab);
        else
                ret = 0;
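
The switch from 0 to SLAB_HWCACHE_ALIGN cache-line-aligns the slab objects, avoiding false sharing between CPUs touching adjacent incomings/frags. For reference, the slab lifecycle these calls belong to, as a minimal sketch with a hypothetical struct foo:

    struct foo {
            int x;
    };

    static struct kmem_cache *foo_slab;

    static int foo_demo(void)
    {
            struct foo *f;

            /* name, object size, align, flags, ctor */
            foo_slab = kmem_cache_create("foo", sizeof(struct foo),
                                         0, SLAB_HWCACHE_ALIGN, NULL);
            if (!foo_slab)
                    return -ENOMEM;

            f = kmem_cache_alloc(foo_slab, GFP_KERNEL);
            if (f)
                    kmem_cache_free(foo_slab, f);

            kmem_cache_destroy(foo_slab);
            return 0;
    }
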
index 17fa80803ab01ccbd0fa1279ed448c1bd51d501d..71f373c421bc4d8b6a97f2be7201a2dc8c2290e8 100644 (file)
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
 
-static void rds_ib_send_rdma_complete(struct rds_message *rm,
-                                     int wc_status)
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+               [IB_WC_##foo] = __stringify(IB_WC_##foo)
+       RDS_IB_WC_STATUS_STR(SUCCESS),
+       RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+       RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+       RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+       RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+       RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+       RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+       RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+       RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+       RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+       RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+       RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+       RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+       RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+       RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+       RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+       RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+       RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+       RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+       RDS_IB_WC_STATUS_STR(FATAL_ERR),
+       RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+       RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+       return rds_str_array(rds_ib_wc_status_strings,
+                            ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
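
rds_str_array() is introduced elsewhere in this series; a bounds-checked table lookup along these lines is assumed, falling back to "unknown" for out-of-range or unnamed values:

    char *rds_str_array(char **array, size_t elements, size_t index)
    {
            if (index < elements && array[index])
                    return array[index];
            else
                    return "unknown";
    }
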
+
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+                                int wc_status,
+                                void (*complete)(struct rds_message *rm, int status))
 {
        int notify_status;
 
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
                notify_status = RDS_RDMA_OTHER_ERROR;
                break;
        }
-       rds_rdma_send_complete(rm, notify_status);
+       complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+                                  struct rm_data_op *op,
+                                  int wc_status)
+{
+       if (op->op_nents)
+               ib_dma_unmap_sg(ic->i_cm_id->device,
+                               op->op_sg, op->op_nents,
+                               DMA_TO_DEVICE);
 }
 
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
-                                  struct rds_rdma_op *op)
+                                  struct rm_rdma_op *op,
+                                  int wc_status)
 {
-       if (op->r_mapped) {
+       if (op->op_mapped) {
                ib_dma_unmap_sg(ic->i_cm_id->device,
-                       op->r_sg, op->r_nents,
-                       op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               op->r_mapped = 0;
+                               op->op_sg, op->op_nents,
+                               op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               op->op_mapped = 0;
        }
+
+       /* If the user asked for a completion notification on this
+        * message, we can implement three different semantics:
+        *  1.  Notify when we receive the ACK on the RDS message
+        *      that was queued with the RDMA. This provides reliable
+        *      notification of RDMA status at the expense of a one-way
+        *      packet delay.
+        *  2.  Notify when the IB stack gives us the completion event for
+        *      the RDMA operation.
+        *  3.  Notify when the IB stack gives us the completion event for
+        *      the accompanying RDS messages.
+        * Here, we implement approach #3. To implement approach #2,
+        * we would need to take an event for the rdma WR. To implement #1,
+        * don't call rds_rdma_send_complete at all, and fall back to the notify
+        * handling in the ACK processing code.
+        *
+        * Note: There's no need to explicitly sync any RDMA buffers using
+        * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+        * operation itself unmapped the RDMA buffers, which takes care
+        * of syncing.
+        */
+       rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+                            wc_status, rds_rdma_send_complete);
+
+       if (op->op_write)
+               rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+       else
+               rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
 }
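
The container_of() call above recovers the enclosing rds_message from a pointer to its embedded rm_rdma_op. For reference, the kernel macro is essentially:

    #define container_of(ptr, type, member) ({                      \
            const typeof(((type *)0)->member) *__mptr = (ptr);      \
            (type *)((char *)__mptr - offsetof(type, member)); })

That is, it subtracts the member's offset within the struct, so the ops no longer need an explicit back-pointer to their message.
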
 
-static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
-                         struct rds_ib_send_work *send,
-                         int wc_status)
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+                                    struct rm_atomic_op *op,
+                                    int wc_status)
 {
-       struct rds_message *rm = send->s_rm;
-
-       rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
-       ib_dma_unmap_sg(ic->i_cm_id->device,
-                    rm->m_sg, rm->m_nents,
-                    DMA_TO_DEVICE);
-
-       if (rm->m_rdma_op != NULL) {
-               rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
-
-               /* If the user asked for a completion notification on this
-                * message, we can implement three different semantics:
-                *  1.  Notify when we received the ACK on the RDS message
-                *      that was queued with the RDMA. This provides reliable
-                *      notification of RDMA status at the expense of a one-way
-                *      packet delay.
-                *  2.  Notify when the IB stack gives us the completion event for
-                *      the RDMA operation.
-                *  3.  Notify when the IB stack gives us the completion event for
-                *      the accompanying RDS messages.
-                * Here, we implement approach #3. To implement approach #2,
-                * call rds_rdma_send_complete from the cq_handler. To implement #1,
-                * don't call rds_rdma_send_complete at all, and fall back to the notify
-                * handling in the ACK processing code.
-                *
-                * Note: There's no need to explicitly sync any RDMA buffers using
-                * ib_dma_sync_sg_for_cpu - the completion for the RDMA
-                * operation itself unmapped the RDMA buffers, which takes care
-                * of synching.
-                */
-               rds_ib_send_rdma_complete(rm, wc_status);
+       /* unmap atomic recvbuf */
+       if (op->op_mapped) {
+               ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+                               DMA_FROM_DEVICE);
+               op->op_mapped = 0;
+       }
 
-               if (rm->m_rdma_op->r_write)
-                       rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
-               else
-                       rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+       rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+                            wc_status, rds_atomic_send_complete);
+
+       if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+               rds_ib_stats_inc(s_ib_atomic_cswp);
+       else
+               rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct rds_ib_send_work.
+ *
+ * Returns the rm for no reason other than that the caller, the event
+ * handler, needs it, and currently the rm can only be recovered by
+ * switching on wr.opcode.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+                                               struct rds_ib_send_work *send,
+                                               int wc_status)
+{
+       struct rds_message *rm = NULL;
+
+       /* In the error case, wc.opcode sometimes contains garbage */
+       switch (send->s_wr.opcode) {
+       case IB_WR_SEND:
+               if (send->s_op) {
+                       rm = container_of(send->s_op, struct rds_message, data);
+                       rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+               }
+               break;
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_READ:
+               if (send->s_op) {
+                       rm = container_of(send->s_op, struct rds_message, rdma);
+                       rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+               }
+               break;
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+               if (send->s_op) {
+                       rm = container_of(send->s_op, struct rds_message, atomic);
+                       rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+               }
+               break;
+       default:
+               if (printk_ratelimit())
+                       printk(KERN_NOTICE
+                              "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+                              __func__, send->s_wr.opcode);
+               break;
        }
 
-       /* If anyone waited for this message to get flushed out, wake
-        * them up now */
-       rds_message_unmapped(rm);
+       send->s_wr.opcode = 0xdead;
 
-       rds_message_put(rm);
-       send->s_rm = NULL;
+       return rm;
 }
 
 void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
        for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
                struct ib_sge *sge;
 
-               send->s_rm = NULL;
                send->s_op = NULL;
 
                send->s_wr.wr_id = i;
                send->s_wr.sg_list = send->s_sge;
-               send->s_wr.num_sge = 1;
-               send->s_wr.opcode = IB_WR_SEND;
-               send->s_wr.send_flags = 0;
                send->s_wr.ex.imm_data = 0;
 
-               sge = rds_ib_data_sge(ic, send->s_sge);
-               sge->lkey = ic->i_mr->lkey;
-
-               sge = rds_ib_header_sge(ic, send->s_sge);
+               sge = &send->s_sge[0];
                sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
                sge->lkey = ic->i_mr->lkey;
+
+               send->s_sge[1].lkey = ic->i_mr->lkey;
        }
 }
 
@@ -159,15 +248,23 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
        u32 i;
 
        for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-               if (send->s_wr.opcode == 0xdead)
-                       continue;
-               if (send->s_rm)
-                       rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
-               if (send->s_op)
-                       rds_ib_send_unmap_rdma(ic, send->s_op);
+               if (send->s_op && send->s_wr.opcode != 0xdead)
+                       rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
        }
 }
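
Only entries whose opcode has not been poisoned to 0xdead are swept here: rds_ib_send_unmap_op() above stamps that sentinel on every entry it reaps, so this shutdown-time sweep unmaps exactly the work requests the completion handler never saw.
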
 
+/*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+       if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+           waitqueue_active(&rds_ib_ring_empty_wait))
+               wake_up(&rds_ib_ring_empty_wait);
+       BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
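
i_signaled_sends pairs an atomic_add() on every posting path below with this subtraction at reap time. The shutdown side is assumed to sleep on the same waitqueue until the ring and the signaled count both drain, with something like:

    wait_event(rds_ib_ring_empty_wait,
               rds_ib_ring_empty(&ic->i_recv_ring) &&
               atomic_read(&ic->i_signaled_sends) == 0);

The waitqueue_active() test keeps the common path cheap: the wake_up() is only attempted when a shutdown is actually parked there.
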
+
 /*
  * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
  * operations performed in the send path.  As the sender allocs and potentially
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 {
        struct rds_connection *conn = context;
        struct rds_ib_connection *ic = conn->c_transport_data;
+       struct rds_message *rm = NULL;
        struct ib_wc wc;
        struct rds_ib_send_work *send;
        u32 completed;
        u32 oldest;
        u32 i = 0;
        int ret;
+       int nr_sig = 0;
 
        rdsdebug("cq %p conn %p\n", cq, conn);
        rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
                rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
 
        while (ib_poll_cq(cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+                        (unsigned long long)wc.wr_id, wc.status,
+                        rds_ib_wc_status_str(wc.status), wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
                rds_ib_stats_inc(s_ib_tx_cq_event);
 
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
                for (i = 0; i < completed; i++) {
                        send = &ic->i_sends[oldest];
+                       if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+                               nr_sig++;
 
-                       /* In the error case, wc.opcode sometimes contains garbage */
-                       switch (send->s_wr.opcode) {
-                       case IB_WR_SEND:
-                               if (send->s_rm)
-                                       rds_ib_send_unmap_rm(ic, send, wc.status);
-                               break;
-                       case IB_WR_RDMA_WRITE:
-                       case IB_WR_RDMA_READ:
-                               /* Nothing to be done - the SG list will be unmapped
-                                * when the SEND completes. */
-                               break;
-                       default:
-                               if (printk_ratelimit())
-                                       printk(KERN_NOTICE
-                                               "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
-                                               __func__, send->s_wr.opcode);
-                               break;
-                       }
+                       rm = rds_ib_send_unmap_op(ic, send, wc.status);
 
-                       send->s_wr.opcode = 0xdead;
-                       send->s_wr.num_sge = 1;
                        if (send->s_queued + HZ/2 < jiffies)
                                rds_ib_stats_inc(s_ib_tx_stalled);
 
-                       /* If a RDMA operation produced an error, signal this right
-                        * away. If we don't, the subsequent SEND that goes with this
-                        * RDMA will be canceled with ERR_WFLUSH, and the application
-                        * never learn that the RDMA failed. */
-                       if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
-                               struct rds_message *rm;
-
-                               rm = rds_send_get_message(conn, send->s_op);
-                               if (rm) {
-                                       if (rm->m_rdma_op)
-                                               rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
-                                       rds_ib_send_rdma_complete(rm, wc.status);
-                                       rds_message_put(rm);
+                       if (send->s_op) {
+                               if (send->s_op == rm->m_final_op) {
+                                       /* If anyone waited for this message to get flushed out, wake
+                                        * them up now */
+                                       rds_message_unmapped(rm);
                                }
+                               rds_message_put(rm);
+                               send->s_op = NULL;
                        }
 
                        oldest = (oldest + 1) % ic->i_send_ring.w_nr;
                }
 
                rds_ib_ring_free(&ic->i_send_ring, completed);
+               rds_ib_sub_signaled(ic, nr_sig);
+               nr_sig = 0;
 
                if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
                    test_bit(0, &conn->c_map_queued))
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
                /* We expect errors as the qp is drained during shutdown */
                if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-                       rds_ib_conn_error(conn,
-                               "send completion on %pI4 "
-                               "had status %u, disconnecting and reconnecting\n",
-                               &conn->c_faddr, wc.status);
+                       rds_ib_conn_error(conn, "send completion on %pI4 had status "
+                                         "%u (%s), disconnecting and reconnecting\n",
+                                         &conn->c_faddr, wc.status,
+                                         rds_ib_wc_status_str(wc.status));
                }
        }
 }
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * credits (see rds_ib_send_add_credits below).
  *
  * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
  * However, the ACK sending code is independent and can race with
  * message SENDs.
  *
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
-static inline void
-rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
-               struct rds_ib_send_work *send, unsigned int pos,
-               unsigned long buffer, unsigned int length,
-               int send_flags)
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+                                            struct rds_ib_send_work *send,
+                                            bool notify)
 {
-       struct ib_sge *sge;
-
-       WARN_ON(pos != send - ic->i_sends);
-
-       send->s_wr.send_flags = send_flags;
-       send->s_wr.opcode = IB_WR_SEND;
-       send->s_wr.num_sge = 2;
-       send->s_wr.next = NULL;
-       send->s_queued = jiffies;
-       send->s_op = NULL;
-
-       if (length != 0) {
-               sge = rds_ib_data_sge(ic, send->s_sge);
-               sge->addr = buffer;
-               sge->length = length;
-               sge->lkey = ic->i_mr->lkey;
-
-               sge = rds_ib_header_sge(ic, send->s_sge);
-       } else {
-               /* We're sending a packet with no payload. There is only
-                * one SGE */
-               send->s_wr.num_sge = 1;
-               sge = &send->s_sge[0];
+       /*
+        * We want to delay signaling completions just enough to get
+        * the batching benefits but not so much that we create dead time
+        * on the wire.
+        */
+       if (ic->i_unsignaled_wrs-- == 0 || notify) {
+               ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+               send->s_wr.send_flags |= IB_SEND_SIGNALED;
+               return 1;
        }
-
-       sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
-       sge->length = sizeof(struct rds_header);
-       sge->lkey = ic->i_mr->lkey;
+       return 0;
 }
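
A worked example of the policy: with the default rds_ib_sysctl_max_unsig_wrs of 16 (see the sysctl table further down), sixteen posts go by unsignaled and the seventeenth, which finds the counter at zero, is signaled and resets it, so roughly one completion in seventeen is reaped through the CQ. Passing notify forces a signal regardless; the atomic and RDMA paths below use that when the caller asked for completion notification.
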
 
 /*
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        u32 pos;
        u32 i;
        u32 work_alloc;
-       u32 credit_alloc;
+       u32 credit_alloc = 0;
        u32 posted;
        u32 adv_credits = 0;
        int send_flags = 0;
-       int sent;
+       int bytes_sent = 0;
        int ret;
        int flow_controlled = 0;
+       int nr_sig = 0;
 
        BUG_ON(off % RDS_FRAG_SIZE);
        BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                goto out;
        }
 
-       credit_alloc = work_alloc;
        if (ic->i_flowctl) {
                credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
                adv_credits += posted;
                if (credit_alloc < work_alloc) {
                        rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
                        work_alloc = credit_alloc;
-                       flow_controlled++;
+                       flow_controlled = 1;
                }
                if (work_alloc == 0) {
                        set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* map the message the first time we see it */
-       if (ic->i_rm == NULL) {
-               /*
-               printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
-                               be16_to_cpu(rm->m_inc.i_hdr.h_dport),
-                               rm->m_inc.i_hdr.h_flags,
-                               be32_to_cpu(rm->m_inc.i_hdr.h_len));
-                  */
-               if (rm->m_nents) {
-                       rm->m_count = ib_dma_map_sg(dev,
-                                        rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-                       if (rm->m_count == 0) {
+       if (!ic->i_data_op) {
+               if (rm->data.op_nents) {
+                       rm->data.op_count = ib_dma_map_sg(dev,
+                                                         rm->data.op_sg,
+                                                         rm->data.op_nents,
+                                                         DMA_TO_DEVICE);
+                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+                       if (rm->data.op_count == 0) {
                                rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
                                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                                ret = -ENOMEM; /* XXX ? */
                                goto out;
                        }
                } else {
-                       rm->m_count = 0;
+                       rm->data.op_count = 0;
                }
 
-               ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-               ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
                rds_message_addref(rm);
-               ic->i_rm = rm;
+               ic->i_data_op = &rm->data;
 
                /* Finalize the header */
                if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
                /* If it has a RDMA op, tell the peer we did it. This is
                 * used by the peer to release use-once RDMA MRs. */
-               if (rm->m_rdma_op) {
+               if (rm->rdma.op_active) {
                        struct rds_ext_header_rdma ext_hdr;
 
-                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
                        rds_message_add_extension(&rm->m_inc.i_hdr,
                                        RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
                }
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                /*
                 * Update adv_credits since we reset the ACK_REQUIRED bit.
                 */
-               rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-               adv_credits += posted;
-               BUG_ON(adv_credits > 255);
+               if (ic->i_flowctl) {
+                       rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+                       adv_credits += posted;
+                       BUG_ON(adv_credits > 255);
+               }
        }
 
-       send = &ic->i_sends[pos];
-       first = send;
-       prev = NULL;
-       scat = &rm->m_sg[sg];
-       sent = 0;
-       i = 0;
-
        /* Sometimes you want to put a fence between an RDMA
         * READ and the following SEND.
         * We could either do this all the time
         * or when requested by the user. Right now, we let
         * the application choose.
         */
-       if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+       if (rm->rdma.op_active && rm->rdma.op_fence)
                send_flags = IB_SEND_FENCE;
 
-       /*
-        * We could be copying the header into the unused tail of the page.
-        * That would need to be changed in the future when those pages might
-        * be mapped userspace pages or page cache pages.  So instead we always
-        * use a second sge and our long-lived ring of mapped headers.  We send
-        * the header after the data so that the data payload can be aligned on
-        * the receiver.
-        */
+       /* Each frag gets a header. Msgs may be 0 bytes */
+       send = &ic->i_sends[pos];
+       first = send;
+       prev = NULL;
+       scat = &ic->i_data_op->op_sg[sg];
+       i = 0;
+       do {
+               unsigned int len = 0;
 
-       /* handle a 0-len message */
-       if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
-               rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
-               goto add_header;
-       }
+               /* Set up the header */
+               send->s_wr.send_flags = send_flags;
+               send->s_wr.opcode = IB_WR_SEND;
+               send->s_wr.num_sge = 1;
+               send->s_wr.next = NULL;
+               send->s_queued = jiffies;
+               send->s_op = NULL;
 
-       /* if there's data reference it with a chain of work reqs */
-       for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
-               unsigned int len;
+               send->s_sge[0].addr = ic->i_send_hdrs_dma
+                       + (pos * sizeof(struct rds_header));
+               send->s_sge[0].length = sizeof(struct rds_header);
 
-               send = &ic->i_sends[pos];
+               memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
 
-               len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
-               rds_ib_xmit_populate_wr(ic, send, pos,
-                               ib_sg_dma_address(dev, scat) + off, len,
-                               send_flags);
+               /* Set up the data, if present */
+               if (i < work_alloc
+                   && scat != &rm->data.op_sg[rm->data.op_count]) {
+                       len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+                       send->s_wr.num_sge = 2;
 
-               /*
-                * We want to delay signaling completions just enough to get
-                * the batching benefits but not so much that we create dead time
-                * on the wire.
-                */
-               if (ic->i_unsignaled_wrs-- == 0) {
-                       ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-                       send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-               }
+                       send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+                       send->s_sge[1].length = len;
 
-               ic->i_unsignaled_bytes -= len;
-               if (ic->i_unsignaled_bytes <= 0) {
-                       ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
-                       send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+                       bytes_sent += len;
+                       off += len;
+                       if (off == ib_sg_dma_len(dev, scat)) {
+                               scat++;
+                               off = 0;
+                       }
                }
 
+               rds_ib_set_wr_signal_state(ic, send, 0);
+
                /*
                 * Always signal the last one if we're stopping due to flow control.
                 */
-               if (flow_controlled && i == (work_alloc-1))
+               if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
                        send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 
+               if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+                       nr_sig++;
+
                rdsdebug("send %p wr %p num_sge %u next %p\n", send,
                         &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
-               sent += len;
-               off += len;
-               if (off == ib_sg_dma_len(dev, scat)) {
-                       scat++;
-                       off = 0;
-               }
-
-add_header:
-               /* Tack on the header after the data. The header SGE should already
-                * have been set up to point to the right header buffer. */
-               memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
-               if (0) {
-                       struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-                       printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
-                               be16_to_cpu(hdr->h_dport),
-                               hdr->h_flags,
-                               be32_to_cpu(hdr->h_len));
-               }
-               if (adv_credits) {
+               if (ic->i_flowctl && adv_credits) {
                        struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
                        /* add credit and redo the header checksum */
@@ -689,20 +721,25 @@ add_header:
                prev = send;
 
                pos = (pos + 1) % ic->i_send_ring.w_nr;
-       }
+               send = &ic->i_sends[pos];
+               i++;
+
+       } while (i < work_alloc
+                && scat != &rm->data.op_sg[rm->data.op_count]);
 
        /* Account the RDS header in the number of bytes we sent, but just once.
         * The caller has no concept of fragmentation. */
        if (hdr_off == 0)
-               sent += sizeof(struct rds_header);
+               bytes_sent += sizeof(struct rds_header);
 
        /* if we finished the message then send completion owns it */
-       if (scat == &rm->m_sg[rm->m_count]) {
-               prev->s_rm = ic->i_rm;
-               prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-               ic->i_rm = NULL;
+       if (scat == &rm->data.op_sg[rm->data.op_count]) {
+               prev->s_op = ic->i_data_op;
+               prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+               ic->i_data_op = NULL;
        }
 
+       /* Put back wrs & credits we didn't use */
        if (i < work_alloc) {
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
                work_alloc = i;
@@ -710,6 +747,9 @@ add_header:
        if (ic->i_flowctl && i < credit_alloc)
                rds_ib_send_add_credits(conn, credit_alloc - i);
 
+       if (nr_sig)
+               atomic_add(nr_sig, &ic->i_signaled_sends);
+
        /* XXX need to worry about failed_wr and partial sends. */
        failed_wr = &first->s_wr;
        ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -720,32 +760,127 @@ add_header:
                printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               if (prev->s_rm) {
-                       ic->i_rm = prev->s_rm;
-                       prev->s_rm = NULL;
+               rds_ib_sub_signaled(ic, nr_sig);
+               if (prev->s_op) {
+                       ic->i_data_op = prev->s_op;
+                       prev->s_op = NULL;
                }
 
                rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
                goto out;
        }
 
-       ret = sent;
+       ret = bytes_sent;
 out:
        BUG_ON(adv_credits);
        return ret;
 }
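
Both send paths accumulate the allocated work requests into a linked chain (see prev->s_wr.next = &send->s_wr in the rdma path below) and hand the head to a single ib_post_send(): one doorbell for the whole batch. In outline, with hypothetical names:

    static int post_chain(struct ib_qp *qp, struct ib_send_wr *wr, int n)
    {
            struct ib_send_wr *bad_wr;
            int i;

            for (i = 0; i < n; i++)
                    wr[i].next = (i + 1 < n) ? &wr[i + 1] : NULL;

            /*
             * One verbs call posts the whole chain; on error bad_wr
             * points at the first WR that was not accepted.
             */
            return ib_post_send(qp, &wr[0], &bad_wr);
    }
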
 
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case: we always map a single SG of
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       struct rds_ib_send_work *send = NULL;
+       struct ib_send_wr *failed_wr;
+       struct rds_ib_device *rds_ibdev;
+       u32 pos;
+       u32 work_alloc;
+       int ret;
+       int nr_sig = 0;
+
+       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+       work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+       if (work_alloc != 1) {
+               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+               rds_ib_stats_inc(s_ib_tx_ring_full);
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* address of send request in ring */
+       send = &ic->i_sends[pos];
+       send->s_queued = jiffies;
+
+       if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+               send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+               send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+               send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+               send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+       } else { /* FADD */
+               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+               send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+               send->s_wr.wr.atomic.swap = 0;
+               send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+               send->s_wr.wr.atomic.swap_mask = 0;
+       }
+       nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+       send->s_wr.num_sge = 1;
+       send->s_wr.next = NULL;
+       send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+       send->s_wr.wr.atomic.rkey = op->op_rkey;
+       send->s_op = op;
+       rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+       /* map 8 byte retval buffer to the device */
+       ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+       rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+       if (ret != 1) {
+               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+               rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+               ret = -ENOMEM; /* XXX ? */
+               goto out;
+       }
+
+       /* Convert our struct scatterlist to struct ib_sge */
+       send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+       send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+       send->s_sge[0].lkey = ic->i_mr->lkey;
+
+       rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+                send->s_sge[0].addr, send->s_sge[0].length);
+
+       if (nr_sig)
+               atomic_add(nr_sig, &ic->i_signaled_sends);
+
+       failed_wr = &send->s_wr;
+       ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+       rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+                send, &send->s_wr, ret, failed_wr);
+       BUG_ON(failed_wr != &send->s_wr);
+       if (ret) {
+               printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+                      "returned %d\n", &conn->c_faddr, ret);
+               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+               rds_ib_sub_signaled(ic, nr_sig);
+               goto out;
+       }
+
+       if (unlikely(failed_wr != &send->s_wr)) {
+               printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+               BUG_ON(failed_wr != &send->s_wr);
+       }
+
+out:
+       return ret;
+}
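
op_sg for an atomic op is assumed to be built from a pinned user page by the cmsg-parsing code elsewhere in this series. A self-contained sketch of preparing such a single-entry, 8-byte return buffer for the ib_dma_map_sg() call above (names hypothetical):

    static int map_atomic_retval(struct ib_device *dev,
                                 struct scatterlist *sg)
    {
            u64 *retval = kmalloc(sizeof(*retval), GFP_KERNEL);

            if (!retval)
                    return -ENOMEM;

            /* one scatterlist entry covering exactly 8 bytes */
            sg_init_one(sg, retval, sizeof(*retval));

            if (ib_dma_map_sg(dev, sg, 1, DMA_FROM_DEVICE) != 1) {
                    kfree(retval);
                    return -ENOMEM;
            }
            return 0;
    }
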
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_send_work *send = NULL;
        struct rds_ib_send_work *first;
        struct rds_ib_send_work *prev;
        struct ib_send_wr *failed_wr;
-       struct rds_ib_device *rds_ibdev;
        struct scatterlist *scat;
        unsigned long len;
-       u64 remote_addr = op->r_remote_addr;
+       u64 remote_addr = op->op_remote_addr;
+       u32 max_sge = ic->rds_ibdev->max_sge;
        u32 pos;
        u32 work_alloc;
        u32 i;
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        int sent;
        int ret;
        int num_sge;
-
-       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
-       /* map the message the first time we see it */
-       if (!op->r_mapped) {
-               op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-                                       op->r_sg, op->r_nents, (op->r_write) ?
-                                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-               if (op->r_count == 0) {
+       int nr_sig = 0;
+
+       /* map the op the first time we see it */
+       if (!op->op_mapped) {
+               op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+                                            op->op_sg, op->op_nents, (op->op_write) ?
+                                            DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+               if (op->op_count == 0) {
                        rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
                        ret = -ENOMEM; /* XXX ? */
                        goto out;
                }
 
-               op->r_mapped = 1;
+               op->op_mapped = 1;
        }
 
        /*
         * Instead of knowing how to return a partial rdma read/write we insist that there
         * be enough work requests to send the entire message.
         */
-       i = ceil(op->r_count, rds_ibdev->max_sge);
+       i = ceil(op->op_count, max_sge);
 
        work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
        if (work_alloc != i) {
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        send = &ic->i_sends[pos];
        first = send;
        prev = NULL;
-       scat = &op->r_sg[0];
+       scat = &op->op_sg[0];
        sent = 0;
-       num_sge = op->r_count;
+       num_sge = op->op_count;
 
-       for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+       for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
                send->s_wr.send_flags = 0;
                send->s_queued = jiffies;
-               /*
-                * We want to delay signaling completions just enough to get
-                * the batching benefits but not so much that we create dead time on the wire.
-                */
-               if (ic->i_unsignaled_wrs-- == 0) {
-                       ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-                       send->s_wr.send_flags = IB_SEND_SIGNALED;
-               }
+               send->s_op = NULL;
+
+               nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
-               send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+               send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
                send->s_wr.wr.rdma.remote_addr = remote_addr;
-               send->s_wr.wr.rdma.rkey = op->r_key;
-               send->s_op = op;
+               send->s_wr.wr.rdma.rkey = op->op_rkey;
 
-               if (num_sge > rds_ibdev->max_sge) {
-                       send->s_wr.num_sge = rds_ibdev->max_sge;
-                       num_sge -= rds_ibdev->max_sge;
+               if (num_sge > max_sge) {
+                       send->s_wr.num_sge = max_sge;
+                       num_sge -= max_sge;
                } else {
                        send->s_wr.num_sge = num_sge;
                }
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
                if (prev)
                        prev->s_wr.next = &send->s_wr;
 
-               for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+               for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
                        len = ib_sg_dma_len(ic->i_cm_id->device, scat);
                        send->s_sge[j].addr =
                                 ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
                        send = ic->i_sends;
        }
 
-       /* if we finished the message then send completion owns it */
-       if (scat == &op->r_sg[op->r_count])
-               prev->s_wr.send_flags = IB_SEND_SIGNALED;
+       /* give a reference to the last op */
+       if (scat == &op->op_sg[op->op_count]) {
+               prev->s_op = op;
+               rds_message_addref(container_of(op, struct rds_message, rdma));
+       }
 
        if (i < work_alloc) {
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
                work_alloc = i;
        }
 
+       if (nr_sig)
+               atomic_add(nr_sig, &ic->i_signaled_sends);
+
        failed_wr = &first->s_wr;
        ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
        rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
                printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+               rds_ib_sub_signaled(ic, nr_sig);
                goto out;
        }
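
ceil() above is RDS's round-up division helper from rds.h, assumed to be the usual:

    #define ceil(x, y)  (((x) + (y) - 1) / (y))

So an op with op_count = 70 mapped sges on a device advertising max_sge = 32 needs ceil(70, 32) = 3 work requests, carrying 32, 32 and 6 sges respectively; the ring allocation insists on getting all of them up front rather than supporting partial RDMA transfers.
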
 
index d2c904dd6fbcb9e1eeb275f84cc062a273bb98ec..2d5965d6e97c039517d219bfdad3f28f7437b1d7 100644 (file)
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
        "ib_rdma_mr_pool_flush",
        "ib_rdma_mr_pool_wait",
        "ib_rdma_mr_pool_depleted",
+       "ib_atomic_cswp",
+       "ib_atomic_fadd",
 };
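
The copy routine below is assumed to walk the u64 counters of struct rds_ib_statistics in parallel with this name table, so the two new entries must line up with s_ib_atomic_cswp / s_ib_atomic_fadd fields added to that struct (they are bumped from ib_send.c above). A hypothetical compile-time guard for that pairing, valid inside any function here:

    BUILD_BUG_ON(ARRAY_SIZE(rds_ib_stat_names) !=
                 sizeof(struct rds_ib_statistics) / sizeof(uint64_t));
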
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
index 03f01cb4e0fee40487585088b9a6a0b2d9219049..fc3da37220fd671133326d5c915ae6f8a947c644 100644 (file)
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
 static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
-unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-
 /*
  * This sysctl does nothing.
  *
@@ -93,15 +89,6 @@ ctl_table rds_ib_sysctl_table[] = {
                .extra1         = &rds_ib_sysctl_max_unsig_wr_min,
                .extra2         = &rds_ib_sysctl_max_unsig_wr_max,
        },
-       {
-               .procname       = "max_unsignaled_bytes",
-               .data           = &rds_ib_sysctl_max_unsig_bytes,
-               .maxlen         = sizeof(unsigned long),
-               .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
-               .extra1         = &rds_ib_sysctl_max_unsig_bytes_min,
-               .extra2         = &rds_ib_sysctl_max_unsig_bytes_max,
-       },
        {
                .procname       = "max_recv_allocation",
                .data           = &rds_ib_sysctl_max_recv_allocation,
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
                unregister_sysctl_table(rds_ib_sysctl_hdr);
 }
 
-int __init rds_ib_sysctl_init(void)
+int rds_ib_sysctl_init(void)
 {
        rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
-       if (rds_ib_sysctl_hdr == NULL)
+       if (!rds_ib_sysctl_hdr)
                return -ENOMEM;
        return 0;
 }
index c45c4173a44d44eb9ceee633e0cec271f2182411..4fdf1b6e84fff6fd25b2040544245921332b9f89 100644 (file)
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
 
        spin_lock(&rds_info_lock);
-       BUG_ON(rds_info_funcs[offset] != NULL);
+       BUG_ON(rds_info_funcs[offset]);
        rds_info_funcs[offset] = func;
        spin_unlock(&rds_info_lock);
 }
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
  */
 void rds_info_iter_unmap(struct rds_info_iterator *iter)
 {
-       if (iter->addr != NULL) {
+       if (iter->addr) {
                kunmap_atomic(iter->addr, KM_USER0);
                iter->addr = NULL;
        }
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
        unsigned long this;
 
        while (bytes) {
-               if (iter->addr == NULL)
+               if (!iter->addr)
                        iter->addr = kmap_atomic(*iter->pages, KM_USER0);
 
                this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
                        >> PAGE_SHIFT;
 
        pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-       if (pages == NULL) {
+       if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 
 call_func:
        func = rds_info_funcs[optname - RDS_INFO_FIRST];
-       if (func == NULL) {
+       if (!func) {
                ret = -ENOPROTOOPT;
                goto out;
        }
@@ -234,7 +234,7 @@ call_func:
                ret = -EFAULT;
 
 out:
-       for (i = 0; pages != NULL && i < nr_pages; i++)
+       for (i = 0; pages && i < nr_pages; i++)
                put_page(pages[i]);
        kfree(pages);
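
For context, the iterator is consumed by per-optname handlers registered through rds_info_register_func() above. A minimal hypothetical handler emitting one fixed-size entry — the signature follows the rds_info_func typedef assumed from info.h, and MY_INFO_OPTNAME stands in for a real value in [RDS_INFO_FIRST, RDS_INFO_LAST]:

    static void foo_info(struct socket *sock, unsigned int len,
                         struct rds_info_iterator *iter,
                         struct rds_info_lengths *lens)
    {
            struct rds_info_counter ctr;

            memset(&ctr, 0, sizeof(ctr));
            strncpy((char *)ctr.name, "foo_events", sizeof(ctr.name));
            ctr.value = 42;

            rds_info_copy(iter, &ctr, sizeof(ctr));

            lens->nr = 1;               /* entries emitted */
            lens->each = sizeof(ctr);   /* fixed size of each entry */
    }

    static int foo_info_init(void)
    {
            rds_info_register_func(MY_INFO_OPTNAME, foo_info);
            return 0;
    }
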
 
index c8f3d3525cb9b187daeca402f71cb5b88356ce41..56808cac0fc795dc1564b1928f816be1502473ce 100644 (file)
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
        .laddr_check            = rds_iw_laddr_check,
        .xmit_complete          = rds_iw_xmit_complete,
        .xmit                   = rds_iw_xmit,
-       .xmit_cong_map          = NULL,
        .xmit_rdma              = rds_iw_xmit_rdma,
        .recv                   = rds_iw_recv,
        .conn_alloc             = rds_iw_conn_alloc,
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
        .conn_connect           = rds_iw_conn_connect,
        .conn_shutdown          = rds_iw_conn_shutdown,
        .inc_copy_to_user       = rds_iw_inc_copy_to_user,
-       .inc_purge              = rds_iw_inc_purge,
        .inc_free               = rds_iw_inc_free,
        .cm_initiate_connect    = rds_iw_cm_initiate_connect,
        .cm_handle_connect      = rds_iw_cm_handle_connect,
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
        .t_prefer_loopback      = 1,
 };
 
-int __init rds_iw_init(void)
+int rds_iw_init(void)
 {
        int ret;
 
index eef2f0c2847604078a326df106c09403520410e8..543e665fafe3ae2cb92e3772425a79c1a6f95ffc 100644 (file)
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
        struct rds_message      *s_rm;
 
        /* We should really put these into a union: */
-       struct rds_rdma_op      *s_op;
+       struct rm_rdma_op       *s_op;
        struct rds_iw_mapping   *s_mapping;
        struct ib_mr            *s_mr;
        struct ib_fast_reg_page_list *s_page_list;
@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg);
 int rds_iw_conn_connect(struct rds_connection *conn);
 void rds_iw_conn_shutdown(struct rds_connection *conn);
 void rds_iw_state_change(struct sock *sk);
-int __init rds_iw_listen_init(void);
+int rds_iw_listen_init(void);
 void rds_iw_listen_stop(void);
 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void);
 void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
 
 /* ib_recv.c */
-int __init rds_iw_recv_init(void);
+int rds_iw_recv_init(void);
 void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
                       gfp_t page_gfp, int prefill);
-void rds_iw_inc_purge(struct rds_incoming *inc);
 void rds_iw_inc_free(struct rds_incoming *inc);
 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
                             size_t size);
@@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
@@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
                                    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_iw_sysctl_init(void);
+int rds_iw_sysctl_init(void);
 void rds_iw_sysctl_exit(void);
 extern unsigned long rds_iw_sysctl_max_send_wr;
 extern unsigned long rds_iw_sysctl_max_recv_wr;
index b5dd6ac39be86b87afc1bd7761971a11550e572f..712cf2d1f28ef36e3bf567540b036ca41179faaf 100644 (file)
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
         * the rds_iwdev at all.
         */
        rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-       if (rds_iwdev == NULL) {
+       if (!rds_iwdev) {
                if (printk_ratelimit())
                        printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
                                        dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
                                           ic->i_send_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
-       if (ic->i_send_hdrs == NULL) {
+       if (!ic->i_send_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent send failed\n");
                goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
                                           ic->i_recv_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_recv_hdrs_dma, GFP_KERNEL);
-       if (ic->i_recv_hdrs == NULL) {
+       if (!ic->i_recv_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent recv failed\n");
                goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 
        ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
                                       &ic->i_ack_dma, GFP_KERNEL);
-       if (ic->i_ack == NULL) {
+       if (!ic->i_ack) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
        ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-       if (ic->i_sends == NULL) {
+       if (!ic->i_sends) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
        rds_iw_send_init_ring(ic);
 
        ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-       if (ic->i_recvs == NULL) {
+       if (!ic->i_recvs) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto out;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
        /* XXX too lazy? */
        ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
-       if (ic == NULL)
+       if (!ic)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&ic->iw_node);
index 13dc1862d86264288eb0455056d289008d072234..4e152e2daa3d02cc1ca2397853d00cbca8d75809 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 
index 3d479067d54dbd02810f9c6b1bf2d373d9388cbf..5e57347f49ff040968067b50ac35283e4463acd7 100644 (file)
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
 static void rds_iw_frag_free(struct rds_page_frag *frag)
 {
        rdsdebug("frag %p page %p\n", frag, frag->f_page);
-       BUG_ON(frag->f_page != NULL);
+       BUG_ON(frag->f_page);
        kmem_cache_free(rds_iw_frag_slab, frag);
 }
 
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
        struct ib_sge *sge;
        int ret = -ENOMEM;
 
-       if (recv->r_iwinc == NULL) {
+       if (!recv->r_iwinc) {
                if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
                        rds_iw_stats_inc(s_iw_rx_alloc_limit);
                        goto out;
                }
                recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
                                                 kptr_gfp);
-               if (recv->r_iwinc == NULL) {
+               if (!recv->r_iwinc) {
                        atomic_dec(&rds_iw_allocation);
                        goto out;
                }
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
                rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
        }
 
-       if (recv->r_frag == NULL) {
+       if (!recv->r_frag) {
                recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-               if (recv->r_frag == NULL)
+               if (!recv->r_frag)
                        goto out;
                INIT_LIST_HEAD(&recv->r_frag->f_item);
                recv->r_frag->f_page = NULL;
        }
 
-       if (ic->i_frag.f_page == NULL) {
+       if (!ic->i_frag.f_page) {
                ic->i_frag.f_page = alloc_page(page_gfp);
-               if (ic->i_frag.f_page == NULL)
+               if (!ic->i_frag.f_page)
                        goto out;
                ic->i_frag.f_offset = 0;
        }
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
        return ret;
 }
 
-void rds_iw_inc_purge(struct rds_incoming *inc)
+static void rds_iw_inc_purge(struct rds_incoming *inc)
 {
        struct rds_iw_incoming *iwinc;
        struct rds_page_frag *frag;
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
         * into the inc and save the inc so we can hang upcoming fragments
         * off its list.
         */
-       if (iwinc == NULL) {
+       if (!iwinc) {
                iwinc = recv->r_iwinc;
                recv->r_iwinc = NULL;
                ic->i_iwinc = iwinc;
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
        return ret;
 }
 
-int __init rds_iw_recv_init(void)
+int rds_iw_recv_init(void)
 {
        struct sysinfo si;
        int ret = -ENOMEM;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
        rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
                                        sizeof(struct rds_iw_incoming),
                                        0, 0, NULL);
-       if (rds_iw_incoming_slab == NULL)
+       if (!rds_iw_incoming_slab)
                goto out;
 
        rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
                                        sizeof(struct rds_page_frag),
                                        0, 0, NULL);
-       if (rds_iw_frag_slab == NULL)
+       if (!rds_iw_frag_slab)
                kmem_cache_destroy(rds_iw_incoming_slab);
        else
                ret = 0;
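
The refill path above caps fragment allocations with atomic_add_unless()
against rds_iw_sysctl_max_recv_allocation. A minimal sketch of that
bounded-counter idiom, with illustrative names (bounded_alloc and
nr_allocated are not from this patch):

        static atomic_t nr_allocated = ATOMIC_INIT(0);

        static void *bounded_alloc(struct kmem_cache *cache, gfp_t gfp, int limit)
        {
                void *obj;

                /* Claim a slot only while the counter is below the limit. */
                if (!atomic_add_unless(&nr_allocated, 1, limit))
                        return NULL;

                obj = kmem_cache_alloc(cache, gfp);
                if (!obj)
                        atomic_dec(&nr_allocated);      /* return the slot */
                return obj;
        }
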
index 52182ff7519edcde8b7dee746b72282fe6eb832f..6280ea020d4eb0293b3c76be3a2e8e87920aff49 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
 }
 
 static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-                                  struct rds_rdma_op *op)
+                                  struct rm_rdma_op *op)
 {
-       if (op->r_mapped) {
+       if (op->op_mapped) {
                ib_dma_unmap_sg(ic->i_cm_id->device,
-                       op->r_sg, op->r_nents,
-                       op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               op->r_mapped = 0;
+                       op->op_sg, op->op_nents,
+                       op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               op->op_mapped = 0;
        }
 }
 
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
        rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
        ib_dma_unmap_sg(ic->i_cm_id->device,
-                    rm->m_sg, rm->m_nents,
+                    rm->data.op_sg, rm->data.op_nents,
                     DMA_TO_DEVICE);
 
-       if (rm->m_rdma_op != NULL) {
-               rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+       if (rm->rdma.op_active) {
+               rds_iw_send_unmap_rdma(ic, &rm->rdma);
 
                /* If the user asked for a completion notification on this
                 * message, we can implement three different semantics:
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
                 */
                rds_iw_send_rdma_complete(rm, wc_status);
 
-               if (rm->m_rdma_op->r_write)
-                       rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+               if (rm->rdma.op_write)
+                       rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
                else
-                       rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+                       rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
        }
 
        /* If anyone waited for this message to get flushed out, wake
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* map the message the first time we see it */
-       if (ic->i_rm == NULL) {
+       if (!ic->i_rm) {
                /*
                printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
                                be16_to_cpu(rm->m_inc.i_hdr.h_dport),
                                rm->m_inc.i_hdr.h_flags,
                                be32_to_cpu(rm->m_inc.i_hdr.h_len));
                   */
-               if (rm->m_nents) {
-                       rm->m_count = ib_dma_map_sg(dev,
-                                        rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-                       if (rm->m_count == 0) {
+               if (rm->data.op_nents) {
+                       rm->data.op_count = ib_dma_map_sg(dev,
+                                                         rm->data.op_sg,
+                                                         rm->data.op_nents,
+                                                         DMA_TO_DEVICE);
+                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+                       if (rm->data.op_count == 0) {
                                rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
                                rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
                                ret = -ENOMEM; /* XXX ? */
                                goto out;
                        }
                } else {
-                       rm->m_count = 0;
+                       rm->data.op_count = 0;
                }
 
                ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
                /* If it has a RDMA op, tell the peer we did it. This is
                 * used by the peer to release use-once RDMA MRs. */
-               if (rm->m_rdma_op) {
+               if (rm->rdma.op_active) {
                        struct rds_ext_header_rdma ext_hdr;
 
-                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
                        rds_message_add_extension(&rm->m_inc.i_hdr,
                                        RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
                }
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        send = &ic->i_sends[pos];
        first = send;
        prev = NULL;
-       scat = &rm->m_sg[sg];
+       scat = &rm->data.op_sg[sg];
        sent = 0;
        i = 0;
 
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
         * or when requested by the user. Right now, we let
         * the application choose.
         */
-       if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+       if (rm->rdma.op_active && rm->rdma.op_fence)
                send_flags = IB_SEND_FENCE;
 
        /*
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* if there's data reference it with a chain of work reqs */
-       for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+       for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
                unsigned int len;
 
                send = &ic->i_sends[pos];
@@ -728,7 +729,7 @@ add_header:
                sent += sizeof(struct rds_header);
 
        /* if we finished the message then send completion owns it */
-       if (scat == &rm->m_sg[rm->m_count]) {
+       if (scat == &rm->data.op_sg[rm->data.op_count]) {
                prev->s_rm = ic->i_rm;
                prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
                ic->i_rm = NULL;
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
        ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
 }
 
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
        struct rds_iw_connection *ic = conn->c_transport_data;
        struct rds_iw_send_work *send = NULL;
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        struct rds_iw_device *rds_iwdev;
        struct scatterlist *scat;
        unsigned long len;
-       u64 remote_addr = op->r_remote_addr;
+       u64 remote_addr = op->op_remote_addr;
        u32 pos, fr_pos;
        u32 work_alloc;
        u32 i;
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
 
        /* map the message the first time we see it */
-       if (!op->r_mapped) {
-               op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-                                       op->r_sg, op->r_nents, (op->r_write) ?
-                                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-               if (op->r_count == 0) {
+       if (!op->op_mapped) {
+               op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+                                            op->op_sg, op->op_nents, (op->op_write) ?
+                                            DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+               if (op->op_count == 0) {
                        rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
                        ret = -ENOMEM; /* XXX ? */
                        goto out;
                }
 
-               op->r_mapped = 1;
+               op->op_mapped = 1;
        }
 
-       if (!op->r_write) {
+       if (!op->op_write) {
                /* Alloc space on the send queue for the fastreg */
                work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
                if (work_alloc != 1) {
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
         * Instead of knowing how to return a partial rdma read/write we insist that there
         * be enough work requests to send the entire message.
         */
-       i = ceil(op->r_count, rds_iwdev->max_sge);
+       i = ceil(op->op_count, rds_iwdev->max_sge);
 
        work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
        if (work_alloc != i) {
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        }
 
        send = &ic->i_sends[pos];
-       if (!op->r_write) {
+       if (!op->op_write) {
                first = prev = &ic->i_sends[fr_pos];
        } else {
                first = send;
                prev = NULL;
        }
-       scat = &op->r_sg[0];
+       scat = &op->op_sg[0];
        sent = 0;
-       num_sge = op->r_count;
+       num_sge = op->op_count;
 
-       for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+       for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
                send->s_wr.send_flags = 0;
                send->s_queued = jiffies;
 
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
                 * for local access after RDS is finished with it, using
                 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
                 */
-               if (op->r_write)
+               if (op->op_write)
                        send->s_wr.opcode = IB_WR_RDMA_WRITE;
                else
                        send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 
                send->s_wr.wr.rdma.remote_addr = remote_addr;
-               send->s_wr.wr.rdma.rkey = op->r_key;
+               send->s_wr.wr.rdma.rkey = op->op_rkey;
                send->s_op = op;
 
                if (num_sge > rds_iwdev->max_sge) {
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
                if (prev)
                        prev->s_wr.next = &send->s_wr;
 
-               for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+               for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
                        len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 
                        if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
        }
 
        /* if we finished the message then send completion owns it */
-       if (scat == &op->r_sg[op->r_count])
+       if (scat == &op->op_sg[op->op_count])
                first->s_wr.send_flags = IB_SEND_SIGNALED;
 
        if (i < work_alloc) {
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
         * adapters do not allow using the lkey for this at all.  To bypass this use a
         * fastreg_mr (or possibly a dma_mr)
         */
-       if (!op->r_write) {
+       if (!op->op_write) {
                rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
-                       op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+                       op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
                work_alloc++;
        }
 
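The RDS_EXTHDR_RDMA extension attached above is consumed on the receive
side (net/rds/recv.c, not part of this diff). A hedged sketch of that
consumer, assuming the extension-walking helper rds_message_next_extension()
from message.c; hdr, rs, and the union layout here are illustrative:

        unsigned int pos = 0, type, len;
        union {
                struct rds_ext_header_rdma rdma;
                /* ...other extension payloads... */
        } buf;

        while (1) {
                len = sizeof(buf);
                type = rds_message_next_extension(hdr, &pos, &buf, &len);
                if (type == RDS_EXTHDR_NONE)
                        break;
                /* the sender is done with this use-once MR; release it */
                if (type == RDS_EXTHDR_RDMA)
                        rds_rdma_unuse(rs, be32_to_cpu(buf.rdma.h_rdma_rkey), 0);
        }
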
index 1c4428a61a0259baf3c1af1611f23608ffea0c5d..23e3a9a26aaf36285fef39e8b909ce0d8b62327a 100644 (file)
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
                unregister_sysctl_table(rds_iw_sysctl_hdr);
 }
 
-int __init rds_iw_sysctl_init(void)
+int rds_iw_sysctl_init(void)
 {
        rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
-       if (rds_iw_sysctl_hdr == NULL)
+       if (!rds_iw_sysctl_hdr)
                return -ENOMEM;
        return 0;
 }
index dd9879379457e29bc9afcb45bacbb58b4ffd092e..c390156b426fc936c6b05fd607f4c32ddb1e3a81 100644 (file)
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
                         unsigned int hdr_off, unsigned int sg,
                         unsigned int off)
 {
+       /* Do not send cong updates over loopback; apply them directly. */
+       if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+               rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+               return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+       }
+
        BUG_ON(hdr_off || sg || off);
 
        rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
-       rds_message_addref(rm); /* for the inc */
+       /* For the embedded inc. Matching put is in rds_loop_inc_free() */
+       rds_message_addref(rm);
 
        rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
                          GFP_KERNEL, KM_USER0);
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
        return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
 }
 
-static int rds_loop_xmit_cong_map(struct rds_connection *conn,
-                                 struct rds_cong_map *map,
-                                 unsigned long offset)
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
 {
-       BUG_ON(offset);
-       BUG_ON(map != conn->c_lcong);
-
-       rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-
-       return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+       struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+
+       rds_message_put(rm);
 }
 
 /* we need to at least give the thread something to succeed */
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
        unsigned long flags;
 
        lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
-       if (lc == NULL)
+       if (!lc)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&lc->loop_node);
@@ -169,14 +174,12 @@ void rds_loop_exit(void)
  */
 struct rds_transport rds_loop_transport = {
        .xmit                   = rds_loop_xmit,
-       .xmit_cong_map          = rds_loop_xmit_cong_map,
        .recv                   = rds_loop_recv,
        .conn_alloc             = rds_loop_conn_alloc,
        .conn_free              = rds_loop_conn_free,
        .conn_connect           = rds_loop_conn_connect,
        .conn_shutdown          = rds_loop_conn_shutdown,
        .inc_copy_to_user       = rds_message_inc_copy_to_user,
-       .inc_purge              = rds_message_inc_purge,
-       .inc_free               = rds_message_inc_free,
+       .inc_free               = rds_loop_inc_free,
        .t_name                 = "loopback",
 };
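
rds_loop_inc_free() works because the rds_incoming is embedded in the
rds_message, so the inc's lifetime can ride on the rm's refcount: the
addref in rds_loop_xmit() and the put here bracket it. A standalone
sketch of that embedded-object refcount pattern (outer/inner are
illustrative names, not from this patch):

        struct outer {
                atomic_t        refcount;
                struct inner    embedded;
        };

        static void inner_get(struct inner *in)
        {
                struct outer *o = container_of(in, struct outer, embedded);

                atomic_inc(&o->refcount);       /* pin the outer object */
        }

        static void inner_put(struct inner *in)
        {
                struct outer *o = container_of(in, struct outer, embedded);

                if (atomic_dec_and_test(&o->refcount))
                        kfree(o);
        }
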
index 9a1d67e001ba60a79608275ecfa3ed452f373b19..84f937f11d475870710abbbfc746b06c4160b5f3 100644 (file)
@@ -34,9 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
-
-static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
 
 static unsigned int    rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_NONE]      = 0,
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
        if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
                return;
 
-       for (i = 0; i < rm->m_nents; i++) {
-               rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+       for (i = 0; i < rm->data.op_nents; i++) {
+               rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
                /* XXX will have to put_page for page refs */
-               __free_page(sg_page(&rm->m_sg[i]));
+               __free_page(sg_page(&rm->data.op_sg[i]));
        }
-       rm->m_nents = 0;
+       rm->data.op_nents = 0;
 
-       if (rm->m_rdma_op)
-               rds_rdma_free_op(rm->m_rdma_op);
-       if (rm->m_rdma_mr)
-               rds_mr_put(rm->m_rdma_mr);
-}
+       if (rm->rdma.op_active)
+               rds_rdma_free_op(&rm->rdma);
+       if (rm->rdma.op_rdma_mr)
+               rds_mr_put(rm->rdma.op_rdma_mr);
 
-void rds_message_inc_purge(struct rds_incoming *inc)
-{
-       struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-       rds_message_purge(rm);
+       if (rm->atomic.op_active)
+               rds_atomic_free_op(&rm->atomic);
+       if (rm->atomic.op_rdma_mr)
+               rds_mr_put(rm->atomic.op_rdma_mr);
 }
 
 void rds_message_put(struct rds_message *rm)
 {
        rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
-
+       if (atomic_read(&rm->m_refcount) == 0) {
+printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+WARN_ON(1);
+       }
        if (atomic_dec_and_test(&rm->m_refcount)) {
                BUG_ON(!list_empty(&rm->m_sock_item));
                BUG_ON(!list_empty(&rm->m_conn_item));
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_put);
 
-void rds_message_inc_free(struct rds_incoming *inc)
-{
-       struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-       rds_message_put(rm);
-}
-
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
                                 __be16 dport, u64 seq)
 {
@@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
 }
 EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
 
-struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize the number of memory allocations.
+ * Each rds op can then grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
 {
        struct rds_message *rm;
 
-       rm = kzalloc(sizeof(struct rds_message) +
-                    (nents * sizeof(struct scatterlist)), gfp);
+       rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
        if (!rm)
                goto out;
 
-       if (nents)
-               sg_init_table(rm->m_sg, nents);
+       rm->m_used_sgs = 0;
+       rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
        atomic_set(&rm->m_refcount, 1);
        INIT_LIST_HEAD(&rm->m_sock_item);
        INIT_LIST_HEAD(&rm->m_conn_item);
        spin_lock_init(&rm->m_rs_lock);
+       init_waitqueue_head(&rm->m_flush_wait);
 
 out:
        return rm;
 }
 
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+       struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+       struct scatterlist *sg_ret;
+
+       WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+       WARN_ON(!nents);
+
+       sg_ret = &sg_first[rm->m_used_sgs];
+       sg_init_table(sg_ret, nents);
+       rm->m_used_sgs += nents;
+
+       return sg_ret;
+}
+
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
 {
        struct rds_message *rm;
        unsigned int i;
+       int num_sgs = ceil(total_len, PAGE_SIZE);
+       int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-       rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-       if (rm == NULL)
+       rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+       if (!rm)
                return ERR_PTR(-ENOMEM);
 
        set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
        rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-       rm->m_nents = ceil(total_len, PAGE_SIZE);
+       rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+       rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
 
-       for (i = 0; i < rm->m_nents; ++i) {
-               sg_set_page(&rm->m_sg[i],
+       for (i = 0; i < rm->data.op_nents; ++i) {
+               sg_set_page(&rm->data.op_sg[i],
                                virt_to_page(page_addrs[i]),
                                PAGE_SIZE, 0);
        }
@@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
        return rm;
 }
 
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
                                               size_t total_len)
 {
        unsigned long to_copy;
        unsigned long iov_off;
        unsigned long sg_off;
-       struct rds_message *rm;
        struct iovec *iov;
        struct scatterlist *sg;
-       int ret;
-
-       rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-       if (rm == NULL) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       int ret = 0;
 
        rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 
        /*
         * now allocate and copy in the data payload.
         */
-       sg = rm->m_sg;
+       sg = rm->data.op_sg;
        iov = first_iov;
        iov_off = 0;
        sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
        while (total_len) {
-               if (sg_page(sg) == NULL) {
+               if (!sg_page(sg)) {
                        ret = rds_page_remainder_alloc(sg, total_len,
                                                       GFP_HIGHUSER);
                        if (ret)
                                goto out;
-                       rm->m_nents++;
+                       rm->data.op_nents++;
                        sg_off = 0;
                }
 
@@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
                        sg++;
        }
 
-       ret = 0;
 out:
-       if (ret) {
-               if (rm)
-                       rds_message_put(rm);
-               rm = ERR_PTR(ret);
-       }
-       return rm;
+       return ret;
 }
 
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
@@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 
        iov = first_iov;
        iov_off = 0;
-       sg = rm->m_sg;
+       sg = rm->data.op_sg;
        vec_off = 0;
        copied = 0;
 
@@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
  */
 void rds_message_wait(struct rds_message *rm)
 {
-       wait_event(rds_message_flush_waitq,
+       wait_event_interruptible(rm->m_flush_wait,
                        !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
 }
 
 void rds_message_unmapped(struct rds_message *rm)
 {
        clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-       if (waitqueue_active(&rds_message_flush_waitq))
-               wake_up(&rds_message_flush_waitq);
+       wake_up_interruptible(&rm->m_flush_wait);
 }
 EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
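The allocation scheme above splits responsibility three ways: the caller
sizes the trailing SG pool, rds_message_alloc() reserves it, and each op
carves out its entries with rds_message_alloc_sgs(). A hedged sketch of a
sender using it (in this series the real caller is rds_sendmsg();
total_len and first_iov are assumed inputs, ceil() is the rds.h helper):

        int num_sgs = ceil(total_len, PAGE_SIZE);
        struct rds_message *rm;
        int ret;

        rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), GFP_KERNEL);
        if (!rm)
                return -ENOMEM;

        rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
        ret = rds_message_copy_from_user(rm, first_iov, total_len);
        if (ret) {
                rds_message_put(rm);
                return ret;
        }
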
index 595a952d4b17f069c60a457701d6e207f68e621b..5e44f5ae78987315757b1243f7bcd43373bd5df4 100644 (file)
@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
        /* jump straight to allocation if we're trying for a huge page */
        if (bytes >= PAGE_SIZE) {
                page = alloc_page(gfp);
-               if (page == NULL) {
+               if (!page) {
                        ret = -ENOMEM;
                } else {
                        sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
                rem = &per_cpu(rds_page_remainders, get_cpu());
                local_irq_save(flags);
 
-               if (page == NULL) {
+               if (!page) {
                        ret = -ENOMEM;
                        break;
                }
@@ -186,6 +186,7 @@ out:
                 ret ? 0 : scat->length);
        return ret;
 }
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
 
 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
index 75fd13bb631bbc06bf8493c06c266951b6e47c95..48064673fc76dc7b19792fe3d5a7bcb49592ef98 100644 (file)
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rdma.h"
+#include "rds.h"
 
 /*
  * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
        struct rds_mr *mr;
        struct rb_node *node;
+       unsigned long flags;
 
        /* Release any MRs associated with this socket */
+       spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        while ((node = rb_first(&rs->rs_rdma_keys))) {
                mr = container_of(node, struct rds_mr, r_rb_node);
                if (mr->r_trans == rs->rs_transport)
                        mr->r_invalidate = 0;
+               rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+               RB_CLEAR_NODE(&mr->r_rb_node);
+               spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+               rds_destroy_mr(mr);
                rds_mr_put(mr);
+               spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        }
+       spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
        if (rs->rs_transport && rs->rs_transport->flush_mrs)
                rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
                goto out;
        }
 
-       if (rs->rs_transport->get_mr == NULL) {
+       if (!rs->rs_transport->get_mr) {
                ret = -EOPNOTSUPP;
                goto out;
        }
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
        /* XXX clamp nr_pages to limit the size of this alloc? */
        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-       if (pages == NULL) {
+       if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
 
        mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-       if (mr == NULL) {
+       if (!mr) {
                ret = -ENOMEM;
                goto out;
        }
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
         * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
         * the zero page.
         */
-       ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+       ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
        if (ret < 0)
                goto out;
 
        nents = ret;
        sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-       if (sg == NULL) {
+       if (!sg) {
                ret = -ENOMEM;
                goto out;
        }
@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
        spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-       if (mr && (mr->r_use_once || force)) {
+       if (!mr) {
+               printk(KERN_ERR "RDS: trying to unuse MR with unknown r_key %u!\n", r_key);
+               spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+               return;
+       }
+
+       if (mr->r_use_once || force) {
                rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
                RB_CLEAR_NODE(&mr->r_rb_node);
                zot_me = 1;
-       } else if (mr)
-               atomic_inc(&mr->r_refcount);
+       }
        spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
        /* May have to issue a dma_sync on this memory region.
         * Note we could avoid this if the operation was a RDMA READ,
         * but at this point we can't tell. */
-       if (mr != NULL) {
-               if (mr->r_trans->sync_mr)
-                       mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-               /* If the MR was marked as invalidate, this will
-                * trigger an async flush. */
-               if (zot_me)
-                       rds_destroy_mr(mr);
-               rds_mr_put(mr);
-       }
+       if (mr->r_trans->sync_mr)
+               mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+       /* If the MR was marked as invalidate, this will
+        * trigger an async flush. */
+       if (zot_me)
+               rds_destroy_mr(mr);
+       rds_mr_put(mr);
 }
 
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
        unsigned int i;
 
-       for (i = 0; i < ro->r_nents; i++) {
-               struct page *page = sg_page(&ro->r_sg[i]);
+       for (i = 0; i < ro->op_nents; i++) {
+               struct page *page = sg_page(&ro->op_sg[i]);
 
                /* Mark page dirty if it was possibly modified, which
                 * is the case for a RDMA_READ which copies from remote
                 * to local memory */
-               if (!ro->r_write) {
-                       BUG_ON(in_interrupt());
+               if (!ro->op_write) {
+                       BUG_ON(irqs_disabled());
                        set_page_dirty(page);
                }
                put_page(page);
        }
 
-       kfree(ro->r_notifier);
-       kfree(ro);
+       kfree(ro->op_notifier);
+       ro->op_notifier = NULL;
+       ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+       struct page *page = sg_page(ao->op_sg);
+
+       /* Mark the page dirty unconditionally: an atomic op always
+        * writes the prior remote value back to the local page. */
+       set_page_dirty(page);
+       put_page(page);
+
+       kfree(ao->op_notifier);
+       ao->op_notifier = NULL;
+       ao->op_active = 0;
+}
+
+
+/*
+ * Count the number of pages needed to describe the user-supplied iovec.
+ */
+static int rds_rdma_pages(struct rds_rdma_args *args)
+{
+       struct rds_iovec vec;
+       struct rds_iovec __user *local_vec;
+       unsigned int tot_pages = 0;
+       unsigned int nr_pages;
+       unsigned int i;
+
+       local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+       /* figure out the number of pages in the vector */
+       for (i = 0; i < args->nr_local; i++) {
+               if (copy_from_user(&vec, &local_vec[i],
+                                  sizeof(struct rds_iovec)))
+                       return -EFAULT;
+
+               nr_pages = rds_pages_in_vec(&vec);
+               if (nr_pages == 0)
+                       return -EINVAL;
+
+               tot_pages += nr_pages;
+       }
+
+       return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+       return rds_rdma_pages(args) * sizeof(struct scatterlist);
 }
 
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * The application asks for an RDMA transfer.
+ * Extract all arguments and set up the rdma_op.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-                                           struct rds_rdma_args *args)
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg)
 {
+       struct rds_rdma_args *args;
        struct rds_iovec vec;
-       struct rds_rdma_op *op = NULL;
+       struct rm_rdma_op *op = &rm->rdma;
        unsigned int nr_pages;
-       unsigned int max_pages;
        unsigned int nr_bytes;
        struct page **pages = NULL;
        struct rds_iovec __user *local_vec;
-       struct scatterlist *sg;
        unsigned int nr;
        unsigned int i, j;
-       int ret;
+       int ret = 0;
+
+       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
+           rm->rdma.op_active)
+               return -EINVAL;
 
+       args = CMSG_DATA(cmsg);
 
        if (rs->rs_bound_addr == 0) {
                ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
                goto out;
        }
 
-       nr_pages = 0;
-       max_pages = 0;
-
-       local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-       /* figure out the number of pages in the vector */
-       for (i = 0; i < args->nr_local; i++) {
-               if (copy_from_user(&vec, &local_vec[i],
-                                  sizeof(struct rds_iovec))) {
-                       ret = -EFAULT;
-                       goto out;
-               }
-
-               nr = rds_pages_in_vec(&vec);
-               if (nr == 0) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               max_pages = max(nr, max_pages);
-               nr_pages += nr;
-       }
-
-       pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-       if (pages == NULL) {
-               ret = -ENOMEM;
+       ret = rds_rdma_pages(args);
+       if (ret < 0)
                goto out;
+       nr_pages = ret;
-       }
 
-       op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-       if (op == NULL) {
+       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+       if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
 
-       op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-       op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-       op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-       op->r_recverr = rs->rs_recverr;
+       op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+       op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+       op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+       op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+       op->op_active = 1;
+       op->op_recverr = rs->rs_recverr;
        WARN_ON(!nr_pages);
-       sg_init_table(op->r_sg, nr_pages);
+       op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
 
-       if (op->r_notify || op->r_recverr) {
+       if (op->op_notify || op->op_recverr) {
                /* We allocate an uninitialized notifier here, because
                 * we don't want to do that in the completion handler. We
                 * would have to use GFP_ATOMIC there, and don't want to deal
                 * with failed allocations.
                 */
-               op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-               if (!op->r_notifier) {
+               op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+               if (!op->op_notifier) {
                        ret = -ENOMEM;
                        goto out;
                }
-               op->r_notifier->n_user_token = args->user_token;
-               op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+               op->op_notifier->n_user_token = args->user_token;
+               op->op_notifier->n_status = RDS_RDMA_SUCCESS;
        }
 
        /* The cookie contains the R_Key of the remote memory region, and
@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
         * destination address (which is really an offset into the MR)
         * FIXME: We may want to move this into ib_rdma.c
         */
-       op->r_key = rds_rdma_cookie_key(args->cookie);
-       op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+       op->op_rkey = rds_rdma_cookie_key(args->cookie);
+       op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
        nr_bytes = 0;
 
        rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
               (unsigned long long)args->nr_local,
               (unsigned long long)args->remote_vec.addr,
-              op->r_key);
+              op->op_rkey);
+
+       local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
 
        for (i = 0; i < args->nr_local; i++) {
                if (copy_from_user(&vec, &local_vec[i],
@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
                rs->rs_user_addr = vec.addr;
                rs->rs_user_bytes = vec.bytes;
 
-               /* did the user change the vec under us? */
-               if (nr > max_pages || op->r_nents + nr > nr_pages) {
-                       ret = -EINVAL;
-                       goto out;
-               }
                /* If it's a WRITE operation, we want to pin the pages for reading.
                 * If it's a READ operation, we need to pin the pages for writing.
                 */
-               ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+               ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write);
                if (ret < 0)
                        goto out;
 
@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 
                for (j = 0; j < nr; j++) {
                        unsigned int offset = vec.addr & ~PAGE_MASK;
+                       struct scatterlist *sg;
 
-                       sg = &op->r_sg[op->r_nents + j];
+                       sg = &op->op_sg[op->op_nents + j];
                        sg_set_page(sg, pages[j],
                                        min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
                                        offset);
@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
                        vec.bytes -= sg->length;
                }
 
-               op->r_nents += nr;
+               op->op_nents += nr;
        }
 
-
        if (nr_bytes > args->remote_vec.bytes) {
                rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
                                nr_bytes,
@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
                ret = -EINVAL;
                goto out;
        }
-       op->r_bytes = nr_bytes;
+       op->op_bytes = nr_bytes;
 
        ret = 0;
 out:
        kfree(pages);
-       if (ret) {
-               if (op)
-                       rds_rdma_free_op(op);
-               op = ERR_PTR(ret);
-       }
-       return op;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg)
-{
-       struct rds_rdma_op *op;
-
-       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-           rm->m_rdma_op != NULL)
-               return -EINVAL;
+       if (ret)
+               rds_rdma_free_op(op);
 
-       op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-       if (IS_ERR(op))
-               return PTR_ERR(op);
        rds_stats_inc(s_send_rdma);
-       rm->m_rdma_op = op;
-       return 0;
+
+       return ret;
 }
 
 /*
@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
        spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-       if (mr == NULL)
+       if (!mr)
                err = -EINVAL;  /* invalid r_key */
        else
                atomic_inc(&mr->r_refcount);
@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
        if (mr) {
                mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-               rm->m_rdma_mr = mr;
+               rm->rdma.op_rdma_mr = mr;
        }
        return err;
 }
@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
            rm->m_rdma_cookie != 0)
                return -EINVAL;
 
-       return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+       return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+                   struct cmsghdr *cmsg)
+{
+       struct page *page = NULL;
+       struct rds_atomic_args *args;
+       int ret = 0;
+
+       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) ||
+           rm->atomic.op_active)
+               return -EINVAL;
+
+       args = CMSG_DATA(cmsg);
+
+       /* Nonmasked & masked cmsg ops converted to masked hw ops */
+       switch (cmsg->cmsg_type) {
+       case RDS_CMSG_ATOMIC_FADD:
+               rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+               rm->atomic.op_m_fadd.add = args->fadd.add;
+               rm->atomic.op_m_fadd.nocarry_mask = 0;
+               break;
+       case RDS_CMSG_MASKED_ATOMIC_FADD:
+               rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+               rm->atomic.op_m_fadd.add = args->m_fadd.add;
+               rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+               break;
+       case RDS_CMSG_ATOMIC_CSWP:
+               rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+               rm->atomic.op_m_cswp.compare = args->cswp.compare;
+               rm->atomic.op_m_cswp.swap = args->cswp.swap;
+               rm->atomic.op_m_cswp.compare_mask = ~0;
+               rm->atomic.op_m_cswp.swap_mask = ~0;
+               break;
+       case RDS_CMSG_MASKED_ATOMIC_CSWP:
+               rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+               rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+               rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+               rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+               rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+               break;
+       default:
+               BUG(); /* should never happen */
+       }
+
+       rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+       rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+       rm->atomic.op_active = 1;
+       rm->atomic.op_recverr = rs->rs_recverr;
+       rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+
+       /* verify 8 byte-aligned */
+       if (args->local_addr & 0x7) {
+               ret = -EFAULT;
+               goto err;
+       }
+
+       ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+       if (ret != 1) {
+               if (ret >= 0)
+                       ret = -EFAULT;
+               goto err;
+       }
+       ret = 0;
+
+       sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+       if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+               /* We allocate an uninitialized notifier here, because
+                * we don't want to do that in the completion handler. We
+                * would have to use GFP_ATOMIC there, and don't want to deal
+                * with failed allocations.
+                */
+               rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+               if (!rm->atomic.op_notifier) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               rm->atomic.op_notifier->n_user_token = args->user_token;
+               rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+       }
+
+       rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+       rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+       return ret;
+err:
+       if (page)
+               put_page(page);
+       kfree(rm->atomic.op_notifier);
+
+       return ret;
 }
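
rds_cmsg_atomic() is driven entirely from sendmsg() control data. A
hedged userspace sketch of requesting a fetch-and-add; the struct
rds_atomic_args layout is assumed from the field accesses above, and
cookie, remote_offset, my_token, and fd are illustrative:

        uint64_t val __attribute__((aligned(8)));       /* old value lands here */
        struct rds_atomic_args args = {
                .cookie      = cookie,  /* from a prior RDS_CMSG_RDMA_MAP */
                .local_addr  = (uint64_t)(unsigned long)&val,
                .remote_addr = remote_offset,
                .fadd.add    = 1,
                .user_token  = my_token,
                .flags       = RDS_RDMA_NOTIFY_ME,
        };
        char ctl[CMSG_SPACE(sizeof(args))];
        struct msghdr msg = { .msg_control = ctl, .msg_controllen = sizeof(ctl) };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_RDS;
        cmsg->cmsg_type  = RDS_CMSG_ATOMIC_FADD;
        cmsg->cmsg_len   = CMSG_LEN(sizeof(args));
        memcpy(CMSG_DATA(cmsg), &args, sizeof(args));
        /* msg_name/msg_iov setup and sendmsg(fd, &msg, 0) omitted */
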
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644 (file)
index 909c398..0000000
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _RDS_RDMA_H
-#define _RDS_RDMA_H
-
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/scatterlist.h>
-
-#include "rds.h"
-
-struct rds_mr {
-       struct rb_node          r_rb_node;
-       atomic_t                r_refcount;
-       u32                     r_key;
-
-       /* A copy of the creation flags */
-       unsigned int            r_use_once:1;
-       unsigned int            r_invalidate:1;
-       unsigned int            r_write:1;
-
-       /* This is for RDS_MR_DEAD.
-        * It would be nice & consistent to make this part of the above
-        * bit field here, but we need to use test_and_set_bit.
-        */
-       unsigned long           r_state;
-       struct rds_sock         *r_sock; /* back pointer to the socket that owns us */
-       struct rds_transport    *r_trans;
-       void                    *r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD            0
-
-struct rds_rdma_op {
-       u32                     r_key;
-       u64                     r_remote_addr;
-       unsigned int            r_write:1;
-       unsigned int            r_fence:1;
-       unsigned int            r_notify:1;
-       unsigned int            r_recverr:1;
-       unsigned int            r_mapped:1;
-       struct rds_notifier     *r_notifier;
-       unsigned int            r_bytes;
-       unsigned int            r_nents;
-       unsigned int            r_count;
-       struct scatterlist      r_sg[0];
-};
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
-       return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
-       return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
-       return cookie >> 32;
-}
-
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rds_rdma_op *ro);
-void rds_rdma_send_complete(struct rds_message *rm, int);
-
-extern void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
-       if (atomic_dec_and_test(&mr->r_refcount))
-               __rds_put_mr_final(mr);
-}
-
-#endif
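
The cookie helpers deleted with this header move verbatim into rds.h
(below): the R_Key sits in the low 32 bits of the cookie, the MR offset
in the high 32. A quick round-trip check:

        rds_rdma_cookie_t cookie = rds_rdma_make_cookie(0x1234, 0x40);

        /* cookie == 0x0000004000001234ULL */
        BUG_ON(rds_rdma_cookie_key(cookie) != 0x1234);
        BUG_ON(rds_rdma_cookie_offset(cookie) != 0x40);
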
index e599ba2f950d72e03c4bc98f7b1a9b86742134e7..e6ed10aee190afa60a74b9f07222dc7ad8083fd0 100644 (file)
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+               [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+       RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+       RDS_CM_EVENT_STRING(ADDR_ERROR),
+       RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+       RDS_CM_EVENT_STRING(ROUTE_ERROR),
+       RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+       RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+       RDS_CM_EVENT_STRING(CONNECT_ERROR),
+       RDS_CM_EVENT_STRING(UNREACHABLE),
+       RDS_CM_EVENT_STRING(REJECTED),
+       RDS_CM_EVENT_STRING(ESTABLISHED),
+       RDS_CM_EVENT_STRING(DISCONNECTED),
+       RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+       RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+       RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+       RDS_CM_EVENT_STRING(ADDR_CHANGE),
+       RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+       return rds_str_array(rds_cm_event_strings,
+                            ARRAY_SIZE(rds_cm_event_strings), type);
+}
+
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event)
 {
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
        struct rds_transport *trans;
        int ret = 0;
 
-       rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
-                event->event);
+       rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+                event->event, rds_cm_event_str(event->event));
 
        if (cm_id->device->node_type == RDMA_NODE_RNIC)
                trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
        default:
                /* things like device disconnect? */
-               printk(KERN_ERR "RDS: unknown event %u!\n", event->event);
+               printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+                      event->event, rds_cm_event_str(event->event));
                break;
        }
 
@@ -117,12 +146,13 @@ out:
        if (conn)
                mutex_unlock(&conn->c_cm_lock);
 
-       rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+       rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+                rds_cm_event_str(event->event), ret);
 
        return ret;
 }
 
-static int __init rds_rdma_listen_init(void)
+static int rds_rdma_listen_init(void)
 {
        struct sockaddr_in sin;
        struct rdma_cm_id *cm_id;
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
        }
 }
 
-int __init rds_rdma_init(void)
+int rds_rdma_init(void)
 {
        int ret;
 
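rds_cm_event_str() above leans on rds_str_array(), which this series adds
in af_rds.c. A minimal sketch of its presumed contract (a bounds-checked
lookup with a safe fallback), not a quote of the actual helper:

        char *rds_str_array(char **array, size_t elements, size_t index)
        {
                if (index < elements && array[index])
                        return array[index];
                return "unknown";
        }
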
index c224b5bb3ba9368fd00fad5f6a25f90d68364ebd..8103dcf8b97679b1f73ed1a339d348fd0da6a28b 100644 (file)
@@ -80,6 +80,7 @@ enum {
 /* Bits for c_flags */
 #define RDS_LL_SEND_FULL       0
 #define RDS_RECONNECT_PENDING  1
+#define RDS_IN_XMIT            2
 
 struct rds_connection {
        struct hlist_node       c_hash_node;
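
RDS_IN_XMIT replaces the c_send_lock mutex (removed just below) with a
flag bit that acts as a non-blocking trylock on the transmit path. A
hedged sketch of the idiom as the rest of this series presumably uses it
in send.c (acquire_in_xmit/release_in_xmit are illustrative names; the
c_waitq they pair with is added further down):

        static int acquire_in_xmit(struct rds_connection *conn)
        {
                return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
        }

        static void release_in_xmit(struct rds_connection *conn)
        {
                clear_bit(RDS_IN_XMIT, &conn->c_flags);
                smp_mb__after_clear_bit();      /* flag visible before wakeup */
                wake_up_all(&conn->c_waitq);
        }
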
@@ -91,12 +92,13 @@ struct rds_connection {
        struct rds_cong_map     *c_lcong;
        struct rds_cong_map     *c_fcong;
 
-       struct mutex            c_send_lock;    /* protect send ring */
        struct rds_message      *c_xmit_rm;
        unsigned long           c_xmit_sg;
        unsigned int            c_xmit_hdr_off;
        unsigned int            c_xmit_data_off;
+       unsigned int            c_xmit_atomic_sent;
        unsigned int            c_xmit_rdma_sent;
+       unsigned int            c_xmit_data_sent;
 
        spinlock_t              c_lock;         /* protect msg queues */
        u64                     c_next_tx_seq;
@@ -116,11 +118,10 @@ struct rds_connection {
        struct delayed_work     c_conn_w;
        struct work_struct      c_down_w;
        struct mutex            c_cm_lock;      /* protect conn state & cm */
+       wait_queue_head_t       c_waitq;
 
        struct list_head        c_map_item;
        unsigned long           c_map_queued;
-       unsigned long           c_map_offset;
-       unsigned long           c_map_bytes;
 
        unsigned int            c_unacked_packets;
        unsigned int            c_unacked_bytes;
@@ -206,6 +207,48 @@ struct rds_incoming {
        rds_rdma_cookie_t       i_rdma_cookie;
 };
 
+struct rds_mr {
+       struct rb_node          r_rb_node;
+       atomic_t                r_refcount;
+       u32                     r_key;
+
+       /* A copy of the creation flags */
+       unsigned int            r_use_once:1;
+       unsigned int            r_invalidate:1;
+       unsigned int            r_write:1;
+
+       /* This is for RDS_MR_DEAD.
+        * It would be nice & consistent to make this part of the above
+        * bit field here, but we need to use test_and_set_bit.
+        */
+       unsigned long           r_state;
+       struct rds_sock         *r_sock; /* back pointer to the socket that owns us */
+       struct rds_transport    *r_trans;
+       void                    *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD            0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+       return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+       return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+       return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP           0
+#define RDS_ATOMIC_TYPE_FADD           1
+
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -258,13 +301,71 @@ struct rds_message {
         *   -> rs->rs_lock
         */
        spinlock_t              m_rs_lock;
+       wait_queue_head_t       m_flush_wait;
+
        struct rds_sock         *m_rs;
-       struct rds_rdma_op      *m_rdma_op;
+
+       /* cookie to send to remote, in rds header */
        rds_rdma_cookie_t       m_rdma_cookie;
-       struct rds_mr           *m_rdma_mr;
-       unsigned int            m_nents;
-       unsigned int            m_count;
-       struct scatterlist      m_sg[0];
+
+       unsigned int            m_used_sgs;
+       unsigned int            m_total_sgs;
+
+       void                    *m_final_op;
+
+       struct {
+               struct rm_atomic_op {
+                       int                     op_type;
+                       union {
+                               struct {
+                                       uint64_t        compare;
+                                       uint64_t        swap;
+                                       uint64_t        compare_mask;
+                                       uint64_t        swap_mask;
+                               } op_m_cswp;
+                               struct {
+                                       uint64_t        add;
+                                       uint64_t        nocarry_mask;
+                               } op_m_fadd;
+                       };
+
+                       u32                     op_rkey;
+                       u64                     op_remote_addr;
+                       unsigned int            op_notify:1;
+                       unsigned int            op_recverr:1;
+                       unsigned int            op_mapped:1;
+                       unsigned int            op_silent:1;
+                       unsigned int            op_active:1;
+                       struct scatterlist      *op_sg;
+                       struct rds_notifier     *op_notifier;
+
+                       struct rds_mr           *op_rdma_mr;
+               } atomic;
+               struct rm_rdma_op {
+                       u32                     op_rkey;
+                       u64                     op_remote_addr;
+                       unsigned int            op_write:1;
+                       unsigned int            op_fence:1;
+                       unsigned int            op_notify:1;
+                       unsigned int            op_recverr:1;
+                       unsigned int            op_mapped:1;
+                       unsigned int            op_silent:1;
+                       unsigned int            op_active:1;
+                       unsigned int            op_bytes;
+                       unsigned int            op_nents;
+                       unsigned int            op_count;
+                       struct scatterlist      *op_sg;
+                       struct rds_notifier     *op_notifier;
+
+                       struct rds_mr           *op_rdma_mr;
+               } rdma;
+               struct rm_data_op {
+                       unsigned int            op_active:1;
+                       unsigned int            op_nents;
+                       unsigned int            op_count;
+                       struct scatterlist      *op_sg;
+               } data;
+       };
 };
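Note that atomic, rdma and data above live in an anonymous struct, not a union: a single rds_message can carry all three at once, each gated by its op_active bit. A sketch of how the send path drives them (mirroring rds_send_xmit() further down, simplified):

	if (rm->rdma.op_active)			/* all-or-nothing */
		ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
	if (rm->atomic.op_active)		/* all-or-nothing */
		ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
	if (rm->data.op_active)			/* may make partial progress */
		ret = conn->c_trans->xmit(conn, rm, hdr_off, sg, off);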
 
 /*
@@ -305,10 +406,6 @@ struct rds_notifier {
  *                 transport is responsible for other serialization, including
  *                 rds_recv_incoming().  This is called in process context but
  *                 should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- *                given connection.  XXX get a better story about the bitmap
- *                flag and header.
  */
 
 #define RDS_TRANS_IB   0
@@ -332,13 +429,11 @@ struct rds_transport {
        void (*xmit_complete)(struct rds_connection *conn);
        int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
                    unsigned int hdr_off, unsigned int sg, unsigned int off);
-       int (*xmit_cong_map)(struct rds_connection *conn,
-                            struct rds_cong_map *map, unsigned long offset);
-       int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+       int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+       int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
        int (*recv)(struct rds_connection *conn);
        int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
                                size_t size);
-       void (*inc_purge)(struct rds_incoming *inc);
        void (*inc_free)(struct rds_incoming *inc);
 
        int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -367,17 +462,11 @@ struct rds_sock {
         * bound_addr used for both incoming and outgoing, no INADDR_ANY
         * support.
         */
-       struct rb_node          rs_bound_node;
+       struct hlist_node       rs_bound_node;
        __be32                  rs_bound_addr;
        __be32                  rs_conn_addr;
        __be16                  rs_bound_port;
        __be16                  rs_conn_port;
-
-       /*
-        * This is only used to communicate the transport between bind and
-        * initiating connections.  All other trans use is referenced through
-        * the connection.
-        */
        struct rds_transport    *rs_transport;
 
        /*
@@ -466,8 +555,8 @@ struct rds_statistics {
        uint64_t        s_recv_ping;
        uint64_t        s_send_queue_empty;
        uint64_t        s_send_queue_full;
-       uint64_t        s_send_sem_contention;
-       uint64_t        s_send_sem_queue_raced;
+       uint64_t        s_send_lock_contention;
+       uint64_t        s_send_lock_queue_raced;
        uint64_t        s_send_immediate_retry;
        uint64_t        s_send_delayed_retry;
        uint64_t        s_send_drop_acked;
@@ -487,6 +576,7 @@ struct rds_statistics {
 };
 
 /* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
 void rds_sock_addref(struct rds_sock *rs);
 void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -521,15 +611,17 @@ void rds_cong_exit(void);
 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
 
 /* conn.c */
-int __init rds_conn_init(void);
+int rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
                                       struct rds_transport *trans, gfp_t gfp);
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
                               struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens,
@@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn)
 
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
                                               size_t total_len);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
                                 struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
 void rds_message_inc_free(struct rds_incoming *inc);
 void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
@@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
                         is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
 void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
-                                        struct rds_rdma_op *);
+                                        struct rm_rdma_op *);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+                   struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+       if (atomic_dec_and_test(&mr->r_refcount))
+               __rds_put_mr_final(mr);
+}
 
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
@@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
        put_cpu();                                      \
 } while (0)
 #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int __init rds_stats_init(void);
+int rds_stats_init(void);
 void rds_stats_exit(void);
 void rds_stats_info_copy(struct rds_info_iterator *iter,
                         uint64_t *values, const char *const *names,
                         size_t nr);
 
 /* sysctl.c */
-int __init rds_sysctl_init(void);
+int rds_sysctl_init(void);
 void rds_sysctl_exit(void);
 extern unsigned long rds_sysctl_sndbuf_min;
 extern unsigned long rds_sysctl_sndbuf_default;
@@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags;
 extern unsigned int  rds_sysctl_trace_level;
 
 /* threads.c */
-int __init rds_threads_init(void);
+int rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
 void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);
@@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn);
 int rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
 struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
                                       unsigned int avail);
-int __init rds_trans_init(void);
+int rds_trans_init(void);
 void rds_trans_exit(void);
 
 #endif
index c93588c2d553cf6b162ab500cf1fd72dbbc9c26c..68800f02aa3047c12f59eb47c7c71db84c5127aa 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/in.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
                  __be32 saddr)
@@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
        }
 
        rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
-       if (rs == NULL) {
+       if (!rs) {
                rds_stats_inc(s_recv_drop_no_sock);
                goto out;
        }
@@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
 {
        unsigned long flags;
 
-       if (*inc == NULL) {
+       if (!*inc) {
                read_lock_irqsave(&rs->rs_recv_lock, flags);
                if (!list_empty(&rs->rs_recv_queue)) {
                        *inc = list_entry(rs->rs_recv_queue.next,
@@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 
                if (msghdr) {
                        cmsg.user_token = notifier->n_user_token;
-                       cmsg.status  = notifier->n_status;
+                       cmsg.status = notifier->n_status;
 
                        err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
-                                       sizeof(cmsg), &cmsg);
+                                      sizeof(cmsg), &cmsg);
                        if (err)
                                break;
                }
index 9c1c6bcaa6c9532e9abdbbd198b8ab6e6c7b9400..9b951a0ab6b7f6a7c3bdb14a7e1c23e630f3ec99 100644 (file)
@@ -37,7 +37,6 @@
 #include <linux/list.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 /* When transmitting messages in rds_send_xmit, we need to emerge from
 * time to time and briefly release the CPU. Otherwise the softlockup watchdog
@@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444);
 MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
 
 /*
- * Reset the send state. Caller must hold c_send_lock when calling here.
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
  */
 void rds_send_reset(struct rds_connection *conn)
 {
@@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn)
        unsigned long flags;
 
        if (conn->c_xmit_rm) {
+               rm = conn->c_xmit_rm;
+               conn->c_xmit_rm = NULL;
                /* Tell the user the RDMA op is no longer mapped by the
                 * transport. This isn't entirely true (it's flushed out
                 * independently) but as the connection is down, there's
                 * no ongoing RDMA to/from that memory */
-               rds_message_unmapped(conn->c_xmit_rm);
-               rds_message_put(conn->c_xmit_rm);
-               conn->c_xmit_rm = NULL;
+               rds_message_unmapped(rm);
+               rds_message_put(rm);
        }
+
        conn->c_xmit_sg = 0;
        conn->c_xmit_hdr_off = 0;
        conn->c_xmit_data_off = 0;
+       conn->c_xmit_atomic_sent = 0;
        conn->c_xmit_rdma_sent = 0;
+       conn->c_xmit_data_sent = 0;
 
        conn->c_map_queued = 0;
 
@@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn)
        spin_unlock_irqrestore(&conn->c_lock, flags);
 }
 
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+       return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+       clear_bit(RDS_IN_XMIT, &conn->c_flags);
+       smp_mb__after_clear_bit();
+       /*
+        * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+        * hot path and finding waiters is very rare.  We don't want to walk
+        * the system-wide hashed waitqueue buckets in the fast path only to
+        * almost never find waiters.
+        */
+       if (waitqueue_active(&conn->c_waitq))
+               wake_up_all(&conn->c_waitq);
+}
+
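RDS_IN_XMIT and c_waitq together replace the old c_send_lock mutex. The wake_up_all() above pairs with a waiter on the shutdown side; a minimal sketch of that waiter, assuming it simply sleeps until the bit drops:

	wait_event(conn->c_waitq,
		   !test_bit(RDS_IN_XMIT, &conn->c_flags));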
 /*
 * We're making the conscious trade-off here to only send one message
  * down the connection at a time.
@@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn)
        struct rds_message *rm;
        unsigned long flags;
        unsigned int tmp;
-       unsigned int send_quota = send_batch_count;
        struct scatterlist *sg;
        int ret = 0;
-       int was_empty = 0;
        LIST_HEAD(to_be_dropped);
 
+restart:
+
        /*
         * sendmsg calls here after having queued its message on the send
         * queue.  We only have one task feeding the connection at a time.  If
         * another thread is already feeding the queue then we back off.  This
         * avoids blocking the caller and trading per-connection data between
         * caches per message.
-        *
-        * The sem holder will issue a retry if they notice that someone queued
-        * a message after they stopped walking the send queue but before they
-        * dropped the sem.
         */
-       if (!mutex_trylock(&conn->c_send_lock)) {
-               rds_stats_inc(s_send_sem_contention);
+       if (!acquire_in_xmit(conn)) {
+               rds_stats_inc(s_send_lock_contention);
                ret = -ENOMEM;
                goto out;
        }
 
+       /*
+        * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT;
+        * we do the opposite (set RDS_IN_XMIT, then test the state) to avoid
+        * races.
+        */
+       if (!rds_conn_up(conn)) {
+               release_in_xmit(conn);
+               ret = 0;
+               goto out;
+       }
+
        if (conn->c_trans->xmit_prepare)
                conn->c_trans->xmit_prepare(conn);
 
        /*
         * spin trying to push headers and data down the connection until
-        * the connection doens't make forward progress.
+        * the connection doesn't make forward progress.
         */
-       while (--send_quota) {
-               /*
-                * See if need to send a congestion map update if we're
-                * between sending messages.  The send_sem protects our sole
-                * use of c_map_offset and _bytes.
-                * Note this is used only by transports that define a special
-                * xmit_cong_map function. For all others, we create allocate
-                * a cong_map message and treat it just like any other send.
-                */
-               if (conn->c_map_bytes) {
-                       ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-                                               conn->c_map_offset);
-                       if (ret <= 0)
-                               break;
+       while (1) {
 
-                       conn->c_map_offset += ret;
-                       conn->c_map_bytes -= ret;
-                       if (conn->c_map_bytes)
-                               continue;
-               }
-
-               /* If we're done sending the current message, clear the
-                * offset and S/G temporaries.
-                */
                rm = conn->c_xmit_rm;
-               if (rm != NULL &&
-                   conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-                   conn->c_xmit_sg == rm->m_nents) {
-                       conn->c_xmit_rm = NULL;
-                       conn->c_xmit_sg = 0;
-                       conn->c_xmit_hdr_off = 0;
-                       conn->c_xmit_data_off = 0;
-                       conn->c_xmit_rdma_sent = 0;
 
-                       /* Release the reference to the previous message. */
-                       rds_message_put(rm);
-                       rm = NULL;
-               }
-
-               /* If we're asked to send a cong map update, do so.
+               /*
+                * If we are between messages, we can send a pending
+                * congestion map update.
                 */
-               if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
-                       if (conn->c_trans->xmit_cong_map != NULL) {
-                               conn->c_map_offset = 0;
-                               conn->c_map_bytes = sizeof(struct rds_header) +
-                                       RDS_CONG_MAP_BYTES;
-                               continue;
-                       }
-
+               if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
                        rm = rds_cong_update_alloc(conn);
                        if (IS_ERR(rm)) {
                                ret = PTR_ERR(rm);
                                break;
                        }
+                       rm->data.op_active = 1;
 
                        conn->c_xmit_rm = rm;
                }
 
                /*
-                * Grab the next message from the send queue, if there is one.
+                * If not already working on one, grab the next message.
                 *
                 * c_xmit_rm holds a ref while we're sending this message down
                 * the connection.  We can use this ref while holding
                 * RDS_IN_XMIT; rds_send_reset() is serialized with it.
                 */
-               if (rm == NULL) {
+               if (!rm) {
                        unsigned int len;
 
                        spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn)
 
                        spin_unlock_irqrestore(&conn->c_lock, flags);
 
-                       if (rm == NULL) {
-                               was_empty = 1;
+                       if (!rm)
                                break;
-                       }
 
                        /* Unfortunately, the way Infiniband deals with
                         * RDMA to a bad MR key is by moving the entire
@@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn)
                         * connection.
                         * Therefore, we never retransmit messages with RDMA ops.
                         */
-                       if (rm->m_rdma_op &&
+                       if (rm->rdma.op_active &&
                            test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
                                spin_lock_irqsave(&conn->c_lock, flags);
                                if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
                                        list_move(&rm->m_conn_item, &to_be_dropped);
                                spin_unlock_irqrestore(&conn->c_lock, flags);
-                               rds_message_put(rm);
                                continue;
                        }
 
@@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn)
                        conn->c_xmit_rm = rm;
                }
 
-               /*
-                * Try and send an rdma message.  Let's see if we can
-                * keep this simple and require that the transport either
-                * send the whole rdma or none of it.
-                */
-               if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
-                       ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+               /* The transport either sends the whole rdma or none of it */
+               if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+                       rm->m_final_op = &rm->rdma;
+                       ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
                        if (ret)
                                break;
                        conn->c_xmit_rdma_sent = 1;
+
                        /* The transport owns the mapped memory for now.
                         * You can't unmap it while it's on the send queue */
                        set_bit(RDS_MSG_MAPPED, &rm->m_flags);
                }
 
-               if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-                   conn->c_xmit_sg < rm->m_nents) {
+               if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+                       rm->m_final_op = &rm->atomic;
+                       ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+                       if (ret)
+                               break;
+                       conn->c_xmit_atomic_sent = 1;
+
+                       /* The transport owns the mapped memory for now.
+                        * You can't unmap it while it's on the send queue */
+                       set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+               }
+
+               /*
+                * A number of cases require an RDS header to be sent
+                * even if there is no data.
+                * We permit 0-byte sends; rds-ping depends on this.
+                * However, if the attached ops are all silent, we skip the
+                * hdr/data send, to enable truly silent operation.
+                */
+               if (rm->data.op_nents == 0) {
+                       int ops_present;
+                       int all_ops_are_silent = 1;
+
+                       ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+                       if (rm->atomic.op_active && !rm->atomic.op_silent)
+                               all_ops_are_silent = 0;
+                       if (rm->rdma.op_active && !rm->rdma.op_silent)
+                               all_ops_are_silent = 0;
+
+                       if (ops_present && all_ops_are_silent
+                           && !rm->m_rdma_cookie)
+                               rm->data.op_active = 0;
+               }
+
+               if (rm->data.op_active && !conn->c_xmit_data_sent) {
+                       rm->m_final_op = &rm->data;
                        ret = conn->c_trans->xmit(conn, rm,
                                                  conn->c_xmit_hdr_off,
                                                  conn->c_xmit_sg,
@@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn)
                                ret -= tmp;
                        }
 
-                       sg = &rm->m_sg[conn->c_xmit_sg];
+                       sg = &rm->data.op_sg[conn->c_xmit_sg];
                        while (ret) {
                                tmp = min_t(int, ret, sg->length -
                                                      conn->c_xmit_data_off);
@@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn)
                                        sg++;
                                        conn->c_xmit_sg++;
                                        BUG_ON(ret != 0 &&
-                                              conn->c_xmit_sg == rm->m_nents);
+                                              conn->c_xmit_sg == rm->data.op_nents);
                                }
                        }
+
+                       if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+                           (conn->c_xmit_sg == rm->data.op_nents))
+                               conn->c_xmit_data_sent = 1;
                }
-       }
 
-       /* Nuke any messages we decided not to retransmit. */
-       if (!list_empty(&to_be_dropped))
-               rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+               /*
+                * An rm will only make multiple passes through this loop
+                * if there is a data op. Thus, if the data is sent (or there was
+                * none), then we're done with the rm.
+                */
+               if (!rm->data.op_active || conn->c_xmit_data_sent) {
+                       conn->c_xmit_rm = NULL;
+                       conn->c_xmit_sg = 0;
+                       conn->c_xmit_hdr_off = 0;
+                       conn->c_xmit_data_off = 0;
+                       conn->c_xmit_rdma_sent = 0;
+                       conn->c_xmit_atomic_sent = 0;
+                       conn->c_xmit_data_sent = 0;
+
+                       rds_message_put(rm);
+               }
+       }
 
        if (conn->c_trans->xmit_complete)
                conn->c_trans->xmit_complete(conn);
 
-       /*
-        * We might be racing with another sender who queued a message but
-        * backed off on noticing that we held the c_send_lock.  If we check
-        * for queued messages after dropping the sem then either we'll
-        * see the queued message or the queuer will get the sem.  If we
-        * notice the queued message then we trigger an immediate retry.
-        *
-        * We need to be careful only to do this when we stopped processing
-        * the send queue because it was empty.  It's the only way we
-        * stop processing the loop when the transport hasn't taken
-        * responsibility for forward progress.
-        */
-       mutex_unlock(&conn->c_send_lock);
+       release_in_xmit(conn);
 
-       if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
-               /* We exhausted the send quota, but there's work left to
-                * do. Return and (re-)schedule the send worker.
-                */
-               ret = -EAGAIN;
+       /* Nuke any messages we decided not to retransmit. */
+       if (!list_empty(&to_be_dropped)) {
+               /* irqs on here, so we can put(), unlike above */
+               list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+                       rds_message_put(rm);
+               rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
        }
 
-       if (ret == 0 && was_empty) {
-               /* A simple bit test would be way faster than taking the
-                * spin lock */
-               spin_lock_irqsave(&conn->c_lock, flags);
+       /*
+        * Other senders can queue a message after we last test the send queue
+        * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+        * not try and send their newly queued message.  We need to check the
+        * send queue after having cleared RDS_IN_XMIT so that their message
+        * doesn't get stuck on the send queue.
+        *
+        * If the transport cannot continue (i.e. ret != 0), then it must
+        * call us when more room is available, such as from the tx
+        * completion handler.
+        */
+       if (ret == 0) {
+               smp_mb();
                if (!list_empty(&conn->c_send_queue)) {
-                       rds_stats_inc(s_send_sem_queue_raced);
-                       ret = -EAGAIN;
+                       rds_stats_inc(s_send_lock_queue_raced);
+                       goto restart;
                }
-               spin_unlock_irqrestore(&conn->c_lock, flags);
        }
 out:
        return ret;
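The re-check after release_in_xmit() closes a lost-wakeup window; as a timeline (illustrative only):

	sender A (rds_send_xmit)            sender B (rds_sendmsg)
	------------------------            ----------------------
	finds the send queue empty
	                                    queues a message
	                                    acquire_in_xmit() fails, backs off
	release_in_xmit()
	smp_mb(); sees B's message
	goto restart, sends it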
@@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
 }
 
 /*
- * Returns true if there are no messages on the send and retransmit queues
- * which have a sequence number greater than or equal to the given sequence
- * number.
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
  */
-int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
 {
-       struct rds_message *rm, *tmp;
-       int ret = 1;
+       struct rds_sock *rs = NULL;
+       struct rm_rdma_op *ro;
+       struct rds_notifier *notifier;
+       unsigned long flags;
 
-       spin_lock(&conn->c_lock);
+       spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-       list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-               if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-                       ret = 0;
-               break;
-       }
+       ro = &rm->rdma;
+       if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+           ro->op_active && ro->op_notify && ro->op_notifier) {
+               notifier = ro->op_notifier;
+               rs = rm->m_rs;
+               sock_hold(rds_rs_to_sk(rs));
 
-       list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-               if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-                       ret = 0;
-               break;
+               notifier->n_status = status;
+               spin_lock(&rs->rs_lock);
+               list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+               spin_unlock(&rs->rs_lock);
+
+               ro->op_notifier = NULL;
        }
 
-       spin_unlock(&conn->c_lock);
+       spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
-       return ret;
+       if (rs) {
+               rds_wake_sk_sleep(rs);
+               sock_put(rds_rs_to_sk(rs));
+       }
 }
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
 
 /*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Just like above, except it looks at the atomic op
  */
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+void rds_atomic_send_complete(struct rds_message *rm, int status)
 {
        struct rds_sock *rs = NULL;
-       struct rds_rdma_op *ro;
+       struct rm_atomic_op *ao;
        struct rds_notifier *notifier;
+       unsigned long flags;
 
-       spin_lock(&rm->m_rs_lock);
+       spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-       ro = rm->m_rdma_op;
-       if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-           ro && ro->r_notify && ro->r_notifier) {
-               notifier = ro->r_notifier;
+       ao = &rm->atomic;
+       if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+           && ao->op_active && ao->op_notify && ao->op_notifier) {
+               notifier = ao->op_notifier;
                rs = rm->m_rs;
                sock_hold(rds_rs_to_sk(rs));
 
@@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
                list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
                spin_unlock(&rs->rs_lock);
 
-               ro->r_notifier = NULL;
+               ao->op_notifier = NULL;
        }
 
-       spin_unlock(&rm->m_rs_lock);
+       spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
        if (rs) {
                rds_wake_sk_sleep(rs);
                sock_put(rds_rs_to_sk(rs));
        }
 }
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
 
 /*
  * This is the same as rds_rdma_send_complete except we
@@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
  * socket, socket lock) and can just move the notifier.
  */
 static inline void
-__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
 {
-       struct rds_rdma_op *ro;
+       struct rm_rdma_op *ro;
+       struct rm_atomic_op *ao;
+
+       ro = &rm->rdma;
+       if (ro->op_active && ro->op_notify && ro->op_notifier) {
+               ro->op_notifier->n_status = status;
+               list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+               ro->op_notifier = NULL;
+       }
 
-       ro = rm->m_rdma_op;
-       if (ro && ro->r_notify && ro->r_notifier) {
-               ro->r_notifier->n_status = status;
-               list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
-               ro->r_notifier = NULL;
+       ao = &rm->atomic;
+       if (ao->op_active && ao->op_notify && ao->op_notifier) {
+               ao->op_notifier->n_status = status;
+               list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+               ao->op_notifier = NULL;
        }
 
        /* No need to wake the app - caller does this */
@@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
  * So speed is not an issue here.
  */
 struct rds_message *rds_send_get_message(struct rds_connection *conn,
-                                        struct rds_rdma_op *op)
+                                        struct rm_rdma_op *op)
 {
        struct rds_message *rm, *tmp, *found = NULL;
        unsigned long flags;
@@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
        spin_lock_irqsave(&conn->c_lock, flags);
 
        list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-               if (rm->m_rdma_op == op) {
+               if (&rm->rdma == op) {
                        atomic_inc(&rm->m_refcount);
                        found = rm;
                        goto out;
@@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
        }
 
        list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-               if (rm->m_rdma_op == op) {
+               if (&rm->rdma == op) {
                        atomic_inc(&rm->m_refcount);
                        found = rm;
                        break;
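Because rm_rdma_op is now embedded in struct rds_message rather than referenced through a pointer, identity is checked by address above. A caller holding only the op could equally recover the message with container_of(); a hypothetical helper:

	static inline struct rds_message *rds_rm_from_rdma_op(struct rm_rdma_op *op)
	{
		return container_of(op, struct rds_message, rdma);
	}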
@@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
                spin_lock(&rs->rs_lock);
 
                if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-                       struct rds_rdma_op *ro = rm->m_rdma_op;
+                       struct rm_rdma_op *ro = &rm->rdma;
                        struct rds_notifier *notifier;
 
                        list_del_init(&rm->m_sock_item);
                        rds_send_sndbuf_remove(rs, rm);
 
-                       if (ro && ro->r_notifier && (status || ro->r_notify)) {
-                               notifier = ro->r_notifier;
+                       if (ro->op_active && ro->op_notifier &&
+                              (ro->op_notify || (ro->op_recverr && status))) {
+                               notifier = ro->op_notifier;
                                list_add_tail(&notifier->n_list,
                                                &rs->rs_notify_queue);
                                if (!notifier->n_status)
                                        notifier->n_status = status;
-                               rm->m_rdma_op->r_notifier = NULL;
+                               rm->rdma.op_notifier = NULL;
                        }
                        was_on_sock = 1;
                        rm->m_rs = NULL;
@@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 {
        struct rds_message *rm, *tmp;
        struct rds_connection *conn;
-       unsigned long flags, flags2;
+       unsigned long flags;
        LIST_HEAD(list);
-       int wake = 0;
 
        /* get all the messages we're dropping under the rs lock */
        spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
                             dest->sin_port != rm->m_inc.i_hdr.h_dport))
                        continue;
 
-               wake = 1;
                list_move(&rm->m_sock_item, &list);
                rds_send_sndbuf_remove(rs, rm);
                clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
        }
 
        /* order flag updates with the rs lock */
-       if (wake)
-               smp_mb__after_clear_bit();
+       smp_mb__after_clear_bit();
 
        spin_unlock_irqrestore(&rs->rs_lock, flags);
 
-       conn = NULL;
+       if (list_empty(&list))
+               return;
 
-       /* now remove the messages from the conn list as needed */
+       /* Remove the messages from the conn */
        list_for_each_entry(rm, &list, m_sock_item) {
-               /* We do this here rather than in the loop above, so that
-                * we don't have to nest m_rs_lock under rs->rs_lock */
-               spin_lock_irqsave(&rm->m_rs_lock, flags2);
-               /* If this is a RDMA operation, notify the app. */
-               spin_lock(&rs->rs_lock);
-               __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
-               spin_unlock(&rs->rs_lock);
-               rm->m_rs = NULL;
-               spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
 
+               conn = rm->m_inc.i_conn;
+
+               spin_lock_irqsave(&conn->c_lock, flags);
                /*
-                * If we see this flag cleared then we're *sure* that someone
-                * else beat us to removing it from the conn.  If we race
-                * with their flag update we'll get the lock and then really
-                * see that the flag has been cleared.
+                * Maybe someone else beat us to removing rm from the conn.
+                * If we race with their flag update we'll get the lock and
+                * then really see that the flag has been cleared.
                 */
-               if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+               if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+                       spin_unlock_irqrestore(&conn->c_lock, flags);
                        continue;
-
-               if (conn != rm->m_inc.i_conn) {
-                       if (conn)
-                               spin_unlock_irqrestore(&conn->c_lock, flags);
-                       conn = rm->m_inc.i_conn;
-                       spin_lock_irqsave(&conn->c_lock, flags);
                }
+               list_del_init(&rm->m_conn_item);
+               spin_unlock_irqrestore(&conn->c_lock, flags);
 
-               if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
-                       list_del_init(&rm->m_conn_item);
-                       rds_message_put(rm);
-               }
-       }
+               /*
+                * Couldn't grab m_rs_lock in top loop (lock ordering),
+                * but we can now.
+                */
+               spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-       if (conn)
-               spin_unlock_irqrestore(&conn->c_lock, flags);
+               spin_lock(&rs->rs_lock);
+               __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+               spin_unlock(&rs->rs_lock);
 
-       if (wake)
-               rds_wake_sk_sleep(rs);
+               rm->m_rs = NULL;
+               spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+               rds_message_put(rm);
+       }
+
+       rds_wake_sk_sleep(rs);
 
        while (!list_empty(&list)) {
                rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -763,6 +807,63 @@ out:
        return *queued;
 }
 
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+       struct cmsghdr *cmsg;
+       int size = 0;
+       int cmsg_groups = 0;
+       int retval;
+
+       for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+               if (!CMSG_OK(msg, cmsg))
+                       return -EINVAL;
+
+               if (cmsg->cmsg_level != SOL_RDS)
+                       continue;
+
+               switch (cmsg->cmsg_type) {
+               case RDS_CMSG_RDMA_ARGS:
+                       cmsg_groups |= 1;
+                       retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+                       if (retval < 0)
+                               return retval;
+                       size += retval;
+
+                       break;
+
+               case RDS_CMSG_RDMA_DEST:
+               case RDS_CMSG_RDMA_MAP:
+                       cmsg_groups |= 2;
+                       /* these are valid but do not add any size */
+                       break;
+
+               case RDS_CMSG_ATOMIC_CSWP:
+               case RDS_CMSG_ATOMIC_FADD:
+               case RDS_CMSG_MASKED_ATOMIC_CSWP:
+               case RDS_CMSG_MASKED_ATOMIC_FADD:
+                       cmsg_groups |= 1;
+                       size += sizeof(struct scatterlist);
+                       break;
+
+               default:
+                       return -EINVAL;
+               }
+
+       }
+
+       size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+       /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+       if (cmsg_groups == 3)
+               return -EINVAL;
+
+       return size;
+}
+
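A worked example of the sizing math, assuming PAGE_SIZE == 4096: a sendmsg() carrying a 9000-byte payload and one RDS_CMSG_ATOMIC_FADD cmsg yields ceil(9000, 4096) = 3 data scatterlist entries plus one for the atomic op, so rds_message_alloc() reserves room for four sgs in the single allocation; an RDS_CMSG_RDMA_ARGS cmsg would add whatever rds_rdma_extra_size() reports for its iovecs.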
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                         struct msghdr *msg, int *allocated_mr)
 {
@@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                        continue;
 
                /* As a side effect, RDMA_DEST and RDMA_MAP will set
-                * rm->m_rdma_cookie and rm->m_rdma_mr.
+                * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
                 */
                switch (cmsg->cmsg_type) {
                case RDS_CMSG_RDMA_ARGS:
@@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                        if (!ret)
                                *allocated_mr = 1;
                        break;
+               case RDS_CMSG_ATOMIC_CSWP:
+               case RDS_CMSG_ATOMIC_FADD:
+               case RDS_CMSG_MASKED_ATOMIC_CSWP:
+               case RDS_CMSG_MASKED_ATOMIC_FADD:
+                       ret = rds_cmsg_atomic(rs, rm, cmsg);
+                       break;
 
                default:
                        return -EINVAL;
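For context, a hedged userspace sketch of attaching one of the new atomic cmsgs to a send. The payload type and field names (struct rds_atomic_args, .remote_addr, .fadd.add) are assumptions for illustration; the authoritative layout is whatever linux/rds.h ships:

	char cbuf[CMSG_SPACE(sizeof(struct rds_atomic_args))];
	struct msghdr msg = {
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	struct rds_atomic_args *args;

	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type  = RDS_CMSG_ATOMIC_FADD;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(*args));
	args = (struct rds_atomic_args *)CMSG_DATA(cmsg);
	args->remote_addr = remote_addr;	/* assumed field */
	args->fadd.add    = 1;			/* assumed field */
	/* msg_name/msg_iov setup and sendmsg(fd, &msg, 0) omitted */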
@@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
                goto out;
        }
 
-       rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
-       if (IS_ERR(rm)) {
-               ret = PTR_ERR(rm);
-               rm = NULL;
+       /* size of rm including all sgs */
+       ret = rds_rm_size(msg, payload_len);
+       if (ret < 0)
+               goto out;
+
+       rm = rds_message_alloc(ret, GFP_KERNEL);
+       if (!rm) {
+               ret = -ENOMEM;
                goto out;
        }
 
+       /* Attach data to the rm */
+       if (payload_len) {
+               rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+               ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+               if (ret)
+                       goto out;
+       }
+       rm->data.op_active = 1;
+
        rm->m_daddr = daddr;
 
        /* rds_conn_create has a spinlock that runs with IRQ off.
@@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
        if (ret)
                goto out;
 
-       if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
-           conn->c_trans->xmit_rdma == NULL) {
+       if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
                if (printk_ratelimit())
                        printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-                               rm->m_rdma_op, conn->c_trans->xmit_rdma);
+                              &rm->rdma, conn->c_trans->xmit_rdma);
                ret = -EOPNOTSUPP;
                goto out;
        }
 
-       /* If the connection is down, trigger a connect. We may
-        * have scheduled a delayed reconnect however - in this case
-        * we should not interfere.
-        */
-       if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-           !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+       if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+               if (printk_ratelimit())
+                       printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+                              &rm->atomic, conn->c_trans->xmit_atomic);
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       rds_conn_connect_if_down(conn);
 
        ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
        if (ret) {
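rds_conn_connect_if_down() centralizes the reconnect test that was open-coded here and in rds_send_pong(). A plausible sketch, assuming the helper is just the removed logic moved into conn.c:

	void rds_conn_connect_if_down(struct rds_connection *conn)
	{
		if (rds_conn_state(conn) == RDS_CONN_DOWN &&
		    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
			queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
	}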
@@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
        rds_stats_inc(s_send_queued);
 
        if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-               rds_send_worker(&conn->c_send_w.work);
+               rds_send_xmit(conn);
 
        rds_message_put(rm);
        return payload_len;
@@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
        int ret = 0;
 
        rm = rds_message_alloc(0, GFP_ATOMIC);
-       if (rm == NULL) {
+       if (!rm) {
                ret = -ENOMEM;
                goto out;
        }
 
        rm->m_daddr = conn->c_faddr;
+       rm->data.op_active = 1;
 
-       /* If the connection is down, trigger a connect. We may
-        * have scheduled a delayed reconnect however - in this case
-        * we should not interfere.
-        */
-       if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-           !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+       rds_conn_connect_if_down(conn);
 
        ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
        if (ret)
@@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
        rds_stats_inc(s_send_queued);
        rds_stats_inc(s_send_pong);
 
-       queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+       if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+               rds_send_xmit(conn);
+
        rds_message_put(rm);
        return 0;
 
index 7598eb07cfb149264355695d4582f28e7b9b0962..10c759ccac0c7a5d134e1cd49b41c45651e657c1 100644 (file)
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
        "recv_ping",
        "send_queue_empty",
        "send_queue_full",
-       "send_sem_contention",
-       "send_sem_queue_raced",
+       "send_lock_contention",
+       "send_lock_queue_raced",
        "send_immediate_retry",
        "send_delayed_retry",
        "send_drop_acked",
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
        rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
 }
 
-int __init rds_stats_init(void)
+int rds_stats_init(void)
 {
        rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
        return 0;
index 7829a20325d3281018dfe0520c450d47a066e890..25ad0c77a26cd16cca2d96681fae4f77373757bf 100644 (file)
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
                unregister_sysctl_table(rds_sysctl_reg_table);
 }
 
-int __init rds_sysctl_init(void)
+int rds_sysctl_init(void)
 {
        rds_sysctl_reconnect_min = msecs_to_jiffies(1);
        rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
 
        rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
-       if (rds_sysctl_reg_table == NULL)
+       if (!rds_sysctl_reg_table)
                return -ENOMEM;
        return 0;
 }
index babf4577ff7d3f06835e073e48733edeceb97db3..eeb08e6ab96b0c79b3553dbcef9899eee5fbdac2 100644 (file)
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
        struct rds_tcp_connection *tc;
 
        tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-       if (tc == NULL)
+       if (!tc)
                return -ENOMEM;
 
        tc->t_sock = NULL;
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
        .laddr_check            = rds_tcp_laddr_check,
        .xmit_prepare           = rds_tcp_xmit_prepare,
        .xmit_complete          = rds_tcp_xmit_complete,
-       .xmit_cong_map          = rds_tcp_xmit_cong_map,
        .xmit                   = rds_tcp_xmit,
        .recv                   = rds_tcp_recv,
        .conn_alloc             = rds_tcp_conn_alloc,
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
        .conn_connect           = rds_tcp_conn_connect,
        .conn_shutdown          = rds_tcp_conn_shutdown,
        .inc_copy_to_user       = rds_tcp_inc_copy_to_user,
-       .inc_purge              = rds_tcp_inc_purge,
        .inc_free               = rds_tcp_inc_free,
        .stats_info_copy        = rds_tcp_stats_info_copy,
        .exit                   = rds_tcp_exit,
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
        .t_prefer_loopback      = 1,
 };
 
-int __init rds_tcp_init(void)
+int rds_tcp_init(void)
 {
        int ret;
 
        rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
                                              sizeof(struct rds_tcp_connection),
                                              0, 0, NULL);
-       if (rds_tcp_conn_slab == NULL) {
+       if (!rds_tcp_conn_slab) {
                ret = -ENOMEM;
                goto out;
        }
index 844fa6b9cf5aeb2e5bc1a9cad3a3b2b747449929..f5e6f7bebb5013acbf6b74fe76faf44ab391bfcb 100644 (file)
@@ -43,7 +43,7 @@ struct rds_tcp_statistics {
 };
 
 /* tcp.c */
-int __init rds_tcp_init(void);
+int rds_tcp_init(void);
 void rds_tcp_exit(void);
 void rds_tcp_tune(struct socket *sock);
 void rds_tcp_nonagle(struct socket *sock);
@@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
 void rds_tcp_state_change(struct sock *sk);
 
 /* tcp_listen.c */
-int __init rds_tcp_listen_init(void);
+int rds_tcp_listen_init(void);
 void rds_tcp_listen_stop(void);
 void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
 
 /* tcp_recv.c */
-int __init rds_tcp_recv_init(void);
+int rds_tcp_recv_init(void);
 void rds_tcp_recv_exit(void);
 void rds_tcp_data_ready(struct sock *sk, int bytes);
 int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_purge(struct rds_incoming *inc);
 void rds_tcp_inc_free(struct rds_incoming *inc);
 int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
                             size_t size);
@@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
                 unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_tcp_write_space(struct sock *sk);
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-                         struct rds_cong_map *map, unsigned long offset);
 
 /* tcp_stats.c */
 DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
index c397524c039cdb28140ff5f6c0fe2359cebfa3b5..a65ee78db0c54e1062186b9b132fe46fac380c91 100644 (file)
@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk)
 
        read_lock(&sk->sk_callback_lock);
        conn = sk->sk_user_data;
-       if (conn == NULL) {
+       if (!conn) {
                state_change = sk->sk_state_change;
                goto out;
        }
index 975183fe6950a34b242ef55db011e6f04c1c6772..ae27869dfc2137a342918f558082c1e58e1d4e88 100644 (file)
@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
 
        read_lock(&sk->sk_callback_lock);
        ready = sk->sk_user_data;
-       if (ready == NULL) { /* check for teardown race */
+       if (!ready) { /* check for teardown race */
                ready = sk->sk_data_ready;
                goto out;
        }
@@ -135,7 +135,7 @@ out:
        ready(sk, bytes);
 }
 
-int __init rds_tcp_listen_init(void)
+int rds_tcp_listen_init(void)
 {
        struct sockaddr_in sin;
        struct socket *sock = NULL;
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
        struct socket *sock = rds_tcp_listen_sock;
        struct sock *sk;
 
-       if (sock == NULL)
+       if (!sock)
                return;
 
        sk = sock->sk;
index 1aba6878fa5dc42d4c54473350fde61d714a184c..7017f3af80b671cca7b403d99e8006444eb1c8bb 100644 (file)
@@ -39,7 +39,7 @@
 
 static struct kmem_cache *rds_tcp_incoming_slab;
 
-void rds_tcp_inc_purge(struct rds_incoming *inc)
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
 {
        struct rds_tcp_incoming *tinc;
        tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
         * processing.
         */
        while (left) {
-               if (tinc == NULL) {
+               if (!tinc) {
                        tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
                                                arg->gfp);
-                       if (tinc == NULL) {
+                       if (!tinc) {
                                desc->error = -ENOMEM;
                                goto out;
                        }
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 
                if (left && tc->t_tinc_data_rem) {
                        clone = skb_clone(skb, arg->gfp);
-                       if (clone == NULL) {
+                       if (!clone) {
                                desc->error = -ENOMEM;
                                goto out;
                        }
@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 
        read_lock(&sk->sk_callback_lock);
        conn = sk->sk_user_data;
-       if (conn == NULL) { /* check for teardown race */
+       if (!conn) { /* check for teardown race */
                ready = sk->sk_data_ready;
                goto out;
        }
@@ -342,12 +342,12 @@ out:
        ready(sk, bytes);
 }
 
-int __init rds_tcp_recv_init(void)
+int rds_tcp_recv_init(void)
 {
        rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
                                        sizeof(struct rds_tcp_incoming),
                                        0, 0, NULL);
-       if (rds_tcp_incoming_slab == NULL)
+       if (!rds_tcp_incoming_slab)
                return -ENOMEM;
        return 0;
 }
index a28b895ff0d10194730463b218e3ccb526cdea50..2979fb4a4b9aaf8df3b0216d69cac828885df398 100644 (file)
@@ -76,56 +76,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
        return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
 }
 
-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-                         struct rds_cong_map *map, unsigned long offset)
-{
-       static struct rds_header rds_tcp_map_header = {
-               .h_flags = RDS_FLAG_CONG_BITMAP,
-       };
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       unsigned long i;
-       int ret;
-       int copied = 0;
-
-       /* Some problem claims cpu_to_be32(constant) isn't a constant. */
-       rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
-
-       if (offset < sizeof(struct rds_header)) {
-               ret = rds_tcp_sendmsg(tc->t_sock,
-                                     (void *)&rds_tcp_map_header + offset,
-                                     sizeof(struct rds_header) - offset);
-               if (ret <= 0)
-                       return ret;
-               offset += ret;
-               copied = ret;
-               if (offset < sizeof(struct rds_header))
-                       return ret;
-       }
-
-       offset -= sizeof(struct rds_header);
-       i = offset / PAGE_SIZE;
-       offset = offset % PAGE_SIZE;
-       BUG_ON(i >= RDS_CONG_MAP_PAGES);
-
-       do {
-               ret = tc->t_sock->ops->sendpage(tc->t_sock,
-                                       virt_to_page(map->m_page_addrs[i]),
-                                       offset, PAGE_SIZE - offset,
-                                       MSG_DONTWAIT);
-               if (ret <= 0)
-                       break;
-               copied += ret;
-               offset += ret;
-               if (offset == PAGE_SIZE) {
-                       offset = 0;
-                       i++;
-               }
-       } while (i < RDS_CONG_MAP_PAGES);
-
-        return copied ? copied : ret;
-}
-
 /* the core send_sem serializes this with other xmit and shutdown */
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
                 unsigned int hdr_off, unsigned int sg, unsigned int off)
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
                        goto out;
        }
 
-       while (sg < rm->m_nents) {
+       while (sg < rm->data.op_nents) {
                ret = tc->t_sock->ops->sendpage(tc->t_sock,
-                                               sg_page(&rm->m_sg[sg]),
-                                               rm->m_sg[sg].offset + off,
-                                               rm->m_sg[sg].length - off,
+                                               sg_page(&rm->data.op_sg[sg]),
+                                               rm->data.op_sg[sg].offset + off,
+                                               rm->data.op_sg[sg].length - off,
                                                MSG_DONTWAIT|MSG_NOSIGNAL);
-               rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
-                        rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+               rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+                        rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
                         ret);
                if (ret <= 0)
                        break;
 
                off += ret;
                done += ret;
-               if (off == rm->m_sg[sg].length) {
+               if (off == rm->data.op_sg[sg].length) {
                        off = 0;
                        sg++;
                }
@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk)
 
        read_lock(&sk->sk_callback_lock);
        conn = sk->sk_user_data;
-       if (conn == NULL) {
+       if (!conn) {
                write_space = sk->sk_write_space;
                goto out;
        }
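[Editor's note] rds_tcp_xmit_cong_map() goes away, apparently because congestion-map updates now travel as ordinary RDS messages through rds_tcp_xmit() rather than via a transport-private path; the xmit loop itself is only renamed to the new rm->data.op_* layout. Since sendpage() may accept fewer bytes than offered, the loop's partial-write bookkeeping deserves spelling out — a commented restatement of the hunk above:

        /* Simplified from rds_tcp_xmit(): 'off' is the offset into the
         * current scatterlist entry, 'done' the running total, so the
         * caller can resume from exactly where the socket stalled. */
        while (sg < rm->data.op_nents) {
                ret = tc->t_sock->ops->sendpage(tc->t_sock,
                                                sg_page(&rm->data.op_sg[sg]),
                                                rm->data.op_sg[sg].offset + off,
                                                rm->data.op_sg[sg].length - off,
                                                MSG_DONTWAIT | MSG_NOSIGNAL);
                if (ret <= 0)
                        break;          /* would-block or error; retry later */
                off += ret;
                done += ret;
                if (off == rm->data.op_sg[sg].length) {
                        off = 0;        /* entry drained, move to the next */
                        sg++;
                }
        }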
index 786c20eaaf5e44a1cb1a2d7bdd284c7ad2cedf53..0fd90f8c5f59c75c18c244701f3230f93c408d85 100644 (file)
@@ -61,7 +61,7 @@
  *
  * Transition to state DISCONNECTING/DOWN:
  *  -  Inside the shutdown worker; synchronizes with xmit path
- *     through c_send_lock, and with connection management callbacks
+ *     through RDS_IN_XMIT, and with connection management callbacks
  *     via c_cm_lock.
  *
  *     For receive callbacks, we rely on the underlying transport
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
  * We should *always* start with a random backoff; otherwise a broken connection
  * will always take several iterations to be re-established.
  */
-static void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_connection *conn)
 {
        unsigned long rand;
 
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
        }
 }
 
-void rds_shutdown_worker(struct work_struct *work)
-{
-       struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
-       /* shut it down unless it's down already */
-       if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
-               /*
-                * Quiesce the connection mgmt handlers before we start tearing
-                * things down. We don't hold the mutex for the entire
-                * duration of the shutdown operation, else we may be
-                * deadlocking with the CM handler. Instead, the CM event
-                * handler is supposed to check for state DISCONNECTING
-                */
-               mutex_lock(&conn->c_cm_lock);
-               if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
-                   !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
-                       rds_conn_error(conn, "shutdown called in state %d\n",
-                                       atomic_read(&conn->c_state));
-                       mutex_unlock(&conn->c_cm_lock);
-                       return;
-               }
-               mutex_unlock(&conn->c_cm_lock);
-
-               mutex_lock(&conn->c_send_lock);
-               conn->c_trans->conn_shutdown(conn);
-               rds_conn_reset(conn);
-               mutex_unlock(&conn->c_send_lock);
-
-               if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
-                       /* This can happen - eg when we're in the middle of tearing
-                        * down the connection, and someone unloads the rds module.
-                        * Quite reproduceable with loopback connections.
-                        * Mostly harmless.
-                        */
-                       rds_conn_error(conn,
-                               "%s: failed to transition to state DOWN, "
-                               "current state is %d\n",
-                               __func__,
-                               atomic_read(&conn->c_state));
-                       return;
-               }
-       }
-
-       /* Then reconnect if it's still live.
-        * The passive side of an IB loopback connection is never added
-        * to the conn hash, so we never trigger a reconnect on this
-        * conn - the reconnect is always triggered by the active peer. */
-       cancel_delayed_work(&conn->c_conn_w);
-       if (!hlist_unhashed(&conn->c_hash_node))
-               rds_queue_reconnect(conn);
-}
-
 void rds_send_worker(struct work_struct *work)
 {
        struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
        }
 }
 
+void rds_shutdown_worker(struct work_struct *work)
+{
+       struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+       rds_conn_shutdown(conn);
+}
+
 void rds_threads_exit(void)
 {
        destroy_workqueue(rds_wq);
 }
 
-int __init rds_threads_init(void)
+int rds_threads_init(void)
 {
-       rds_wq = create_workqueue("krdsd");
-       if (rds_wq == NULL)
+       rds_wq = create_singlethread_workqueue("krdsd");
+       if (!rds_wq)
                return -ENOMEM;
 
        return 0;
index 7e106790135353c036d6fa0dcba3345cf072f3dd..7f2ac4fec3678b28715b95094c6346bcc49333e1 100644 (file)
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
 }
 EXPORT_SYMBOL_GPL(rds_trans_unregister);
 
+void rds_trans_put(struct rds_transport *trans)
+{
+       if (trans && trans->t_owner)
+               module_put(trans->t_owner);
+}
+
 struct rds_transport *rds_trans_get_preferred(__be32 addr)
 {
        struct rds_transport *ret = NULL;
-       int i;
+       struct rds_transport *trans;
+       unsigned int i;
 
        if (IN_LOOPBACK(ntohl(addr)))
                return &rds_loop_transport;
 
        down_read(&rds_trans_sem);
-       for (i = 0; i < RDS_TRANS_COUNT; i++)
-       {
-               if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
-                       ret = transports[i];
+       for (i = 0; i < RDS_TRANS_COUNT; i++) {
+               trans = transports[i];
+
+               if (trans && (trans->laddr_check(addr) == 0) &&
+                   (!trans->t_owner || try_module_get(trans->t_owner))) {
+                       ret = trans;
                        break;
                }
        }
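[Editor's note] rds_trans_get_preferred() now pins the transport's owning module with try_module_get(), and the new rds_trans_put() is its release; a transport module can no longer be unloaded while a lookup has handed it out. The !trans->t_owner test keeps built-in transports (which have no owner) working. The expected caller pairing, sketched — the call site is not part of this hunk and the error value is illustrative:

        /* Sketch of the get/put pairing at a hypothetical call site. */
        struct rds_transport *trans = rds_trans_get_preferred(addr);

        if (!trans)
                return -EADDRNOTAVAIL;  /* no transport claims this address */

        /* ... bind the socket to trans; the module reference pins it ... */

        rds_trans_put(trans);           /* drop the reference when unbound */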
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644 (file)
index 0000000..e6b5190
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_XLIST_H
+#define _LINUX_XLIST_H
+
+#include <linux/stddef.h>
+#include <linux/poison.h>
+#include <linux/prefetch.h>
+#include <asm/system.h>
+
+struct xlist_head {
+       struct xlist_head *next;
+};
+
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+       list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+       return head->next == NULL;
+}
+
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
+                            struct xlist_head *head)
+{
+       struct xlist_head *cur;
+       struct xlist_head *check;
+
+       while (1) {
+               cur = head->next;
+               tail->next = cur;
+               check = cmpxchg(&head->next, cur, new);
+               if (check == cur)
+                       break;
+       }
+}
+
+static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
+{
+       struct xlist_head *cur;
+       struct xlist_head *check;
+       struct xlist_head *next;
+
+       while (1) {
+               cur = head->next;
+               if (!cur)
+                       goto out;
+
+               next = cur->next;
+               check = cmpxchg(&head->next, cur, next);
+               if (check == cur)
+                       goto out;
+       }
+out:
+       return cur;
+}
+
+static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
+{
+       struct xlist_head *cur;
+
+       cur = head->next;
+       if (!cur)
+               return NULL;
+
+       head->next = cur->next;
+       return cur;
+}
+
+static inline void xlist_splice(struct xlist_head *list,
+                               struct xlist_head *head)
+{
+       struct xlist_head *cur;
+
+       WARN_ON(head->next);
+       cur = xchg(&list->next, NULL);
+       head->next = cur;
+}
+
+#endif
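[Editor's note] xlist is a minimal lock-free LIFO: xlist_add() pushes a chain whose tail the caller names explicitly (so a whole sublist goes in with one cmpxchg), xlist_del_head() pops with a retry loop, xlist_del_head_fast() skips the atomics for callers with exclusive access, and xlist_splice() steals the entire list with a single xchg(). Like any bare cmpxchg stack it is ABA-prone if popped nodes can be freed and reinserted while another CPU still holds a stale 'cur', so callers must impose their own reuse discipline. A single-node usage sketch, with hypothetical names:

        /* Usage sketch, not from the patch: a lone node is its own tail. */
        struct item {
                struct xlist_head node;
                int val;
        };

        static struct xlist_head pool;  /* INIT_XLIST_HEAD() at setup time */

        static void put_item(struct item *it)
        {
                it->node.next = NULL;
                xlist_add(&it->node, &it->node, &pool); /* head == tail here */
        }

        static struct item *get_item(void)
        {
                struct xlist_head *x = xlist_del_head(&pool);

                return x ? container_of(x, struct item, node) : NULL;
        }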