  1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
 51#include <linux/slab.h>
 52
53#include "xprt_rdma.h"
54
55/*
56 * Globals/Macros
57 */
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63/*
64 * internal functions
65 */
66
67/*
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
71 */
72
73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105static inline void
106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
114}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
144static inline
145void rpcrdma_event_process(struct ib_wc *wc)
146{
147 struct rpcrdma_rep *rep =
148 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
149
150 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
151 __func__, rep, wc->status, wc->opcode, wc->byte_len);
152
153 if (!rep) /* send or bind completion that we don't care about */
154 return;
155
156 if (IB_WC_SUCCESS != wc->status) {
157 dprintk("RPC: %s: %s WC status %X, connection lost\n",
158 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
159 wc->status);
160 rep->rr_len = ~0U;
161 rpcrdma_schedule_tasklet(rep);
162 return;
163 }
164
165 switch (wc->opcode) {
166 case IB_WC_RECV:
167 rep->rr_len = wc->byte_len;
168 ib_dma_sync_single_for_cpu(
169 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
170 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
 171 /* Keep (only) the most recent credits, after checking validity */
172 if (rep->rr_len >= 16) {
173 struct rpcrdma_msg *p =
174 (struct rpcrdma_msg *) rep->rr_base;
175 unsigned int credits = ntohl(p->rm_credit);
176 if (credits == 0) {
177 dprintk("RPC: %s: server"
178 " dropped credits to 0!\n", __func__);
179 /* don't deadlock */
180 credits = 1;
181 } else if (credits > rep->rr_buffer->rb_max_requests) {
182 dprintk("RPC: %s: server"
183 " over-crediting: %d (%d)\n",
184 __func__, credits,
185 rep->rr_buffer->rb_max_requests);
186 credits = rep->rr_buffer->rb_max_requests;
187 }
188 atomic_set(&rep->rr_buffer->rb_credits, credits);
189 }
190 /* fall through */
191 case IB_WC_BIND_MW:
192 rpcrdma_schedule_tasklet(rep);
193 break;
194 default:
195 dprintk("RPC: %s: unexpected WC event %X\n",
196 __func__, wc->opcode);
197 break;
198 }
199}
200
201static inline int
202rpcrdma_cq_poll(struct ib_cq *cq)
203{
204 struct ib_wc wc;
205 int rc;
206
207 for (;;) {
208 rc = ib_poll_cq(cq, 1, &wc);
209 if (rc < 0) {
210 dprintk("RPC: %s: ib_poll_cq failed %i\n",
211 __func__, rc);
212 return rc;
213 }
214 if (rc == 0)
215 break;
216
217 rpcrdma_event_process(&wc);
218 }
219
220 return 0;
221}
222
223/*
224 * rpcrdma_cq_event_upcall
225 *
226 * This upcall handles recv, send, bind and unbind events.
227 * It is reentrant but processes single events in order to maintain
228 * ordering of receives to keep server credits.
229 *
230 * It is the responsibility of the scheduled tasklet to return
231 * recv buffers to the pool. NOTE: this affects synchronization of
232 * connection shutdown. That is, the structures required for
233 * the completion of the reply handler must remain intact until
234 * all memory has been reclaimed.
235 *
236 * Note that send events are suppressed and do not result in an upcall.
237 */
238static void
239rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
240{
241 int rc;
242
243 rc = rpcrdma_cq_poll(cq);
244 if (rc)
245 return;
246
247 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
248 if (rc) {
249 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
250 __func__, rc);
251 return;
252 }
253
254 rpcrdma_cq_poll(cq);
255}
256
257#ifdef RPC_DEBUG
258static const char * const conn[] = {
259 "address resolved",
260 "address error",
261 "route resolved",
262 "route error",
263 "connect request",
264 "connect response",
265 "connect error",
266 "unreachable",
267 "rejected",
268 "established",
269 "disconnected",
270 "device removal"
271};
272#endif
273
274static int
275rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
276{
277 struct rpcrdma_xprt *xprt = id->context;
278 struct rpcrdma_ia *ia = &xprt->rx_ia;
279 struct rpcrdma_ep *ep = &xprt->rx_ep;
 280#ifdef RPC_DEBUG
 281 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
 282#endif
283 struct ib_qp_attr attr;
284 struct ib_qp_init_attr iattr;
285 int connstate = 0;
286
287 switch (event->event) {
288 case RDMA_CM_EVENT_ADDR_RESOLVED:
289 case RDMA_CM_EVENT_ROUTE_RESOLVED:
 290 ia->ri_async_rc = 0;
291 complete(&ia->ri_done);
292 break;
293 case RDMA_CM_EVENT_ADDR_ERROR:
294 ia->ri_async_rc = -EHOSTUNREACH;
295 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
296 __func__, ep);
297 complete(&ia->ri_done);
298 break;
299 case RDMA_CM_EVENT_ROUTE_ERROR:
300 ia->ri_async_rc = -ENETUNREACH;
301 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
302 __func__, ep);
303 complete(&ia->ri_done);
304 break;
305 case RDMA_CM_EVENT_ESTABLISHED:
306 connstate = 1;
307 ib_query_qp(ia->ri_id->qp, &attr,
308 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
309 &iattr);
310 dprintk("RPC: %s: %d responder resources"
311 " (%d initiator)\n",
312 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
313 goto connected;
314 case RDMA_CM_EVENT_CONNECT_ERROR:
315 connstate = -ENOTCONN;
316 goto connected;
317 case RDMA_CM_EVENT_UNREACHABLE:
318 connstate = -ENETDOWN;
319 goto connected;
320 case RDMA_CM_EVENT_REJECTED:
321 connstate = -ECONNREFUSED;
322 goto connected;
323 case RDMA_CM_EVENT_DISCONNECTED:
324 connstate = -ECONNABORTED;
325 goto connected;
326 case RDMA_CM_EVENT_DEVICE_REMOVAL:
327 connstate = -ENODEV;
328connected:
 329 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
330 __func__,
331 (event->event <= 11) ? conn[event->event] :
332 "unknown connection error",
 333 &addr->sin_addr.s_addr,
334 ntohs(addr->sin_port),
335 ep, event->event);
336 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
337 dprintk("RPC: %s: %sconnected\n",
338 __func__, connstate > 0 ? "" : "dis");
339 ep->rep_connected = connstate;
340 ep->rep_func(ep);
341 wake_up_all(&ep->rep_connect_wait);
342 break;
343 default:
 344 dprintk("RPC: %s: unexpected CM event %d\n",
 345 __func__, event->event);
346 break;
347 }
348
349#ifdef RPC_DEBUG
350 if (connstate == 1) {
351 int ird = attr.max_dest_rd_atomic;
352 int tird = ep->rep_remote_cma.responder_resources;
 353 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
 354 "on %s, memreg %d slots %d ird %d%s\n",
 355 &addr->sin_addr.s_addr,
356 ntohs(addr->sin_port),
357 ia->ri_id->device->name,
358 ia->ri_memreg_strategy,
359 xprt->rx_buf.rb_max_requests,
360 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
361 } else if (connstate < 0) {
362 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
363 &addr->sin_addr.s_addr,
364 ntohs(addr->sin_port),
365 connstate);
366 }
367#endif
368
369 return 0;
370}
371
372static struct rdma_cm_id *
373rpcrdma_create_id(struct rpcrdma_xprt *xprt,
374 struct rpcrdma_ia *ia, struct sockaddr *addr)
375{
376 struct rdma_cm_id *id;
377 int rc;
378
379 init_completion(&ia->ri_done);
380
381 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
382 if (IS_ERR(id)) {
383 rc = PTR_ERR(id);
384 dprintk("RPC: %s: rdma_create_id() failed %i\n",
385 __func__, rc);
386 return id;
387 }
388
 389 ia->ri_async_rc = -ETIMEDOUT;
390 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
391 if (rc) {
392 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
393 __func__, rc);
394 goto out;
395 }
396 wait_for_completion_interruptible_timeout(&ia->ri_done,
397 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
398 rc = ia->ri_async_rc;
399 if (rc)
400 goto out;
401
 402 ia->ri_async_rc = -ETIMEDOUT;
403 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
404 if (rc) {
405 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
406 __func__, rc);
407 goto out;
408 }
409 wait_for_completion_interruptible_timeout(&ia->ri_done,
410 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
411 rc = ia->ri_async_rc;
412 if (rc)
413 goto out;
414
415 return id;
416
417out:
418 rdma_destroy_id(id);
419 return ERR_PTR(rc);
420}
421
422/*
423 * Drain any cq, prior to teardown.
424 */
425static void
426rpcrdma_clean_cq(struct ib_cq *cq)
427{
428 struct ib_wc wc;
429 int count = 0;
430
431 while (1 == ib_poll_cq(cq, 1, &wc))
432 ++count;
433
434 if (count)
435 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
436 __func__, count, wc.opcode);
437}
438
439/*
440 * Exported functions.
441 */
442
443/*
444 * Open and initialize an Interface Adapter.
445 * o initializes fields of struct rpcrdma_ia, including
446 * interface and provider attributes and protection zone.
447 */
448int
449rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
450{
451 int rc, mem_priv;
452 struct ib_device_attr devattr;
453 struct rpcrdma_ia *ia = &xprt->rx_ia;
454
455 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
456 if (IS_ERR(ia->ri_id)) {
457 rc = PTR_ERR(ia->ri_id);
458 goto out1;
459 }
460
461 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
462 if (IS_ERR(ia->ri_pd)) {
463 rc = PTR_ERR(ia->ri_pd);
464 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
465 __func__, rc);
466 goto out2;
467 }
468
469 /*
470 * Query the device to determine if the requested memory
471 * registration strategy is supported. If it isn't, set the
472 * strategy to a globally supported model.
473 */
474 rc = ib_query_device(ia->ri_id->device, &devattr);
475 if (rc) {
476 dprintk("RPC: %s: ib_query_device failed %d\n",
477 __func__, rc);
478 goto out2;
479 }
480
481 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
482 ia->ri_have_dma_lkey = 1;
483 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
484 }
485
486 switch (memreg) {
487 case RPCRDMA_MEMWINDOWS:
488 case RPCRDMA_MEMWINDOWS_ASYNC:
489 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
490 dprintk("RPC: %s: MEMWINDOWS registration "
491 "specified but not supported by adapter, "
492 "using slower RPCRDMA_REGISTER\n",
493 __func__);
494 memreg = RPCRDMA_REGISTER;
495 }
496 break;
497 case RPCRDMA_MTHCAFMR:
498 if (!ia->ri_id->device->alloc_fmr) {
499#if RPCRDMA_PERSISTENT_REGISTRATION
500 dprintk("RPC: %s: MTHCAFMR registration "
501 "specified but not supported by adapter, "
502 "using riskier RPCRDMA_ALLPHYSICAL\n",
503 __func__);
504 memreg = RPCRDMA_ALLPHYSICAL;
505#else
506 dprintk("RPC: %s: MTHCAFMR registration "
507 "specified but not supported by adapter, "
508 "using slower RPCRDMA_REGISTER\n",
509 __func__);
510 memreg = RPCRDMA_REGISTER;
511#endif
512 }
513 break;
514 case RPCRDMA_FRMR:
515 /* Requires both frmr reg and local dma lkey */
516 if ((devattr.device_cap_flags &
517 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
518 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
519#if RPCRDMA_PERSISTENT_REGISTRATION
520 dprintk("RPC: %s: FRMR registration "
521 "specified but not supported by adapter, "
522 "using riskier RPCRDMA_ALLPHYSICAL\n",
523 __func__);
524 memreg = RPCRDMA_ALLPHYSICAL;
525#else
526 dprintk("RPC: %s: FRMR registration "
527 "specified but not supported by adapter, "
528 "using slower RPCRDMA_REGISTER\n",
529 __func__);
530 memreg = RPCRDMA_REGISTER;
531#endif
532 }
533 break;
534 }
535
536 /*
537 * Optionally obtain an underlying physical identity mapping in
538 * order to do a memory window-based bind. This base registration
539 * is protected from remote access - that is enabled only by binding
540 * for the specific bytes targeted during each RPC operation, and
541 * revoked after the corresponding completion similar to a storage
542 * adapter.
543 */
544 switch (memreg) {
545 case RPCRDMA_BOUNCEBUFFERS:
546 case RPCRDMA_REGISTER:
 547 case RPCRDMA_FRMR:
 548 break;
 549#if RPCRDMA_PERSISTENT_REGISTRATION
550 case RPCRDMA_ALLPHYSICAL:
551 mem_priv = IB_ACCESS_LOCAL_WRITE |
552 IB_ACCESS_REMOTE_WRITE |
553 IB_ACCESS_REMOTE_READ;
554 goto register_setup;
 555#endif
556 case RPCRDMA_MEMWINDOWS_ASYNC:
557 case RPCRDMA_MEMWINDOWS:
558 mem_priv = IB_ACCESS_LOCAL_WRITE |
559 IB_ACCESS_MW_BIND;
560 goto register_setup;
561 case RPCRDMA_MTHCAFMR:
562 if (ia->ri_have_dma_lkey)
 563 break;
564 mem_priv = IB_ACCESS_LOCAL_WRITE;
565 register_setup:
566 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
567 if (IS_ERR(ia->ri_bind_mem)) {
568 printk(KERN_ALERT "%s: ib_get_dma_mr for "
569 "phys register failed with %lX\n\t"
570 "Will continue with degraded performance\n",
571 __func__, PTR_ERR(ia->ri_bind_mem));
572 memreg = RPCRDMA_REGISTER;
573 ia->ri_bind_mem = NULL;
574 }
575 break;
576 default:
577 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
578 __func__, memreg);
579 rc = -EINVAL;
580 goto out2;
 581 }
582 dprintk("RPC: %s: memory registration strategy is %d\n",
583 __func__, memreg);
584
585 /* Else will do memory reg/dereg for each chunk */
586 ia->ri_memreg_strategy = memreg;
587
588 return 0;
589out2:
590 rdma_destroy_id(ia->ri_id);
 591 ia->ri_id = NULL;
592out1:
593 return rc;
594}
595
596/*
597 * Clean up/close an IA.
598 * o if event handles and PD have been initialized, free them.
599 * o close the IA
600 */
601void
602rpcrdma_ia_close(struct rpcrdma_ia *ia)
603{
604 int rc;
605
606 dprintk("RPC: %s: entering\n", __func__);
607 if (ia->ri_bind_mem != NULL) {
608 rc = ib_dereg_mr(ia->ri_bind_mem);
609 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
610 __func__, rc);
611 }
612 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
613 if (ia->ri_id->qp)
614 rdma_destroy_qp(ia->ri_id);
615 rdma_destroy_id(ia->ri_id);
616 ia->ri_id = NULL;
617 }
618 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
619 rc = ib_dealloc_pd(ia->ri_pd);
620 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
621 __func__, rc);
622 }
623}
624
625/*
626 * Create unconnected endpoint.
627 */
628int
629rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
630 struct rpcrdma_create_data_internal *cdata)
631{
632 struct ib_device_attr devattr;
 633 int rc, err;
634
635 rc = ib_query_device(ia->ri_id->device, &devattr);
636 if (rc) {
637 dprintk("RPC: %s: ib_query_device failed %d\n",
638 __func__, rc);
639 return rc;
640 }
641
642 /* check provider's send/recv wr limits */
643 if (cdata->max_requests > devattr.max_qp_wr)
644 cdata->max_requests = devattr.max_qp_wr;
645
646 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
647 ep->rep_attr.qp_context = ep;
648 /* send_cq and recv_cq initialized below */
649 ep->rep_attr.srq = NULL;
650 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
651 switch (ia->ri_memreg_strategy) {
652 case RPCRDMA_FRMR:
653 /* Add room for frmr register and invalidate WRs */
654 ep->rep_attr.cap.max_send_wr *= 3;
655 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
656 return -EINVAL;
657 break;
658 case RPCRDMA_MEMWINDOWS_ASYNC:
659 case RPCRDMA_MEMWINDOWS:
660 /* Add room for mw_binds+unbinds - overkill! */
661 ep->rep_attr.cap.max_send_wr++;
662 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
663 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
664 return -EINVAL;
665 break;
666 default:
667 break;
668 }
669 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
670 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
671 ep->rep_attr.cap.max_recv_sge = 1;
672 ep->rep_attr.cap.max_inline_data = 0;
673 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
674 ep->rep_attr.qp_type = IB_QPT_RC;
675 ep->rep_attr.port_num = ~0;
676
677 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
678 "iovs: send %d recv %d\n",
679 __func__,
680 ep->rep_attr.cap.max_send_wr,
681 ep->rep_attr.cap.max_recv_wr,
682 ep->rep_attr.cap.max_send_sge,
683 ep->rep_attr.cap.max_recv_sge);
684
685 /* set trigger for requesting send completion */
686 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
687 switch (ia->ri_memreg_strategy) {
688 case RPCRDMA_MEMWINDOWS_ASYNC:
689 case RPCRDMA_MEMWINDOWS:
690 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
691 break;
692 default:
693 break;
694 }
695 if (ep->rep_cqinit <= 2)
696 ep->rep_cqinit = 0;
697 INIT_CQCOUNT(ep);
698 ep->rep_ia = ia;
699 init_waitqueue_head(&ep->rep_connect_wait);
700
701 /*
702 * Create a single cq for receive dto and mw_bind (only ever
703 * care about unbind, really). Send completions are suppressed.
704 * Use single threaded tasklet upcalls to maintain ordering.
705 */
706 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
707 rpcrdma_cq_async_error_upcall, NULL,
708 ep->rep_attr.cap.max_recv_wr +
709 ep->rep_attr.cap.max_send_wr + 1, 0);
710 if (IS_ERR(ep->rep_cq)) {
711 rc = PTR_ERR(ep->rep_cq);
712 dprintk("RPC: %s: ib_create_cq failed: %i\n",
713 __func__, rc);
714 goto out1;
715 }
716
717 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
718 if (rc) {
719 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
720 __func__, rc);
721 goto out2;
722 }
723
724 ep->rep_attr.send_cq = ep->rep_cq;
725 ep->rep_attr.recv_cq = ep->rep_cq;
726
727 /* Initialize cma parameters */
728
729 /* RPC/RDMA does not use private data */
730 ep->rep_remote_cma.private_data = NULL;
731 ep->rep_remote_cma.private_data_len = 0;
732
733 /* Client offers RDMA Read but does not initiate */
734 ep->rep_remote_cma.initiator_depth = 0;
735 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
 736 ep->rep_remote_cma.responder_resources = 0;
737 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
738 ep->rep_remote_cma.responder_resources = 32;
739 else
 740 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
741
742 ep->rep_remote_cma.retry_count = 7;
743 ep->rep_remote_cma.flow_control = 0;
744 ep->rep_remote_cma.rnr_retry_count = 0;
745
746 return 0;
747
748out2:
749 err = ib_destroy_cq(ep->rep_cq);
750 if (err)
751 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
752 __func__, err);
753out1:
754 return rc;
755}
756
757/*
758 * rpcrdma_ep_destroy
759 *
760 * Disconnect and destroy endpoint. After this, the only
761 * valid operations on the ep are to free it (if dynamically
762 * allocated) or re-create it.
763 *
764 * The caller's error handling must be sure to not leak the endpoint
765 * if this function fails.
766 */
767int
768rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
769{
770 int rc;
771
772 dprintk("RPC: %s: entering, connected is %d\n",
773 __func__, ep->rep_connected);
774
775 if (ia->ri_id->qp) {
776 rc = rpcrdma_ep_disconnect(ep, ia);
777 if (rc)
778 dprintk("RPC: %s: rpcrdma_ep_disconnect"
779 " returned %i\n", __func__, rc);
780 rdma_destroy_qp(ia->ri_id);
781 ia->ri_id->qp = NULL;
782 }
783
784 /* padding - could be done in rpcrdma_buffer_destroy... */
785 if (ep->rep_pad_mr) {
786 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
787 ep->rep_pad_mr = NULL;
788 }
789
790 rpcrdma_clean_cq(ep->rep_cq);
791 rc = ib_destroy_cq(ep->rep_cq);
792 if (rc)
793 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
794 __func__, rc);
795
796 return rc;
797}
798
799/*
800 * Connect unconnected endpoint.
801 */
802int
803rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
804{
805 struct rdma_cm_id *id;
806 int rc = 0;
807 int retry_count = 0;
 808
 809 if (ep->rep_connected != 0) {
810 struct rpcrdma_xprt *xprt;
811retry:
812 rc = rpcrdma_ep_disconnect(ep, ia);
813 if (rc && rc != -ENOTCONN)
814 dprintk("RPC: %s: rpcrdma_ep_disconnect"
815 " status %i\n", __func__, rc);
816 rpcrdma_clean_cq(ep->rep_cq);
817
818 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
819 id = rpcrdma_create_id(xprt, ia,
820 (struct sockaddr *)&xprt->rx_data.addr);
821 if (IS_ERR(id)) {
822 rc = PTR_ERR(id);
823 goto out;
824 }
825 /* TEMP TEMP TEMP - fail if new device:
826 * Deregister/remarshal *all* requests!
827 * Close and recreate adapter, pd, etc!
828 * Re-determine all attributes still sane!
829 * More stuff I haven't thought of!
830 * Rrrgh!
831 */
832 if (ia->ri_id->device != id->device) {
833 printk("RPC: %s: can't reconnect on "
834 "different device!\n", __func__);
835 rdma_destroy_id(id);
836 rc = -ENETDOWN;
837 goto out;
838 }
839 /* END TEMP */
 840 rdma_destroy_qp(ia->ri_id);
841 rdma_destroy_id(ia->ri_id);
842 ia->ri_id = id;
843 }
844
845 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
846 if (rc) {
847 dprintk("RPC: %s: rdma_create_qp failed %i\n",
848 __func__, rc);
849 goto out;
850 }
851
852/* XXX Tavor device performs badly with 2K MTU! */
853if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
854 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
855 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
856 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
857 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
858 struct ib_qp_attr attr = {
859 .path_mtu = IB_MTU_1024
860 };
861 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
862 }
863}
864
865 ep->rep_connected = 0;
866
867 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
868 if (rc) {
869 dprintk("RPC: %s: rdma_connect() failed with %i\n",
870 __func__, rc);
871 goto out;
872 }
873
874 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
875
876 /*
877 * Check state. A non-peer reject indicates no listener
878 * (ECONNREFUSED), which may be a transient state. All
879 * others indicate a transport condition which has already
880 * undergone a best-effort.
881 */
882 if (ep->rep_connected == -ECONNREFUSED &&
883 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
884 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
885 goto retry;
886 }
887 if (ep->rep_connected <= 0) {
888 /* Sometimes, the only way to reliably connect to remote
889 * CMs is to use same nonzero values for ORD and IRD. */
890 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
891 (ep->rep_remote_cma.responder_resources == 0 ||
892 ep->rep_remote_cma.initiator_depth !=
893 ep->rep_remote_cma.responder_resources)) {
894 if (ep->rep_remote_cma.responder_resources == 0)
895 ep->rep_remote_cma.responder_resources = 1;
896 ep->rep_remote_cma.initiator_depth =
897 ep->rep_remote_cma.responder_resources;
 898 goto retry;
 899 }
900 rc = ep->rep_connected;
901 } else {
902 dprintk("RPC: %s: connected\n", __func__);
903 }
904
905out:
906 if (rc)
907 ep->rep_connected = rc;
908 return rc;
909}
910
911/*
912 * rpcrdma_ep_disconnect
913 *
914 * This is separate from destroy to facilitate the ability
915 * to reconnect without recreating the endpoint.
916 *
917 * This call is not reentrant, and must not be made in parallel
918 * on the same endpoint.
919 */
920int
921rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
922{
923 int rc;
924
925 rpcrdma_clean_cq(ep->rep_cq);
926 rc = rdma_disconnect(ia->ri_id);
927 if (!rc) {
928 /* returns without wait if not connected */
929 wait_event_interruptible(ep->rep_connect_wait,
930 ep->rep_connected != 1);
931 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
932 (ep->rep_connected == 1) ? "still " : "dis");
933 } else {
934 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
935 ep->rep_connected = rc;
936 }
937 return rc;
938}
939
940/*
941 * Initialize buffer memory
942 */
943int
944rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
945 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
946{
947 char *p;
948 size_t len;
949 int i, rc;
 950 struct rpcrdma_mw *r;
951
952 buf->rb_max_requests = cdata->max_requests;
953 spin_lock_init(&buf->rb_lock);
954 atomic_set(&buf->rb_credits, 1);
955
956 /* Need to allocate:
957 * 1. arrays for send and recv pointers
958 * 2. arrays of struct rpcrdma_req to fill in pointers
959 * 3. array of struct rpcrdma_rep for replies
960 * 4. padding, if any
 961 * 5. mw's, fmr's or frmr's, if any
962 * Send/recv buffers in req/rep need to be registered
963 */
964
965 len = buf->rb_max_requests *
966 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
967 len += cdata->padding;
968 switch (ia->ri_memreg_strategy) {
969 case RPCRDMA_FRMR:
970 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
971 sizeof(struct rpcrdma_mw);
972 break;
973 case RPCRDMA_MTHCAFMR:
974 /* TBD we are perhaps overallocating here */
975 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
976 sizeof(struct rpcrdma_mw);
977 break;
978 case RPCRDMA_MEMWINDOWS_ASYNC:
979 case RPCRDMA_MEMWINDOWS:
980 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
981 sizeof(struct rpcrdma_mw);
982 break;
983 default:
984 break;
985 }
986
987 /* allocate 1, 4 and 5 in one shot */
988 p = kzalloc(len, GFP_KERNEL);
989 if (p == NULL) {
990 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
991 __func__, len);
992 rc = -ENOMEM;
993 goto out;
994 }
995 buf->rb_pool = p; /* for freeing it later */
996
997 buf->rb_send_bufs = (struct rpcrdma_req **) p;
998 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
999 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1000 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1001
1002 /*
1003 * Register the zeroed pad buffer, if any.
1004 */
1005 if (cdata->padding) {
1006 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1007 &ep->rep_pad_mr, &ep->rep_pad);
1008 if (rc)
1009 goto out;
1010 }
1011 p += cdata->padding;
1012
1013 /*
1014 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1015 * We "cycle" the mw's in order to minimize rkey reuse,
1016 * and also reduce unbind-to-bind collision.
1017 */
1018 INIT_LIST_HEAD(&buf->rb_mws);
 1019 r = (struct rpcrdma_mw *)p;
 1020 switch (ia->ri_memreg_strategy) {
1021 case RPCRDMA_FRMR:
1022 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1023 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1024 RPCRDMA_MAX_SEGS);
1025 if (IS_ERR(r->r.frmr.fr_mr)) {
1026 rc = PTR_ERR(r->r.frmr.fr_mr);
1027 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1028 " failed %i\n", __func__, rc);
1029 goto out;
1030 }
1031 r->r.frmr.fr_pgl =
1032 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1033 RPCRDMA_MAX_SEGS);
1034 if (IS_ERR(r->r.frmr.fr_pgl)) {
1035 rc = PTR_ERR(r->r.frmr.fr_pgl);
1036 dprintk("RPC: %s: "
1037 "ib_alloc_fast_reg_page_list "
1038 "failed %i\n", __func__, rc);
1039 goto out;
1040 }
1041 list_add(&r->mw_list, &buf->rb_mws);
1042 ++r;
1043 }
1044 break;
 1045 case RPCRDMA_MTHCAFMR:
1046 /* TBD we are perhaps overallocating here */
1047 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1048 static struct ib_fmr_attr fa =
1049 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1050 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1051 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1052 &fa);
1053 if (IS_ERR(r->r.fmr)) {
1054 rc = PTR_ERR(r->r.fmr);
1055 dprintk("RPC: %s: ib_alloc_fmr"
1056 " failed %i\n", __func__, rc);
1057 goto out;
1058 }
1059 list_add(&r->mw_list, &buf->rb_mws);
1060 ++r;
1061 }
1062 break;
1063 case RPCRDMA_MEMWINDOWS_ASYNC:
1064 case RPCRDMA_MEMWINDOWS:
1065 /* Allocate one extra request's worth, for full cycling */
1066 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1067 r->r.mw = ib_alloc_mw(ia->ri_pd);
1068 if (IS_ERR(r->r.mw)) {
1069 rc = PTR_ERR(r->r.mw);
1070 dprintk("RPC: %s: ib_alloc_mw"
1071 " failed %i\n", __func__, rc);
1072 goto out;
1073 }
1074 list_add(&r->mw_list, &buf->rb_mws);
1075 ++r;
1076 }
1077 break;
1078 default:
1079 break;
1080 }
1081
1082 /*
1083 * Allocate/init the request/reply buffers. Doing this
1084 * using kmalloc for now -- one for each buf.
1085 */
1086 for (i = 0; i < buf->rb_max_requests; i++) {
1087 struct rpcrdma_req *req;
1088 struct rpcrdma_rep *rep;
1089
1090 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1091 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1092 /* Typical ~2400b, so rounding up saves work later */
1093 if (len < 4096)
1094 len = 4096;
1095 req = kmalloc(len, GFP_KERNEL);
1096 if (req == NULL) {
1097 dprintk("RPC: %s: request buffer %d alloc"
1098 " failed\n", __func__, i);
1099 rc = -ENOMEM;
1100 goto out;
1101 }
1102 memset(req, 0, sizeof(struct rpcrdma_req));
1103 buf->rb_send_bufs[i] = req;
1104 buf->rb_send_bufs[i]->rl_buffer = buf;
1105
1106 rc = rpcrdma_register_internal(ia, req->rl_base,
1107 len - offsetof(struct rpcrdma_req, rl_base),
1108 &buf->rb_send_bufs[i]->rl_handle,
1109 &buf->rb_send_bufs[i]->rl_iov);
1110 if (rc)
1111 goto out;
1112
1113 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1114
1115 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1116 rep = kmalloc(len, GFP_KERNEL);
1117 if (rep == NULL) {
1118 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1119 __func__, i);
1120 rc = -ENOMEM;
1121 goto out;
1122 }
1123 memset(rep, 0, sizeof(struct rpcrdma_rep));
1124 buf->rb_recv_bufs[i] = rep;
1125 buf->rb_recv_bufs[i]->rr_buffer = buf;
1126 init_waitqueue_head(&rep->rr_unbind);
1127
1128 rc = rpcrdma_register_internal(ia, rep->rr_base,
1129 len - offsetof(struct rpcrdma_rep, rr_base),
1130 &buf->rb_recv_bufs[i]->rr_handle,
1131 &buf->rb_recv_bufs[i]->rr_iov);
1132 if (rc)
1133 goto out;
1134
1135 }
1136 dprintk("RPC: %s: max_requests %d\n",
1137 __func__, buf->rb_max_requests);
1138 /* done */
1139 return 0;
1140out:
1141 rpcrdma_buffer_destroy(buf);
1142 return rc;
1143}
1144
1145/*
1146 * Unregister and destroy buffer memory. Need to deal with
1147 * partial initialization, so it's callable from failed create.
1148 * Must be called before destroying endpoint, as registrations
1149 * reference it.
1150 */
1151void
1152rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1153{
1154 int rc, i;
1155 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 1156 struct rpcrdma_mw *r;
1157
1158 /* clean up in reverse order from create
1159 * 1. recv mr memory (mr free, then kfree)
1160 * 1a. bind mw memory
1161 * 2. send mr memory (mr free, then kfree)
1162 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1163 * 4. arrays
1164 */
1165 dprintk("RPC: %s: entering\n", __func__);
1166
1167 for (i = 0; i < buf->rb_max_requests; i++) {
1168 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1169 rpcrdma_deregister_internal(ia,
1170 buf->rb_recv_bufs[i]->rr_handle,
1171 &buf->rb_recv_bufs[i]->rr_iov);
1172 kfree(buf->rb_recv_bufs[i]);
1173 }
1174 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1175 while (!list_empty(&buf->rb_mws)) {
1176 r = list_entry(buf->rb_mws.next,
1177 struct rpcrdma_mw, mw_list);
1178 list_del(&r->mw_list);
1179 switch (ia->ri_memreg_strategy) {
1180 case RPCRDMA_FRMR:
1181 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1182 if (rc)
1183 dprintk("RPC: %s:"
1184 " ib_dereg_mr"
1185 " failed %i\n",
1186 __func__, rc);
1187 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1188 break;
1189 case RPCRDMA_MTHCAFMR:
1190 rc = ib_dealloc_fmr(r->r.fmr);
1191 if (rc)
1192 dprintk("RPC: %s:"
1193 " ib_dealloc_fmr"
1194 " failed %i\n",
1195 __func__, rc);
1196 break;
1197 case RPCRDMA_MEMWINDOWS_ASYNC:
1198 case RPCRDMA_MEMWINDOWS:
1199 rc = ib_dealloc_mw(r->r.mw);
1200 if (rc)
1201 dprintk("RPC: %s:"
1202 " ib_dealloc_mw"
1203 " failed %i\n",
1204 __func__, rc);
1205 break;
1206 default:
1207 break;
1208 }
1209 }
1210 rpcrdma_deregister_internal(ia,
1211 buf->rb_send_bufs[i]->rl_handle,
1212 &buf->rb_send_bufs[i]->rl_iov);
1213 kfree(buf->rb_send_bufs[i]);
1214 }
1215 }
1216
1217 kfree(buf->rb_pool);
1218}
1219
1220/*
1221 * Get a set of request/reply buffers.
1222 *
1223 * Reply buffer (if needed) is attached to send buffer upon return.
1224 * Rule:
1225 * rb_send_index and rb_recv_index MUST always be pointing to the
1226 * *next* available buffer (non-NULL). They are incremented after
1227 * removing buffers, and decremented *before* returning them.
1228 */
1229struct rpcrdma_req *
1230rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1231{
1232 struct rpcrdma_req *req;
1233 unsigned long flags;
1234 int i;
1235 struct rpcrdma_mw *r;
1236
1237 spin_lock_irqsave(&buffers->rb_lock, flags);
1238 if (buffers->rb_send_index == buffers->rb_max_requests) {
1239 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1240 dprintk("RPC: %s: out of request buffers\n", __func__);
1241 return ((struct rpcrdma_req *)NULL);
1242 }
1243
1244 req = buffers->rb_send_bufs[buffers->rb_send_index];
1245 if (buffers->rb_send_index < buffers->rb_recv_index) {
1246 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1247 __func__,
1248 buffers->rb_recv_index - buffers->rb_send_index);
1249 req->rl_reply = NULL;
1250 } else {
1251 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1252 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1253 }
1254 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1255 if (!list_empty(&buffers->rb_mws)) {
 1256 i = RPCRDMA_MAX_SEGS - 1;
 1257 do {
1258 r = list_entry(buffers->rb_mws.next,
1259 struct rpcrdma_mw, mw_list);
1260 list_del(&r->mw_list);
1261 req->rl_segments[i].mr_chunk.rl_mw = r;
1262 } while (--i >= 0);
1263 }
1264 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1265 return req;
1266}
1267
1268/*
1269 * Put request/reply buffers back into pool.
1270 * Pre-decrement counter/array index.
1271 */
1272void
1273rpcrdma_buffer_put(struct rpcrdma_req *req)
1274{
1275 struct rpcrdma_buffer *buffers = req->rl_buffer;
1276 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1277 int i;
1278 unsigned long flags;
1279
1280 BUG_ON(req->rl_nchunks != 0);
1281 spin_lock_irqsave(&buffers->rb_lock, flags);
1282 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1283 req->rl_niovs = 0;
1284 if (req->rl_reply) {
1285 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1286 init_waitqueue_head(&req->rl_reply->rr_unbind);
1287 req->rl_reply->rr_func = NULL;
1288 req->rl_reply = NULL;
1289 }
1290 switch (ia->ri_memreg_strategy) {
 1291 case RPCRDMA_FRMR:
1292 case RPCRDMA_MTHCAFMR:
1293 case RPCRDMA_MEMWINDOWS_ASYNC:
1294 case RPCRDMA_MEMWINDOWS:
1295 /*
1296 * Cycle mw's back in reverse order, and "spin" them.
1297 * This delays and scrambles reuse as much as possible.
1298 */
1299 i = 1;
1300 do {
1301 struct rpcrdma_mw **mw;
1302 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1303 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1304 *mw = NULL;
1305 } while (++i < RPCRDMA_MAX_SEGS);
1306 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1307 &buffers->rb_mws);
1308 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1309 break;
1310 default:
1311 break;
1312 }
1313 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1314}
1315
1316/*
1317 * Recover reply buffers from pool.
1318 * This happens when recovering from error conditions.
1319 * Post-increment counter/array index.
1320 */
1321void
1322rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1323{
1324 struct rpcrdma_buffer *buffers = req->rl_buffer;
1325 unsigned long flags;
1326
1327 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1328 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1329 spin_lock_irqsave(&buffers->rb_lock, flags);
1330 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1331 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1332 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1333 }
1334 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335}
1336
1337/*
1338 * Put reply buffers back into pool when not attached to
1339 * request. This happens in error conditions, and when
1340 * aborting unbinds. Pre-decrement counter/array index.
1341 */
1342void
1343rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1344{
1345 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1346 unsigned long flags;
1347
1348 rep->rr_func = NULL;
1349 spin_lock_irqsave(&buffers->rb_lock, flags);
1350 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1351 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1352}
1353
1354/*
1355 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1356 */
1357
1358int
1359rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1360 struct ib_mr **mrp, struct ib_sge *iov)
1361{
1362 struct ib_phys_buf ipb;
1363 struct ib_mr *mr;
1364 int rc;
1365
1366 /*
1367 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1368 */
1369 iov->addr = ib_dma_map_single(ia->ri_id->device,
1370 va, len, DMA_BIDIRECTIONAL);
1371 iov->length = len;
1372
1373 if (ia->ri_have_dma_lkey) {
1374 *mrp = NULL;
1375 iov->lkey = ia->ri_dma_lkey;
1376 return 0;
1377 } else if (ia->ri_bind_mem != NULL) {
1378 *mrp = NULL;
1379 iov->lkey = ia->ri_bind_mem->lkey;
1380 return 0;
1381 }
1382
1383 ipb.addr = iov->addr;
1384 ipb.size = iov->length;
1385 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1386 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1387
1388 dprintk("RPC: %s: phys convert: 0x%llx "
1389 "registered 0x%llx length %d\n",
1390 __func__, (unsigned long long)ipb.addr,
1391 (unsigned long long)iov->addr, len);
1392
1393 if (IS_ERR(mr)) {
1394 *mrp = NULL;
1395 rc = PTR_ERR(mr);
1396 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1397 } else {
1398 *mrp = mr;
1399 iov->lkey = mr->lkey;
1400 rc = 0;
1401 }
1402
1403 return rc;
1404}
1405
1406int
1407rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1408 struct ib_mr *mr, struct ib_sge *iov)
1409{
1410 int rc;
1411
1412 ib_dma_unmap_single(ia->ri_id->device,
1413 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1414
1415 if (NULL == mr)
1416 return 0;
1417
1418 rc = ib_dereg_mr(mr);
1419 if (rc)
1420 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1421 return rc;
1422}
1423
1424/*
1425 * Wrappers for chunk registration, shared by read/write chunk code.
1426 */
1427
1428static void
1429rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1430{
1431 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1432 seg->mr_dmalen = seg->mr_len;
1433 if (seg->mr_page)
1434 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1435 seg->mr_page, offset_in_page(seg->mr_offset),
1436 seg->mr_dmalen, seg->mr_dir);
1437 else
1438 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1439 seg->mr_offset,
1440 seg->mr_dmalen, seg->mr_dir);
1441}
1442
1443static void
1444rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1445{
1446 if (seg->mr_page)
1447 ib_dma_unmap_page(ia->ri_id->device,
1448 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1449 else
1450 ib_dma_unmap_single(ia->ri_id->device,
1451 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1452}
1453
1454static int
1455rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1456 int *nsegs, int writing, struct rpcrdma_ia *ia,
1457 struct rpcrdma_xprt *r_xprt)
1458{
1459 struct rpcrdma_mr_seg *seg1 = seg;
1460 struct ib_send_wr frmr_wr, *bad_wr;
1461 u8 key;
1462 int len, pageoff;
1463 int i, rc;
1464
1465 pageoff = offset_in_page(seg1->mr_offset);
1466 seg1->mr_offset -= pageoff; /* start of page */
1467 seg1->mr_len += pageoff;
1468 len = -pageoff;
1469 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1470 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1471 for (i = 0; i < *nsegs;) {
1472 rpcrdma_map_one(ia, seg, writing);
1473 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1474 len += seg->mr_len;
1475 ++seg;
1476 ++i;
1477 /* Check for holes */
1478 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1479 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1480 break;
1481 }
1482 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1483 __func__, seg1->mr_chunk.rl_mw, i);
1484
1485 /* Bump the key */
1486 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1487 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1488
1489 /* Prepare FRMR WR */
1490 memset(&frmr_wr, 0, sizeof frmr_wr);
1491 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1492 frmr_wr.send_flags = 0; /* unsignaled */
1493 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1494 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1495 frmr_wr.wr.fast_reg.page_list_len = i;
1496 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1497 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1498 frmr_wr.wr.fast_reg.access_flags = (writing ?
1499 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1500 IB_ACCESS_REMOTE_READ);
1501 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1502 DECR_CQCOUNT(&r_xprt->rx_ep);
1503
1504 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1505
1506 if (rc) {
1507 dprintk("RPC: %s: failed ib_post_send for register,"
1508 " status %i\n", __func__, rc);
1509 while (i--)
1510 rpcrdma_unmap_one(ia, --seg);
1511 } else {
1512 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1513 seg1->mr_base = seg1->mr_dma + pageoff;
1514 seg1->mr_nsegs = i;
1515 seg1->mr_len = len;
1516 }
1517 *nsegs = i;
1518 return rc;
1519}
1520
1521static int
1522rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1523 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1524{
1525 struct rpcrdma_mr_seg *seg1 = seg;
1526 struct ib_send_wr invalidate_wr, *bad_wr;
1527 int rc;
1528
1529 while (seg1->mr_nsegs--)
1530 rpcrdma_unmap_one(ia, seg++);
1531
1532 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1533 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1534 invalidate_wr.send_flags = 0; /* unsignaled */
1535 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1536 DECR_CQCOUNT(&r_xprt->rx_ep);
1537
1538 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1539 if (rc)
1540 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1541 " status %i\n", __func__, rc);
1542 return rc;
1543}
1544
1545static int
1546rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1547 int *nsegs, int writing, struct rpcrdma_ia *ia)
1548{
1549 struct rpcrdma_mr_seg *seg1 = seg;
1550 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1551 int len, pageoff, i, rc;
1552
1553 pageoff = offset_in_page(seg1->mr_offset);
1554 seg1->mr_offset -= pageoff; /* start of page */
1555 seg1->mr_len += pageoff;
1556 len = -pageoff;
1557 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1558 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1559 for (i = 0; i < *nsegs;) {
1560 rpcrdma_map_one(ia, seg, writing);
1561 physaddrs[i] = seg->mr_dma;
1562 len += seg->mr_len;
1563 ++seg;
1564 ++i;
1565 /* Check for holes */
1566 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1567 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1568 break;
1569 }
1570 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1571 physaddrs, i, seg1->mr_dma);
1572 if (rc) {
1573 dprintk("RPC: %s: failed ib_map_phys_fmr "
1574 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1575 len, (unsigned long long)seg1->mr_dma,
1576 pageoff, i, rc);
1577 while (i--)
1578 rpcrdma_unmap_one(ia, --seg);
1579 } else {
1580 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1581 seg1->mr_base = seg1->mr_dma + pageoff;
1582 seg1->mr_nsegs = i;
1583 seg1->mr_len = len;
1584 }
1585 *nsegs = i;
1586 return rc;
1587}
1588
1589static int
1590rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1591 struct rpcrdma_ia *ia)
1592{
1593 struct rpcrdma_mr_seg *seg1 = seg;
1594 LIST_HEAD(l);
1595 int rc;
1596
1597 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1598 rc = ib_unmap_fmr(&l);
1599 while (seg1->mr_nsegs--)
1600 rpcrdma_unmap_one(ia, seg++);
1601 if (rc)
1602 dprintk("RPC: %s: failed ib_unmap_fmr,"
1603 " status %i\n", __func__, rc);
1604 return rc;
1605}
1606
1607static int
1608rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1609 int *nsegs, int writing, struct rpcrdma_ia *ia,
1610 struct rpcrdma_xprt *r_xprt)
1611{
1612 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1613 IB_ACCESS_REMOTE_READ);
1614 struct ib_mw_bind param;
1615 int rc;
1616
1617 *nsegs = 1;
1618 rpcrdma_map_one(ia, seg, writing);
1619 param.mr = ia->ri_bind_mem;
1620 param.wr_id = 0ULL; /* no send cookie */
1621 param.addr = seg->mr_dma;
1622 param.length = seg->mr_len;
1623 param.send_flags = 0;
1624 param.mw_access_flags = mem_priv;
1625
1626 DECR_CQCOUNT(&r_xprt->rx_ep);
1627 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1628 if (rc) {
1629 dprintk("RPC: %s: failed ib_bind_mw "
1630 "%u@0x%llx status %i\n",
1631 __func__, seg->mr_len,
1632 (unsigned long long)seg->mr_dma, rc);
1633 rpcrdma_unmap_one(ia, seg);
1634 } else {
1635 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1636 seg->mr_base = param.addr;
1637 seg->mr_nsegs = 1;
1638 }
1639 return rc;
1640}
1641
1642static int
1643rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1644 struct rpcrdma_ia *ia,
1645 struct rpcrdma_xprt *r_xprt, void **r)
1646{
1647 struct ib_mw_bind param;
1648 LIST_HEAD(l);
1649 int rc;
1650
1651 BUG_ON(seg->mr_nsegs != 1);
1652 param.mr = ia->ri_bind_mem;
1653 param.addr = 0ULL; /* unbind */
1654 param.length = 0;
1655 param.mw_access_flags = 0;
1656 if (*r) {
1657 param.wr_id = (u64) (unsigned long) *r;
1658 param.send_flags = IB_SEND_SIGNALED;
1659 INIT_CQCOUNT(&r_xprt->rx_ep);
1660 } else {
1661 param.wr_id = 0ULL;
1662 param.send_flags = 0;
1663 DECR_CQCOUNT(&r_xprt->rx_ep);
1664 }
1665 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1666 rpcrdma_unmap_one(ia, seg);
1667 if (rc)
1668 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1669 " status %i\n", __func__, rc);
1670 else
1671 *r = NULL; /* will upcall on completion */
1672 return rc;
1673}
1674
1675static int
1676rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1677 int *nsegs, int writing, struct rpcrdma_ia *ia)
1678{
1679 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1680 IB_ACCESS_REMOTE_READ);
1681 struct rpcrdma_mr_seg *seg1 = seg;
1682 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1683 int len, i, rc = 0;
1684
1685 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1686 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1687 for (len = 0, i = 0; i < *nsegs;) {
1688 rpcrdma_map_one(ia, seg, writing);
1689 ipb[i].addr = seg->mr_dma;
1690 ipb[i].size = seg->mr_len;
1691 len += seg->mr_len;
1692 ++seg;
1693 ++i;
1694 /* Check for holes */
1695 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1696 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1697 break;
1698 }
1699 seg1->mr_base = seg1->mr_dma;
1700 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1701 ipb, i, mem_priv, &seg1->mr_base);
1702 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1703 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1704 dprintk("RPC: %s: failed ib_reg_phys_mr "
1705 "%u@0x%llx (%d)... status %i\n",
1706 __func__, len,
1707 (unsigned long long)seg1->mr_dma, i, rc);
1708 while (i--)
1709 rpcrdma_unmap_one(ia, --seg);
1710 } else {
1711 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1712 seg1->mr_nsegs = i;
1713 seg1->mr_len = len;
1714 }
1715 *nsegs = i;
1716 return rc;
1717}
1718
1719static int
1720rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1721 struct rpcrdma_ia *ia)
1722{
1723 struct rpcrdma_mr_seg *seg1 = seg;
1724 int rc;
1725
1726 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1727 seg1->mr_chunk.rl_mr = NULL;
1728 while (seg1->mr_nsegs--)
1729 rpcrdma_unmap_one(ia, seg++);
1730 if (rc)
1731 dprintk("RPC: %s: failed ib_dereg_mr,"
1732 " status %i\n", __func__, rc);
1733 return rc;
1734}
1735
1736int
1737rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1738 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1739{
1740 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1741 int rc = 0;
1742
1743 switch (ia->ri_memreg_strategy) {
1744
1745#if RPCRDMA_PERSISTENT_REGISTRATION
1746 case RPCRDMA_ALLPHYSICAL:
1747 rpcrdma_map_one(ia, seg, writing);
1748 seg->mr_rkey = ia->ri_bind_mem->rkey;
1749 seg->mr_base = seg->mr_dma;
1750 seg->mr_nsegs = 1;
1751 nsegs = 1;
1752 break;
1753#endif
1754
1755 /* Registration using frmr registration */
1756 case RPCRDMA_FRMR:
1757 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1758 break;
1759
 1760 /* Registration using fmr memory registration */
 1761 case RPCRDMA_MTHCAFMR:
 1762 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1763 break;
1764
1765 /* Registration using memory windows */
1766 case RPCRDMA_MEMWINDOWS_ASYNC:
1767 case RPCRDMA_MEMWINDOWS:
 1768 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1769 break;
1770
1771 /* Default registration each time */
1772 default:
 1773 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1774 break;
1775 }
1776 if (rc)
1777 return -1;
1778
1779 return nsegs;
1780}
1781
1782int
1783rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1784 struct rpcrdma_xprt *r_xprt, void *r)
1785{
1786 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1787 int nsegs = seg->mr_nsegs, rc;
1788
1789 switch (ia->ri_memreg_strategy) {
1790
1791#if RPCRDMA_PERSISTENT_REGISTRATION
1792 case RPCRDMA_ALLPHYSICAL:
1793 BUG_ON(nsegs != 1);
1794 rpcrdma_unmap_one(ia, seg);
1795 rc = 0;
1796 break;
1797#endif
1798
1799 case RPCRDMA_FRMR:
1800 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1801 break;
1802
 1803 case RPCRDMA_MTHCAFMR:
 1804 rc = rpcrdma_deregister_fmr_external(seg, ia);
1805 break;
1806
1807 case RPCRDMA_MEMWINDOWS_ASYNC:
1808 case RPCRDMA_MEMWINDOWS:
 1809 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1810 break;
1811
1812 default:
 1813 rc = rpcrdma_deregister_default_external(seg, ia);
1814 break;
1815 }
1816 if (r) {
1817 struct rpcrdma_rep *rep = r;
1818 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1819 rep->rr_func = NULL;
1820 func(rep); /* dereg done, callback now */
1821 }
1822 return nsegs;
1823}
1824
1825/*
1826 * Prepost any receive buffer, then post send.
1827 *
1828 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1829 */
1830int
1831rpcrdma_ep_post(struct rpcrdma_ia *ia,
1832 struct rpcrdma_ep *ep,
1833 struct rpcrdma_req *req)
1834{
1835 struct ib_send_wr send_wr, *send_wr_fail;
1836 struct rpcrdma_rep *rep = req->rl_reply;
1837 int rc;
1838
1839 if (rep) {
1840 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1841 if (rc)
1842 goto out;
1843 req->rl_reply = NULL;
1844 }
1845
1846 send_wr.next = NULL;
1847 send_wr.wr_id = 0ULL; /* no send cookie */
1848 send_wr.sg_list = req->rl_send_iov;
1849 send_wr.num_sge = req->rl_niovs;
1850 send_wr.opcode = IB_WR_SEND;
1851 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1852 ib_dma_sync_single_for_device(ia->ri_id->device,
1853 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1854 DMA_TO_DEVICE);
1855 ib_dma_sync_single_for_device(ia->ri_id->device,
1856 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1857 DMA_TO_DEVICE);
1858 ib_dma_sync_single_for_device(ia->ri_id->device,
1859 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1860 DMA_TO_DEVICE);
1861
1862 if (DECR_CQCOUNT(ep) > 0)
1863 send_wr.send_flags = 0;
1864 else { /* Provider must take a send completion every now and then */
1865 INIT_CQCOUNT(ep);
1866 send_wr.send_flags = IB_SEND_SIGNALED;
1867 }
1868
1869 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1870 if (rc)
1871 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1872 rc);
1873out:
1874 return rc;
1875}
1876
1877/*
1878 * (Re)post a receive buffer.
1879 */
1880int
1881rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1882 struct rpcrdma_ep *ep,
1883 struct rpcrdma_rep *rep)
1884{
1885 struct ib_recv_wr recv_wr, *recv_wr_fail;
1886 int rc;
1887
1888 recv_wr.next = NULL;
1889 recv_wr.wr_id = (u64) (unsigned long) rep;
1890 recv_wr.sg_list = &rep->rr_iov;
1891 recv_wr.num_sge = 1;
1892
1893 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1894 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1895
1896 DECR_CQCOUNT(ep);
1897 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1898
1899 if (rc)
1900 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1901 rc);
1902 return rc;
1903}