]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/sunrpc/xprtrdma/verbs.c
Merge branch 'fix/asoc' into for-linus
[net-next-2.6.git] / net / sunrpc / xprtrdma / verbs.c
CommitLineData
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
5a0e3ad6 51#include <linux/slab.h>
c56c65fb 52
f58851e6
TT
53#include "xprt_rdma.h"
54
c56c65fb
TT
55/*
56 * Globals/Macros
57 */
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63/*
64 * internal functions
65 */
66
67/*
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
71 */
72
73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105static inline void
106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
114}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
144static inline
145void rpcrdma_event_process(struct ib_wc *wc)
146{
147 struct rpcrdma_rep *rep =
148 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
149
150 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
151 __func__, rep, wc->status, wc->opcode, wc->byte_len);
152
153 if (!rep) /* send or bind completion that we don't care about */
154 return;
155
156 if (IB_WC_SUCCESS != wc->status) {
157 dprintk("RPC: %s: %s WC status %X, connection lost\n",
158 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
159 wc->status);
160 rep->rr_len = ~0U;
161 rpcrdma_schedule_tasklet(rep);
162 return;
163 }
164
165 switch (wc->opcode) {
166 case IB_WC_RECV:
167 rep->rr_len = wc->byte_len;
168 ib_dma_sync_single_for_cpu(
169 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
170 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
171 /* Keep (only) the most recent credits, after check validity */
172 if (rep->rr_len >= 16) {
173 struct rpcrdma_msg *p =
174 (struct rpcrdma_msg *) rep->rr_base;
175 unsigned int credits = ntohl(p->rm_credit);
176 if (credits == 0) {
177 dprintk("RPC: %s: server"
178 " dropped credits to 0!\n", __func__);
179 /* don't deadlock */
180 credits = 1;
181 } else if (credits > rep->rr_buffer->rb_max_requests) {
182 dprintk("RPC: %s: server"
183 " over-crediting: %d (%d)\n",
184 __func__, credits,
185 rep->rr_buffer->rb_max_requests);
186 credits = rep->rr_buffer->rb_max_requests;
187 }
188 atomic_set(&rep->rr_buffer->rb_credits, credits);
189 }
190 /* fall through */
191 case IB_WC_BIND_MW:
192 rpcrdma_schedule_tasklet(rep);
193 break;
194 default:
195 dprintk("RPC: %s: unexpected WC event %X\n",
196 __func__, wc->opcode);
197 break;
198 }
199}
200
201static inline int
202rpcrdma_cq_poll(struct ib_cq *cq)
203{
204 struct ib_wc wc;
205 int rc;
206
207 for (;;) {
208 rc = ib_poll_cq(cq, 1, &wc);
209 if (rc < 0) {
210 dprintk("RPC: %s: ib_poll_cq failed %i\n",
211 __func__, rc);
212 return rc;
213 }
214 if (rc == 0)
215 break;
216
217 rpcrdma_event_process(&wc);
218 }
219
220 return 0;
221}
222
223/*
224 * rpcrdma_cq_event_upcall
225 *
226 * This upcall handles recv, send, bind and unbind events.
227 * It is reentrant but processes single events in order to maintain
228 * ordering of receives to keep server credits.
229 *
230 * It is the responsibility of the scheduled tasklet to return
231 * recv buffers to the pool. NOTE: this affects synchronization of
232 * connection shutdown. That is, the structures required for
233 * the completion of the reply handler must remain intact until
234 * all memory has been reclaimed.
235 *
236 * Note that send events are suppressed and do not result in an upcall.
237 */
238static void
239rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
240{
241 int rc;
242
243 rc = rpcrdma_cq_poll(cq);
244 if (rc)
245 return;
246
247 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
248 if (rc) {
249 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
250 __func__, rc);
251 return;
252 }
253
254 rpcrdma_cq_poll(cq);
255}
256
#ifdef RPC_DEBUG
/* Printable names for connection manager events, indexed by event number. */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

/*
 * Connection manager event handler, registered via rdma_create_id().
 *
 * @id:    the cm_id the event is for; id->context is the rpcrdma_xprt
 * @event: the event being delivered
 *
 * Address/route resolution events store their outcome in
 * ia->ri_async_rc and complete ia->ri_done, which rpcrdma_create_id()
 * waits on. Connection state events record the new state in
 * ep->rep_connected (1 when established, a negative errno otherwise),
 * reset the credit count to 1, invoke ep->rep_func and wake waiters on
 * ep->rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/*
		 * The queried attributes are used only for the log
		 * messages below. If the query fails, zero them so that
		 * uninitialised stack contents are never printed.
		 */
		if (ib_query_qp(ia->ri_id->qp, &attr,
				IB_QP_MAX_QP_RD_ATOMIC |
				IB_QP_MAX_DEST_RD_ATOMIC,
				&iattr))
			memset(&attr, 0, sizeof(attr));
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* any change of connection state restarts crediting at 1 */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
371
372static struct rdma_cm_id *
373rpcrdma_create_id(struct rpcrdma_xprt *xprt,
374 struct rpcrdma_ia *ia, struct sockaddr *addr)
375{
376 struct rdma_cm_id *id;
377 int rc;
378
1a954051
TT
379 init_completion(&ia->ri_done);
380
c56c65fb
TT
381 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
382 if (IS_ERR(id)) {
383 rc = PTR_ERR(id);
384 dprintk("RPC: %s: rdma_create_id() failed %i\n",
385 __func__, rc);
386 return id;
387 }
388
5675add3 389 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
390 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
391 if (rc) {
392 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
393 __func__, rc);
394 goto out;
395 }
5675add3
TT
396 wait_for_completion_interruptible_timeout(&ia->ri_done,
397 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
398 rc = ia->ri_async_rc;
399 if (rc)
400 goto out;
401
5675add3 402 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
403 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
404 if (rc) {
405 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
406 __func__, rc);
407 goto out;
408 }
5675add3
TT
409 wait_for_completion_interruptible_timeout(&ia->ri_done,
410 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
411 rc = ia->ri_async_rc;
412 if (rc)
413 goto out;
414
415 return id;
416
417out:
418 rdma_destroy_id(id);
419 return ERR_PTR(rc);
420}
421
422/*
423 * Drain any cq, prior to teardown.
424 */
425static void
426rpcrdma_clean_cq(struct ib_cq *cq)
427{
428 struct ib_wc wc;
429 int count = 0;
430
431 while (1 == ib_poll_cq(cq, 1, &wc))
432 ++count;
433
434 if (count)
435 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
436 __func__, count, wc.opcode);
437}
438
439/*
440 * Exported functions.
441 */
442
443/*
444 * Open and initialize an Interface Adapter.
445 * o initializes fields of struct rpcrdma_ia, including
446 * interface and provider attributes and protection zone.
447 */
448int
449rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
450{
bd7ed1d1
TT
451 int rc, mem_priv;
452 struct ib_device_attr devattr;
c56c65fb
TT
453 struct rpcrdma_ia *ia = &xprt->rx_ia;
454
c56c65fb
TT
455 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
456 if (IS_ERR(ia->ri_id)) {
457 rc = PTR_ERR(ia->ri_id);
458 goto out1;
459 }
460
461 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
462 if (IS_ERR(ia->ri_pd)) {
463 rc = PTR_ERR(ia->ri_pd);
464 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
465 __func__, rc);
466 goto out2;
467 }
468
bd7ed1d1
TT
469 /*
470 * Query the device to determine if the requested memory
471 * registration strategy is supported. If it isn't, set the
472 * strategy to a globally supported model.
473 */
474 rc = ib_query_device(ia->ri_id->device, &devattr);
475 if (rc) {
476 dprintk("RPC: %s: ib_query_device failed %d\n",
477 __func__, rc);
478 goto out2;
479 }
480
481 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
482 ia->ri_have_dma_lkey = 1;
483 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
484 }
485
486 switch (memreg) {
487 case RPCRDMA_MEMWINDOWS:
488 case RPCRDMA_MEMWINDOWS_ASYNC:
489 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
490 dprintk("RPC: %s: MEMWINDOWS registration "
491 "specified but not supported by adapter, "
492 "using slower RPCRDMA_REGISTER\n",
493 __func__);
494 memreg = RPCRDMA_REGISTER;
495 }
496 break;
497 case RPCRDMA_MTHCAFMR:
498 if (!ia->ri_id->device->alloc_fmr) {
499#if RPCRDMA_PERSISTENT_REGISTRATION
500 dprintk("RPC: %s: MTHCAFMR registration "
501 "specified but not supported by adapter, "
502 "using riskier RPCRDMA_ALLPHYSICAL\n",
503 __func__);
504 memreg = RPCRDMA_ALLPHYSICAL;
505#else
506 dprintk("RPC: %s: MTHCAFMR registration "
507 "specified but not supported by adapter, "
508 "using slower RPCRDMA_REGISTER\n",
509 __func__);
510 memreg = RPCRDMA_REGISTER;
3197d309
TT
511#endif
512 }
513 break;
514 case RPCRDMA_FRMR:
515 /* Requires both frmr reg and local dma lkey */
516 if ((devattr.device_cap_flags &
517 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
518 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
519#if RPCRDMA_PERSISTENT_REGISTRATION
520 dprintk("RPC: %s: FRMR registration "
521 "specified but not supported by adapter, "
522 "using riskier RPCRDMA_ALLPHYSICAL\n",
523 __func__);
524 memreg = RPCRDMA_ALLPHYSICAL;
525#else
526 dprintk("RPC: %s: FRMR registration "
527 "specified but not supported by adapter, "
528 "using slower RPCRDMA_REGISTER\n",
529 __func__);
530 memreg = RPCRDMA_REGISTER;
bd7ed1d1
TT
531#endif
532 }
533 break;
534 }
535
c56c65fb
TT
536 /*
537 * Optionally obtain an underlying physical identity mapping in
538 * order to do a memory window-based bind. This base registration
539 * is protected from remote access - that is enabled only by binding
540 * for the specific bytes targeted during each RPC operation, and
541 * revoked after the corresponding completion similar to a storage
542 * adapter.
543 */
bd7ed1d1
TT
544 switch (memreg) {
545 case RPCRDMA_BOUNCEBUFFERS:
546 case RPCRDMA_REGISTER:
3197d309 547 case RPCRDMA_FRMR:
bd7ed1d1 548 break;
c56c65fb 549#if RPCRDMA_PERSISTENT_REGISTRATION
bd7ed1d1
TT
550 case RPCRDMA_ALLPHYSICAL:
551 mem_priv = IB_ACCESS_LOCAL_WRITE |
552 IB_ACCESS_REMOTE_WRITE |
553 IB_ACCESS_REMOTE_READ;
554 goto register_setup;
c56c65fb 555#endif
bd7ed1d1
TT
556 case RPCRDMA_MEMWINDOWS_ASYNC:
557 case RPCRDMA_MEMWINDOWS:
558 mem_priv = IB_ACCESS_LOCAL_WRITE |
559 IB_ACCESS_MW_BIND;
560 goto register_setup;
561 case RPCRDMA_MTHCAFMR:
562 if (ia->ri_have_dma_lkey)
c56c65fb 563 break;
bd7ed1d1
TT
564 mem_priv = IB_ACCESS_LOCAL_WRITE;
565 register_setup:
c56c65fb
TT
566 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
567 if (IS_ERR(ia->ri_bind_mem)) {
568 printk(KERN_ALERT "%s: ib_get_dma_mr for "
569 "phys register failed with %lX\n\t"
570 "Will continue with degraded performance\n",
571 __func__, PTR_ERR(ia->ri_bind_mem));
572 memreg = RPCRDMA_REGISTER;
573 ia->ri_bind_mem = NULL;
574 }
bd7ed1d1
TT
575 break;
576 default:
577 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
578 __func__, memreg);
579 rc = -EINVAL;
580 goto out2;
c56c65fb 581 }
bd7ed1d1
TT
582 dprintk("RPC: %s: memory registration strategy is %d\n",
583 __func__, memreg);
c56c65fb
TT
584
585 /* Else will do memory reg/dereg for each chunk */
586 ia->ri_memreg_strategy = memreg;
587
588 return 0;
589out2:
590 rdma_destroy_id(ia->ri_id);
fee08caf 591 ia->ri_id = NULL;
c56c65fb
TT
592out1:
593 return rc;
594}
595
596/*
597 * Clean up/close an IA.
598 * o if event handles and PD have been initialized, free them.
599 * o close the IA
600 */
601void
602rpcrdma_ia_close(struct rpcrdma_ia *ia)
603{
604 int rc;
605
606 dprintk("RPC: %s: entering\n", __func__);
607 if (ia->ri_bind_mem != NULL) {
608 rc = ib_dereg_mr(ia->ri_bind_mem);
609 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
610 __func__, rc);
611 }
fee08caf
TT
612 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
613 if (ia->ri_id->qp)
614 rdma_destroy_qp(ia->ri_id);
615 rdma_destroy_id(ia->ri_id);
616 ia->ri_id = NULL;
617 }
c56c65fb
TT
618 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
619 rc = ib_dealloc_pd(ia->ri_pd);
620 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
621 __func__, rc);
622 }
c56c65fb
TT
623}
624
625/*
626 * Create unconnected endpoint.
627 */
628int
629rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
630 struct rpcrdma_create_data_internal *cdata)
631{
632 struct ib_device_attr devattr;
5d40a8a5 633 int rc, err;
c56c65fb
TT
634
635 rc = ib_query_device(ia->ri_id->device, &devattr);
636 if (rc) {
637 dprintk("RPC: %s: ib_query_device failed %d\n",
638 __func__, rc);
639 return rc;
640 }
641
642 /* check provider's send/recv wr limits */
643 if (cdata->max_requests > devattr.max_qp_wr)
644 cdata->max_requests = devattr.max_qp_wr;
645
646 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
647 ep->rep_attr.qp_context = ep;
648 /* send_cq and recv_cq initialized below */
649 ep->rep_attr.srq = NULL;
650 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
651 switch (ia->ri_memreg_strategy) {
3197d309 652 case RPCRDMA_FRMR:
15cdc644
TT
653 /* Add room for frmr register and invalidate WRs.
654 * 1. FRMR reg WR for head
655 * 2. FRMR invalidate WR for head
656 * 3. FRMR reg WR for pagelist
657 * 4. FRMR invalidate WR for pagelist
658 * 5. FRMR reg WR for tail
659 * 6. FRMR invalidate WR for tail
660 * 7. The RDMA_SEND WR
661 */
662 ep->rep_attr.cap.max_send_wr *= 7;
663 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
664 cdata->max_requests = devattr.max_qp_wr / 7;
665 if (!cdata->max_requests)
666 return -EINVAL;
667 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
668 }
3197d309 669 break;
c56c65fb
TT
670 case RPCRDMA_MEMWINDOWS_ASYNC:
671 case RPCRDMA_MEMWINDOWS:
672 /* Add room for mw_binds+unbinds - overkill! */
673 ep->rep_attr.cap.max_send_wr++;
674 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
675 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
676 return -EINVAL;
677 break;
678 default:
679 break;
680 }
681 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
682 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
683 ep->rep_attr.cap.max_recv_sge = 1;
684 ep->rep_attr.cap.max_inline_data = 0;
685 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
686 ep->rep_attr.qp_type = IB_QPT_RC;
687 ep->rep_attr.port_num = ~0;
688
689 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
690 "iovs: send %d recv %d\n",
691 __func__,
692 ep->rep_attr.cap.max_send_wr,
693 ep->rep_attr.cap.max_recv_wr,
694 ep->rep_attr.cap.max_send_sge,
695 ep->rep_attr.cap.max_recv_sge);
696
697 /* set trigger for requesting send completion */
698 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
699 switch (ia->ri_memreg_strategy) {
700 case RPCRDMA_MEMWINDOWS_ASYNC:
701 case RPCRDMA_MEMWINDOWS:
702 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
703 break;
704 default:
705 break;
706 }
707 if (ep->rep_cqinit <= 2)
708 ep->rep_cqinit = 0;
709 INIT_CQCOUNT(ep);
710 ep->rep_ia = ia;
711 init_waitqueue_head(&ep->rep_connect_wait);
712
713 /*
714 * Create a single cq for receive dto and mw_bind (only ever
715 * care about unbind, really). Send completions are suppressed.
716 * Use single threaded tasklet upcalls to maintain ordering.
717 */
718 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
719 rpcrdma_cq_async_error_upcall, NULL,
720 ep->rep_attr.cap.max_recv_wr +
721 ep->rep_attr.cap.max_send_wr + 1, 0);
722 if (IS_ERR(ep->rep_cq)) {
723 rc = PTR_ERR(ep->rep_cq);
724 dprintk("RPC: %s: ib_create_cq failed: %i\n",
725 __func__, rc);
726 goto out1;
727 }
728
729 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
730 if (rc) {
731 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
732 __func__, rc);
733 goto out2;
734 }
735
736 ep->rep_attr.send_cq = ep->rep_cq;
737 ep->rep_attr.recv_cq = ep->rep_cq;
738
739 /* Initialize cma parameters */
740
741 /* RPC/RDMA does not use private data */
742 ep->rep_remote_cma.private_data = NULL;
743 ep->rep_remote_cma.private_data_len = 0;
744
745 /* Client offers RDMA Read but does not initiate */
b334eaab
TT
746 ep->rep_remote_cma.initiator_depth = 0;
747 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
c56c65fb 748 ep->rep_remote_cma.responder_resources = 0;
b334eaab
TT
749 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
750 ep->rep_remote_cma.responder_resources = 32;
751 else
c56c65fb 752 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
c56c65fb
TT
753
754 ep->rep_remote_cma.retry_count = 7;
755 ep->rep_remote_cma.flow_control = 0;
756 ep->rep_remote_cma.rnr_retry_count = 0;
757
758 return 0;
759
760out2:
5d40a8a5
CL
761 err = ib_destroy_cq(ep->rep_cq);
762 if (err)
763 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
764 __func__, err);
c56c65fb
TT
765out1:
766 return rc;
767}
768
769/*
770 * rpcrdma_ep_destroy
771 *
772 * Disconnect and destroy endpoint. After this, the only
773 * valid operations on the ep are to free it (if dynamically
774 * allocated) or re-create it.
775 *
776 * The caller's error handling must be sure to not leak the endpoint
777 * if this function fails.
778 */
779int
780rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
781{
782 int rc;
783
784 dprintk("RPC: %s: entering, connected is %d\n",
785 __func__, ep->rep_connected);
786
787 if (ia->ri_id->qp) {
788 rc = rpcrdma_ep_disconnect(ep, ia);
789 if (rc)
790 dprintk("RPC: %s: rpcrdma_ep_disconnect"
791 " returned %i\n", __func__, rc);
fee08caf
TT
792 rdma_destroy_qp(ia->ri_id);
793 ia->ri_id->qp = NULL;
c56c65fb
TT
794 }
795
c56c65fb
TT
796 /* padding - could be done in rpcrdma_buffer_destroy... */
797 if (ep->rep_pad_mr) {
798 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
799 ep->rep_pad_mr = NULL;
800 }
801
c56c65fb
TT
802 rpcrdma_clean_cq(ep->rep_cq);
803 rc = ib_destroy_cq(ep->rep_cq);
804 if (rc)
805 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
806 __func__, rc);
807
808 return rc;
809}
810
811/*
812 * Connect unconnected endpoint.
813 */
814int
815rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
816{
817 struct rdma_cm_id *id;
818 int rc = 0;
819 int retry_count = 0;
c56c65fb 820
c055551e 821 if (ep->rep_connected != 0) {
c56c65fb
TT
822 struct rpcrdma_xprt *xprt;
823retry:
824 rc = rpcrdma_ep_disconnect(ep, ia);
825 if (rc && rc != -ENOTCONN)
826 dprintk("RPC: %s: rpcrdma_ep_disconnect"
827 " status %i\n", __func__, rc);
828 rpcrdma_clean_cq(ep->rep_cq);
829
830 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
831 id = rpcrdma_create_id(xprt, ia,
832 (struct sockaddr *)&xprt->rx_data.addr);
833 if (IS_ERR(id)) {
834 rc = PTR_ERR(id);
835 goto out;
836 }
837 /* TEMP TEMP TEMP - fail if new device:
838 * Deregister/remarshal *all* requests!
839 * Close and recreate adapter, pd, etc!
840 * Re-determine all attributes still sane!
841 * More stuff I haven't thought of!
842 * Rrrgh!
843 */
844 if (ia->ri_id->device != id->device) {
845 printk("RPC: %s: can't reconnect on "
846 "different device!\n", __func__);
847 rdma_destroy_id(id);
848 rc = -ENETDOWN;
849 goto out;
850 }
851 /* END TEMP */
1a954051 852 rdma_destroy_qp(ia->ri_id);
c56c65fb
TT
853 rdma_destroy_id(ia->ri_id);
854 ia->ri_id = id;
855 }
856
857 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
858 if (rc) {
859 dprintk("RPC: %s: rdma_create_qp failed %i\n",
860 __func__, rc);
861 goto out;
862 }
863
864/* XXX Tavor device performs badly with 2K MTU! */
865if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
866 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
867 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
868 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
869 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
870 struct ib_qp_attr attr = {
871 .path_mtu = IB_MTU_1024
872 };
873 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
874 }
875}
876
c56c65fb
TT
877 ep->rep_connected = 0;
878
879 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
880 if (rc) {
881 dprintk("RPC: %s: rdma_connect() failed with %i\n",
882 __func__, rc);
883 goto out;
884 }
885
c56c65fb
TT
886 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
887
888 /*
889 * Check state. A non-peer reject indicates no listener
890 * (ECONNREFUSED), which may be a transient state. All
891 * others indicate a transport condition which has already
892 * undergone a best-effort.
893 */
f64f9e71
JP
894 if (ep->rep_connected == -ECONNREFUSED &&
895 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
896 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
897 goto retry;
898 }
899 if (ep->rep_connected <= 0) {
900 /* Sometimes, the only way to reliably connect to remote
901 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
902 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
903 (ep->rep_remote_cma.responder_resources == 0 ||
904 ep->rep_remote_cma.initiator_depth !=
905 ep->rep_remote_cma.responder_resources)) {
906 if (ep->rep_remote_cma.responder_resources == 0)
907 ep->rep_remote_cma.responder_resources = 1;
908 ep->rep_remote_cma.initiator_depth =
909 ep->rep_remote_cma.responder_resources;
c56c65fb 910 goto retry;
b334eaab 911 }
c56c65fb
TT
912 rc = ep->rep_connected;
913 } else {
914 dprintk("RPC: %s: connected\n", __func__);
915 }
916
917out:
918 if (rc)
919 ep->rep_connected = rc;
920 return rc;
921}
922
923/*
924 * rpcrdma_ep_disconnect
925 *
926 * This is separate from destroy to facilitate the ability
927 * to reconnect without recreating the endpoint.
928 *
929 * This call is not reentrant, and must not be made in parallel
930 * on the same endpoint.
931 */
932int
933rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
934{
935 int rc;
936
937 rpcrdma_clean_cq(ep->rep_cq);
938 rc = rdma_disconnect(ia->ri_id);
939 if (!rc) {
940 /* returns without wait if not connected */
941 wait_event_interruptible(ep->rep_connect_wait,
942 ep->rep_connected != 1);
943 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
944 (ep->rep_connected == 1) ? "still " : "dis");
945 } else {
946 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
947 ep->rep_connected = rc;
948 }
949 return rc;
950}
951
952/*
953 * Initialize buffer memory
954 */
955int
956rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
957 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
958{
959 char *p;
960 size_t len;
961 int i, rc;
8d4ba034 962 struct rpcrdma_mw *r;
c56c65fb
TT
963
964 buf->rb_max_requests = cdata->max_requests;
965 spin_lock_init(&buf->rb_lock);
966 atomic_set(&buf->rb_credits, 1);
967
968 /* Need to allocate:
969 * 1. arrays for send and recv pointers
970 * 2. arrays of struct rpcrdma_req to fill in pointers
971 * 3. array of struct rpcrdma_rep for replies
972 * 4. padding, if any
3197d309 973 * 5. mw's, fmr's or frmr's, if any
c56c65fb
TT
974 * Send/recv buffers in req/rep need to be registered
975 */
976
977 len = buf->rb_max_requests *
978 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
979 len += cdata->padding;
980 switch (ia->ri_memreg_strategy) {
3197d309
TT
981 case RPCRDMA_FRMR:
982 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
983 sizeof(struct rpcrdma_mw);
984 break;
c56c65fb
TT
985 case RPCRDMA_MTHCAFMR:
986 /* TBD we are perhaps overallocating here */
987 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
988 sizeof(struct rpcrdma_mw);
989 break;
990 case RPCRDMA_MEMWINDOWS_ASYNC:
991 case RPCRDMA_MEMWINDOWS:
992 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
993 sizeof(struct rpcrdma_mw);
994 break;
995 default:
996 break;
997 }
998
999 /* allocate 1, 4 and 5 in one shot */
1000 p = kzalloc(len, GFP_KERNEL);
1001 if (p == NULL) {
1002 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1003 __func__, len);
1004 rc = -ENOMEM;
1005 goto out;
1006 }
1007 buf->rb_pool = p; /* for freeing it later */
1008
1009 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1010 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1011 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1012 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1013
1014 /*
1015 * Register the zeroed pad buffer, if any.
1016 */
1017 if (cdata->padding) {
1018 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1019 &ep->rep_pad_mr, &ep->rep_pad);
1020 if (rc)
1021 goto out;
1022 }
1023 p += cdata->padding;
1024
1025 /*
1026 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1027 * We "cycle" the mw's in order to minimize rkey reuse,
1028 * and also reduce unbind-to-bind collision.
1029 */
1030 INIT_LIST_HEAD(&buf->rb_mws);
8d4ba034 1031 r = (struct rpcrdma_mw *)p;
c56c65fb 1032 switch (ia->ri_memreg_strategy) {
3197d309
TT
1033 case RPCRDMA_FRMR:
1034 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1035 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1036 RPCRDMA_MAX_SEGS);
1037 if (IS_ERR(r->r.frmr.fr_mr)) {
1038 rc = PTR_ERR(r->r.frmr.fr_mr);
1039 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1040 " failed %i\n", __func__, rc);
1041 goto out;
1042 }
1043 r->r.frmr.fr_pgl =
1044 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1045 RPCRDMA_MAX_SEGS);
1046 if (IS_ERR(r->r.frmr.fr_pgl)) {
1047 rc = PTR_ERR(r->r.frmr.fr_pgl);
1048 dprintk("RPC: %s: "
1049 "ib_alloc_fast_reg_page_list "
1050 "failed %i\n", __func__, rc);
1051 goto out;
1052 }
1053 list_add(&r->mw_list, &buf->rb_mws);
1054 ++r;
1055 }
1056 break;
c56c65fb 1057 case RPCRDMA_MTHCAFMR:
c56c65fb
TT
1058 /* TBD we are perhaps overallocating here */
1059 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
8d4ba034
TT
1060 static struct ib_fmr_attr fa =
1061 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
c56c65fb
TT
1062 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1063 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1064 &fa);
1065 if (IS_ERR(r->r.fmr)) {
1066 rc = PTR_ERR(r->r.fmr);
1067 dprintk("RPC: %s: ib_alloc_fmr"
1068 " failed %i\n", __func__, rc);
1069 goto out;
1070 }
1071 list_add(&r->mw_list, &buf->rb_mws);
1072 ++r;
1073 }
c56c65fb
TT
1074 break;
1075 case RPCRDMA_MEMWINDOWS_ASYNC:
1076 case RPCRDMA_MEMWINDOWS:
c56c65fb
TT
1077 /* Allocate one extra request's worth, for full cycling */
1078 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1079 r->r.mw = ib_alloc_mw(ia->ri_pd);
1080 if (IS_ERR(r->r.mw)) {
1081 rc = PTR_ERR(r->r.mw);
1082 dprintk("RPC: %s: ib_alloc_mw"
1083 " failed %i\n", __func__, rc);
1084 goto out;
1085 }
1086 list_add(&r->mw_list, &buf->rb_mws);
1087 ++r;
1088 }
c56c65fb
TT
1089 break;
1090 default:
1091 break;
1092 }
1093
1094 /*
1095 * Allocate/init the request/reply buffers. Doing this
1096 * using kmalloc for now -- one for each buf.
1097 */
1098 for (i = 0; i < buf->rb_max_requests; i++) {
1099 struct rpcrdma_req *req;
1100 struct rpcrdma_rep *rep;
1101
1102 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1103 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1104 /* Typical ~2400b, so rounding up saves work later */
1105 if (len < 4096)
1106 len = 4096;
1107 req = kmalloc(len, GFP_KERNEL);
1108 if (req == NULL) {
1109 dprintk("RPC: %s: request buffer %d alloc"
1110 " failed\n", __func__, i);
1111 rc = -ENOMEM;
1112 goto out;
1113 }
1114 memset(req, 0, sizeof(struct rpcrdma_req));
1115 buf->rb_send_bufs[i] = req;
1116 buf->rb_send_bufs[i]->rl_buffer = buf;
1117
1118 rc = rpcrdma_register_internal(ia, req->rl_base,
1119 len - offsetof(struct rpcrdma_req, rl_base),
1120 &buf->rb_send_bufs[i]->rl_handle,
1121 &buf->rb_send_bufs[i]->rl_iov);
1122 if (rc)
1123 goto out;
1124
1125 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1126
1127 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1128 rep = kmalloc(len, GFP_KERNEL);
1129 if (rep == NULL) {
1130 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1131 __func__, i);
1132 rc = -ENOMEM;
1133 goto out;
1134 }
1135 memset(rep, 0, sizeof(struct rpcrdma_rep));
1136 buf->rb_recv_bufs[i] = rep;
1137 buf->rb_recv_bufs[i]->rr_buffer = buf;
1138 init_waitqueue_head(&rep->rr_unbind);
1139
1140 rc = rpcrdma_register_internal(ia, rep->rr_base,
1141 len - offsetof(struct rpcrdma_rep, rr_base),
1142 &buf->rb_recv_bufs[i]->rr_handle,
1143 &buf->rb_recv_bufs[i]->rr_iov);
1144 if (rc)
1145 goto out;
1146
1147 }
1148 dprintk("RPC: %s: max_requests %d\n",
1149 __func__, buf->rb_max_requests);
1150 /* done */
1151 return 0;
1152out:
1153 rpcrdma_buffer_destroy(buf);
1154 return rc;
1155}
1156
1157/*
1158 * Unregister and destroy buffer memory. Need to deal with
1159 * partial initialization, so it's callable from failed create.
1160 * Must be called before destroying endpoint, as registrations
1161 * reference it.
1162 */
1163void
1164rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1165{
1166 int rc, i;
1167 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
8d4ba034 1168 struct rpcrdma_mw *r;
c56c65fb
TT
1169
1170 /* clean up in reverse order from create
1171 * 1. recv mr memory (mr free, then kfree)
1172 * 1a. bind mw memory
1173 * 2. send mr memory (mr free, then kfree)
1174 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1175 * 4. arrays
1176 */
1177 dprintk("RPC: %s: entering\n", __func__);
1178
1179 for (i = 0; i < buf->rb_max_requests; i++) {
1180 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1181 rpcrdma_deregister_internal(ia,
1182 buf->rb_recv_bufs[i]->rr_handle,
1183 &buf->rb_recv_bufs[i]->rr_iov);
1184 kfree(buf->rb_recv_bufs[i]);
1185 }
1186 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1187 while (!list_empty(&buf->rb_mws)) {
c56c65fb
TT
1188 r = list_entry(buf->rb_mws.next,
1189 struct rpcrdma_mw, mw_list);
1190 list_del(&r->mw_list);
1191 switch (ia->ri_memreg_strategy) {
3197d309
TT
1192 case RPCRDMA_FRMR:
1193 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1194 if (rc)
1195 dprintk("RPC: %s:"
1196 " ib_dereg_mr"
1197 " failed %i\n",
1198 __func__, rc);
1199 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1200 break;
c56c65fb
TT
1201 case RPCRDMA_MTHCAFMR:
1202 rc = ib_dealloc_fmr(r->r.fmr);
1203 if (rc)
1204 dprintk("RPC: %s:"
1205 " ib_dealloc_fmr"
1206 " failed %i\n",
1207 __func__, rc);
1208 break;
1209 case RPCRDMA_MEMWINDOWS_ASYNC:
1210 case RPCRDMA_MEMWINDOWS:
1211 rc = ib_dealloc_mw(r->r.mw);
1212 if (rc)
1213 dprintk("RPC: %s:"
1214 " ib_dealloc_mw"
1215 " failed %i\n",
1216 __func__, rc);
1217 break;
1218 default:
1219 break;
1220 }
1221 }
1222 rpcrdma_deregister_internal(ia,
1223 buf->rb_send_bufs[i]->rl_handle,
1224 &buf->rb_send_bufs[i]->rl_iov);
1225 kfree(buf->rb_send_bufs[i]);
1226 }
1227 }
1228
1229 kfree(buf->rb_pool);
1230}
1231
1232/*
1233 * Get a set of request/reply buffers.
1234 *
1235 * Reply buffer (if needed) is attached to send buffer upon return.
1236 * Rule:
1237 * rb_send_index and rb_recv_index MUST always be pointing to the
1238 * *next* available buffer (non-NULL). They are incremented after
1239 * removing buffers, and decremented *before* returning them.
1240 */
1241struct rpcrdma_req *
1242rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1243{
1244 struct rpcrdma_req *req;
1245 unsigned long flags;
8d4ba034
TT
1246 int i;
1247 struct rpcrdma_mw *r;
c56c65fb
TT
1248
1249 spin_lock_irqsave(&buffers->rb_lock, flags);
1250 if (buffers->rb_send_index == buffers->rb_max_requests) {
1251 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1252 dprintk("RPC: %s: out of request buffers\n", __func__);
1253 return ((struct rpcrdma_req *)NULL);
1254 }
1255
1256 req = buffers->rb_send_bufs[buffers->rb_send_index];
1257 if (buffers->rb_send_index < buffers->rb_recv_index) {
1258 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1259 __func__,
1260 buffers->rb_recv_index - buffers->rb_send_index);
1261 req->rl_reply = NULL;
1262 } else {
1263 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1264 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1265 }
1266 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1267 if (!list_empty(&buffers->rb_mws)) {
8d4ba034 1268 i = RPCRDMA_MAX_SEGS - 1;
c56c65fb 1269 do {
c56c65fb
TT
1270 r = list_entry(buffers->rb_mws.next,
1271 struct rpcrdma_mw, mw_list);
1272 list_del(&r->mw_list);
1273 req->rl_segments[i].mr_chunk.rl_mw = r;
1274 } while (--i >= 0);
1275 }
1276 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1277 return req;
1278}
1279
1280/*
1281 * Put request/reply buffers back into pool.
1282 * Pre-decrement counter/array index.
1283 */
1284void
1285rpcrdma_buffer_put(struct rpcrdma_req *req)
1286{
1287 struct rpcrdma_buffer *buffers = req->rl_buffer;
1288 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1289 int i;
1290 unsigned long flags;
1291
1292 BUG_ON(req->rl_nchunks != 0);
1293 spin_lock_irqsave(&buffers->rb_lock, flags);
1294 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1295 req->rl_niovs = 0;
1296 if (req->rl_reply) {
1297 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1298 init_waitqueue_head(&req->rl_reply->rr_unbind);
1299 req->rl_reply->rr_func = NULL;
1300 req->rl_reply = NULL;
1301 }
1302 switch (ia->ri_memreg_strategy) {
3197d309 1303 case RPCRDMA_FRMR:
c56c65fb
TT
1304 case RPCRDMA_MTHCAFMR:
1305 case RPCRDMA_MEMWINDOWS_ASYNC:
1306 case RPCRDMA_MEMWINDOWS:
1307 /*
1308 * Cycle mw's back in reverse order, and "spin" them.
1309 * This delays and scrambles reuse as much as possible.
1310 */
1311 i = 1;
1312 do {
1313 struct rpcrdma_mw **mw;
1314 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1315 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1316 *mw = NULL;
1317 } while (++i < RPCRDMA_MAX_SEGS);
1318 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1319 &buffers->rb_mws);
1320 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1321 break;
1322 default:
1323 break;
1324 }
1325 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1326}
1327
1328/*
1329 * Recover reply buffers from pool.
1330 * This happens when recovering from error conditions.
1331 * Post-increment counter/array index.
1332 */
1333void
1334rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1335{
1336 struct rpcrdma_buffer *buffers = req->rl_buffer;
1337 unsigned long flags;
1338
1339 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1340 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1341 spin_lock_irqsave(&buffers->rb_lock, flags);
1342 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1343 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1344 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1345 }
1346 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1347}
1348
1349/*
1350 * Put reply buffers back into pool when not attached to
1351 * request. This happens in error conditions, and when
1352 * aborting unbinds. Pre-decrement counter/array index.
1353 */
1354void
1355rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1356{
1357 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1358 unsigned long flags;
1359
1360 rep->rr_func = NULL;
1361 spin_lock_irqsave(&buffers->rb_lock, flags);
1362 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1363 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1364}
1365
1366/*
1367 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1368 */
1369
1370int
1371rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1372 struct ib_mr **mrp, struct ib_sge *iov)
1373{
1374 struct ib_phys_buf ipb;
1375 struct ib_mr *mr;
1376 int rc;
1377
1378 /*
1379 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1380 */
1381 iov->addr = ib_dma_map_single(ia->ri_id->device,
1382 va, len, DMA_BIDIRECTIONAL);
1383 iov->length = len;
1384
bd7ed1d1
TT
1385 if (ia->ri_have_dma_lkey) {
1386 *mrp = NULL;
1387 iov->lkey = ia->ri_dma_lkey;
1388 return 0;
1389 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1390 *mrp = NULL;
1391 iov->lkey = ia->ri_bind_mem->lkey;
1392 return 0;
1393 }
1394
1395 ipb.addr = iov->addr;
1396 ipb.size = iov->length;
1397 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1398 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1399
1400 dprintk("RPC: %s: phys convert: 0x%llx "
1401 "registered 0x%llx length %d\n",
a56daeb7
AM
1402 __func__, (unsigned long long)ipb.addr,
1403 (unsigned long long)iov->addr, len);
c56c65fb
TT
1404
1405 if (IS_ERR(mr)) {
1406 *mrp = NULL;
1407 rc = PTR_ERR(mr);
1408 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1409 } else {
1410 *mrp = mr;
1411 iov->lkey = mr->lkey;
1412 rc = 0;
1413 }
1414
1415 return rc;
1416}
1417
1418int
1419rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1420 struct ib_mr *mr, struct ib_sge *iov)
1421{
1422 int rc;
1423
1424 ib_dma_unmap_single(ia->ri_id->device,
1425 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1426
1427 if (NULL == mr)
1428 return 0;
1429
1430 rc = ib_dereg_mr(mr);
1431 if (rc)
1432 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1433 return rc;
1434}
1435
1436/*
1437 * Wrappers for chunk registration, shared by read/write chunk code.
1438 */
1439
1440static void
1441rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1442{
1443 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1444 seg->mr_dmalen = seg->mr_len;
1445 if (seg->mr_page)
1446 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1447 seg->mr_page, offset_in_page(seg->mr_offset),
1448 seg->mr_dmalen, seg->mr_dir);
1449 else
1450 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1451 seg->mr_offset,
1452 seg->mr_dmalen, seg->mr_dir);
1453}
1454
1455static void
1456rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1457{
1458 if (seg->mr_page)
1459 ib_dma_unmap_page(ia->ri_id->device,
1460 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1461 else
1462 ib_dma_unmap_single(ia->ri_id->device,
1463 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1464}
1465
3197d309
TT
1466static int
1467rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1468 int *nsegs, int writing, struct rpcrdma_ia *ia,
1469 struct rpcrdma_xprt *r_xprt)
1470{
1471 struct rpcrdma_mr_seg *seg1 = seg;
1472 struct ib_send_wr frmr_wr, *bad_wr;
1473 u8 key;
1474 int len, pageoff;
1475 int i, rc;
1476
1477 pageoff = offset_in_page(seg1->mr_offset);
1478 seg1->mr_offset -= pageoff; /* start of page */
1479 seg1->mr_len += pageoff;
1480 len = -pageoff;
1481 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1482 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1483 for (i = 0; i < *nsegs;) {
1484 rpcrdma_map_one(ia, seg, writing);
1485 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1486 len += seg->mr_len;
1487 ++seg;
1488 ++i;
1489 /* Check for holes */
1490 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1491 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1492 break;
1493 }
1494 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1495 __func__, seg1->mr_chunk.rl_mw, i);
1496
1497 /* Bump the key */
1498 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1499 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1500
1501 /* Prepare FRMR WR */
1502 memset(&frmr_wr, 0, sizeof frmr_wr);
1503 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1504 frmr_wr.send_flags = 0; /* unsignaled */
7a8b80eb 1505 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
3197d309
TT
1506 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1507 frmr_wr.wr.fast_reg.page_list_len = i;
1508 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1509 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1510 frmr_wr.wr.fast_reg.access_flags = (writing ?
68743082
VP
1511 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1512 IB_ACCESS_REMOTE_READ);
3197d309
TT
1513 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1514 DECR_CQCOUNT(&r_xprt->rx_ep);
1515
1516 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1517
1518 if (rc) {
1519 dprintk("RPC: %s: failed ib_post_send for register,"
1520 " status %i\n", __func__, rc);
1521 while (i--)
1522 rpcrdma_unmap_one(ia, --seg);
1523 } else {
1524 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1525 seg1->mr_base = seg1->mr_dma + pageoff;
1526 seg1->mr_nsegs = i;
1527 seg1->mr_len = len;
1528 }
1529 *nsegs = i;
1530 return rc;
1531}
1532
1533static int
1534rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1535 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1536{
1537 struct rpcrdma_mr_seg *seg1 = seg;
1538 struct ib_send_wr invalidate_wr, *bad_wr;
1539 int rc;
1540
1541 while (seg1->mr_nsegs--)
1542 rpcrdma_unmap_one(ia, seg++);
1543
1544 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1545 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1546 invalidate_wr.send_flags = 0; /* unsignaled */
1547 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1548 DECR_CQCOUNT(&r_xprt->rx_ep);
1549
1550 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1551 if (rc)
1552 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1553 " status %i\n", __func__, rc);
1554 return rc;
1555}
1556
8d4ba034
TT
1557static int
1558rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1559 int *nsegs, int writing, struct rpcrdma_ia *ia)
1560{
1561 struct rpcrdma_mr_seg *seg1 = seg;
1562 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1563 int len, pageoff, i, rc;
1564
1565 pageoff = offset_in_page(seg1->mr_offset);
1566 seg1->mr_offset -= pageoff; /* start of page */
1567 seg1->mr_len += pageoff;
1568 len = -pageoff;
1569 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1570 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1571 for (i = 0; i < *nsegs;) {
1572 rpcrdma_map_one(ia, seg, writing);
1573 physaddrs[i] = seg->mr_dma;
1574 len += seg->mr_len;
1575 ++seg;
1576 ++i;
1577 /* Check for holes */
1578 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1579 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1580 break;
1581 }
1582 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1583 physaddrs, i, seg1->mr_dma);
1584 if (rc) {
1585 dprintk("RPC: %s: failed ib_map_phys_fmr "
1586 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1587 len, (unsigned long long)seg1->mr_dma,
1588 pageoff, i, rc);
1589 while (i--)
1590 rpcrdma_unmap_one(ia, --seg);
1591 } else {
1592 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1593 seg1->mr_base = seg1->mr_dma + pageoff;
1594 seg1->mr_nsegs = i;
1595 seg1->mr_len = len;
1596 }
1597 *nsegs = i;
1598 return rc;
1599}
1600
1601static int
1602rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1603 struct rpcrdma_ia *ia)
1604{
1605 struct rpcrdma_mr_seg *seg1 = seg;
1606 LIST_HEAD(l);
1607 int rc;
1608
1609 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1610 rc = ib_unmap_fmr(&l);
1611 while (seg1->mr_nsegs--)
1612 rpcrdma_unmap_one(ia, seg++);
1613 if (rc)
1614 dprintk("RPC: %s: failed ib_unmap_fmr,"
1615 " status %i\n", __func__, rc);
1616 return rc;
1617}
1618
1619static int
1620rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1621 int *nsegs, int writing, struct rpcrdma_ia *ia,
1622 struct rpcrdma_xprt *r_xprt)
1623{
1624 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1625 IB_ACCESS_REMOTE_READ);
1626 struct ib_mw_bind param;
1627 int rc;
1628
1629 *nsegs = 1;
1630 rpcrdma_map_one(ia, seg, writing);
1631 param.mr = ia->ri_bind_mem;
1632 param.wr_id = 0ULL; /* no send cookie */
1633 param.addr = seg->mr_dma;
1634 param.length = seg->mr_len;
1635 param.send_flags = 0;
1636 param.mw_access_flags = mem_priv;
1637
1638 DECR_CQCOUNT(&r_xprt->rx_ep);
1639 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1640 if (rc) {
1641 dprintk("RPC: %s: failed ib_bind_mw "
1642 "%u@0x%llx status %i\n",
1643 __func__, seg->mr_len,
1644 (unsigned long long)seg->mr_dma, rc);
1645 rpcrdma_unmap_one(ia, seg);
1646 } else {
1647 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1648 seg->mr_base = param.addr;
1649 seg->mr_nsegs = 1;
1650 }
1651 return rc;
1652}
1653
1654static int
1655rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1656 struct rpcrdma_ia *ia,
1657 struct rpcrdma_xprt *r_xprt, void **r)
1658{
1659 struct ib_mw_bind param;
1660 LIST_HEAD(l);
1661 int rc;
1662
1663 BUG_ON(seg->mr_nsegs != 1);
1664 param.mr = ia->ri_bind_mem;
1665 param.addr = 0ULL; /* unbind */
1666 param.length = 0;
1667 param.mw_access_flags = 0;
1668 if (*r) {
1669 param.wr_id = (u64) (unsigned long) *r;
1670 param.send_flags = IB_SEND_SIGNALED;
1671 INIT_CQCOUNT(&r_xprt->rx_ep);
1672 } else {
1673 param.wr_id = 0ULL;
1674 param.send_flags = 0;
1675 DECR_CQCOUNT(&r_xprt->rx_ep);
1676 }
1677 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1678 rpcrdma_unmap_one(ia, seg);
1679 if (rc)
1680 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1681 " status %i\n", __func__, rc);
1682 else
1683 *r = NULL; /* will upcall on completion */
1684 return rc;
1685}
1686
1687static int
1688rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1689 int *nsegs, int writing, struct rpcrdma_ia *ia)
1690{
1691 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1692 IB_ACCESS_REMOTE_READ);
1693 struct rpcrdma_mr_seg *seg1 = seg;
1694 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1695 int len, i, rc = 0;
1696
1697 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1698 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1699 for (len = 0, i = 0; i < *nsegs;) {
1700 rpcrdma_map_one(ia, seg, writing);
1701 ipb[i].addr = seg->mr_dma;
1702 ipb[i].size = seg->mr_len;
1703 len += seg->mr_len;
1704 ++seg;
1705 ++i;
1706 /* Check for holes */
1707 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1708 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1709 break;
1710 }
1711 seg1->mr_base = seg1->mr_dma;
1712 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1713 ipb, i, mem_priv, &seg1->mr_base);
1714 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1715 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1716 dprintk("RPC: %s: failed ib_reg_phys_mr "
1717 "%u@0x%llx (%d)... status %i\n",
1718 __func__, len,
1719 (unsigned long long)seg1->mr_dma, i, rc);
1720 while (i--)
1721 rpcrdma_unmap_one(ia, --seg);
1722 } else {
1723 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1724 seg1->mr_nsegs = i;
1725 seg1->mr_len = len;
1726 }
1727 *nsegs = i;
1728 return rc;
1729}
1730
1731static int
1732rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1733 struct rpcrdma_ia *ia)
1734{
1735 struct rpcrdma_mr_seg *seg1 = seg;
1736 int rc;
1737
1738 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1739 seg1->mr_chunk.rl_mr = NULL;
1740 while (seg1->mr_nsegs--)
1741 rpcrdma_unmap_one(ia, seg++);
1742 if (rc)
1743 dprintk("RPC: %s: failed ib_dereg_mr,"
1744 " status %i\n", __func__, rc);
1745 return rc;
1746}
1747
c56c65fb
TT
1748int
1749rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1750 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1751{
1752 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1753 int rc = 0;
1754
1755 switch (ia->ri_memreg_strategy) {
1756
1757#if RPCRDMA_PERSISTENT_REGISTRATION
1758 case RPCRDMA_ALLPHYSICAL:
1759 rpcrdma_map_one(ia, seg, writing);
1760 seg->mr_rkey = ia->ri_bind_mem->rkey;
1761 seg->mr_base = seg->mr_dma;
1762 seg->mr_nsegs = 1;
1763 nsegs = 1;
1764 break;
1765#endif
1766
3197d309
TT
1767 /* Registration using frmr registration */
1768 case RPCRDMA_FRMR:
1769 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1770 break;
1771
8d4ba034 1772 /* Registration using fmr memory registration */
c56c65fb 1773 case RPCRDMA_MTHCAFMR:
8d4ba034 1774 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
c56c65fb
TT
1775 break;
1776
1777 /* Registration using memory windows */
1778 case RPCRDMA_MEMWINDOWS_ASYNC:
1779 case RPCRDMA_MEMWINDOWS:
8d4ba034 1780 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
c56c65fb
TT
1781 break;
1782
1783 /* Default registration each time */
1784 default:
8d4ba034 1785 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
c56c65fb
TT
1786 break;
1787 }
1788 if (rc)
1789 return -1;
1790
1791 return nsegs;
1792}
1793
1794int
1795rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1796 struct rpcrdma_xprt *r_xprt, void *r)
1797{
1798 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1799 int nsegs = seg->mr_nsegs, rc;
1800
1801 switch (ia->ri_memreg_strategy) {
1802
1803#if RPCRDMA_PERSISTENT_REGISTRATION
1804 case RPCRDMA_ALLPHYSICAL:
1805 BUG_ON(nsegs != 1);
1806 rpcrdma_unmap_one(ia, seg);
1807 rc = 0;
1808 break;
1809#endif
1810
3197d309
TT
1811 case RPCRDMA_FRMR:
1812 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1813 break;
1814
c56c65fb 1815 case RPCRDMA_MTHCAFMR:
8d4ba034 1816 rc = rpcrdma_deregister_fmr_external(seg, ia);
c56c65fb
TT
1817 break;
1818
1819 case RPCRDMA_MEMWINDOWS_ASYNC:
1820 case RPCRDMA_MEMWINDOWS:
8d4ba034 1821 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
c56c65fb
TT
1822 break;
1823
1824 default:
8d4ba034 1825 rc = rpcrdma_deregister_default_external(seg, ia);
c56c65fb
TT
1826 break;
1827 }
1828 if (r) {
1829 struct rpcrdma_rep *rep = r;
1830 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1831 rep->rr_func = NULL;
1832 func(rep); /* dereg done, callback now */
1833 }
1834 return nsegs;
1835}
1836
1837/*
1838 * Prepost any receive buffer, then post send.
1839 *
1840 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1841 */
1842int
1843rpcrdma_ep_post(struct rpcrdma_ia *ia,
1844 struct rpcrdma_ep *ep,
1845 struct rpcrdma_req *req)
1846{
1847 struct ib_send_wr send_wr, *send_wr_fail;
1848 struct rpcrdma_rep *rep = req->rl_reply;
1849 int rc;
1850
1851 if (rep) {
1852 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1853 if (rc)
1854 goto out;
1855 req->rl_reply = NULL;
1856 }
1857
1858 send_wr.next = NULL;
1859 send_wr.wr_id = 0ULL; /* no send cookie */
1860 send_wr.sg_list = req->rl_send_iov;
1861 send_wr.num_sge = req->rl_niovs;
1862 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
1863 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1864 ib_dma_sync_single_for_device(ia->ri_id->device,
1865 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1866 DMA_TO_DEVICE);
1867 ib_dma_sync_single_for_device(ia->ri_id->device,
1868 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1869 DMA_TO_DEVICE);
1870 ib_dma_sync_single_for_device(ia->ri_id->device,
1871 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1872 DMA_TO_DEVICE);
1873
1874 if (DECR_CQCOUNT(ep) > 0)
1875 send_wr.send_flags = 0;
1876 else { /* Provider must take a send completion every now and then */
1877 INIT_CQCOUNT(ep);
1878 send_wr.send_flags = IB_SEND_SIGNALED;
1879 }
1880
1881 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1882 if (rc)
1883 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1884 rc);
1885out:
1886 return rc;
1887}
1888
1889/*
1890 * (Re)post a receive buffer.
1891 */
1892int
1893rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1894 struct rpcrdma_ep *ep,
1895 struct rpcrdma_rep *rep)
1896{
1897 struct ib_recv_wr recv_wr, *recv_wr_fail;
1898 int rc;
1899
1900 recv_wr.next = NULL;
1901 recv_wr.wr_id = (u64) (unsigned long) rep;
1902 recv_wr.sg_list = &rep->rr_iov;
1903 recv_wr.num_sge = 1;
1904
1905 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1906 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1907
1908 DECR_CQCOUNT(ep);
1909 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1910
1911 if (rc)
1912 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1913 rc);
1914 return rc;
1915}