/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		rpcrdma_reply_handler(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static void
rpcrdma_schedule_tasklet(struct list_head *sched_list)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_splice_tail(sched_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
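
/* QP and CQ asynchronous event handlers.
 *
 * If the event arrives while the transport is connected, the two
 * handlers below mark the endpoint failed (-EIO), call
 * rpcrdma_conn_func(), and wake anyone waiting on rep_connect_wait.
 */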
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
		if (wc->status != IB_WC_SUCCESS &&
		    wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("RPC: %s: SEND: %s\n",
			       __func__, ib_wc_status_msg(wc->status));
	} else {
		struct rpcrdma_mw *r;

		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		r->mw_sendcompletion(wc);
	}
}
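
/* Poll the send CQ in RPCRDMA_POLLSIZE batches until it is drained or
 * the polling budget (RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE iterations)
 * is exhausted. Each completion is handed to
 * rpcrdma_sendcq_process_wc().
 */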
static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *wcs;
	int budget, count, rc;

	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_send_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			return rc;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(wcs++);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	return 0;
}

/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_sendcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_sendcq_poll(cq, ep);
}
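
/* Handle one receive completion: on success record the received
 * length, sync the receive buffer for the CPU, and queue the rep on
 * sched_list for the reply tasklet. On any error other than a flush,
 * log it; failed reps are queued with rr_len set to ~0U.
 */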
static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rep->rr_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);
	prefetch(rdmab_to_msg(rep->rr_rdmabuf));

out_schedule:
	list_add_tail(&rep->rr_list, sched_list);
	return;
out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("RPC: %s: rep %p: %s\n",
		       __func__, rep, ib_wc_status_msg(wc->status));
	rep->rr_len = ~0U;
	goto out_schedule;
}

static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct list_head sched_list;
	struct ib_wc *wcs;
	int budget, count, rc;

	INIT_LIST_HEAD(&sched_list);
	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_recv_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			goto out_schedule;

		count = rc;
		while (count-- > 0)
			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	rc = 0;

out_schedule:
	rpcrdma_schedule_tasklet(&sched_list);
	return rc;
}

/*
 * Handle receive completions.
 *
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_recvcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_recvcq_poll(cq, ep);
}

static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
	struct ib_wc wc;
	LIST_HEAD(sched_list);

	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
		rpcrdma_recvcq_process_wc(&wc, &sched_list);
	if (!list_empty(&sched_list))
		rpcrdma_schedule_tasklet(&sched_list);
	while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
		rpcrdma_sendcq_process_wc(&wc);
}
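
/* Connection manager event handler.
 *
 * Address and route resolution results are recorded in ri_async_rc
 * and ri_done is completed so rpcrdma_create_id() can continue.
 * Connection events update ep->rep_connected, invoke
 * rpcrdma_conn_func(), and wake threads waiting on rep_connect_wait.
 */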
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			rdma_event_msg(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct ib_device_attr *devattr = &ia->ri_devattr;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_query_device(ia->ri_device, devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out3;
	}

	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
	}

	if (memreg == RPCRDMA_FRMR) {
		/* Requires both frmr reg and local dma lkey */
		if (((devattr->device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
		      (devattr->max_fast_reg_page_list_len == 0)) {
			dprintk("RPC: %s: FRMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_MTHCAFMR;
		}
	}
	if (memreg == RPCRDMA_MTHCAFMR) {
		if (!ia->ri_device->alloc_fmr) {
			dprintk("RPC: %s: MTHCAFMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_ALLPHYSICAL;
		}
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_FRMR:
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
		break;
	case RPCRDMA_ALLPHYSICAL:
		ia->ri_ops = &rpcrdma_physical_memreg_ops;
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			rc = -ENOMEM;
			goto out3;
		}
		break;
	default:
		printk(KERN_ERR "RPC: Unsupported memory "
				"registration mode: %d\n", memreg);
		rc = -ENOMEM;
		goto out3;
	}
	dprintk("RPC: %s: memory registration strategy is '%s'\n",
		__func__, ia->ri_ops->ro_displayname);

	rwlock_init(&ia->ri_qplock);
	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}

	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		WARN_ON(ib_dealloc_pd(ia->ri_pd));
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr *devattr = &ia->ri_devattr;
	struct ib_cq *sendcq, *recvcq;
	struct ib_cq_init_attr cq_attr = {};
	int rc, err;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr->max_qp_wr)
		cdata->max_requests = devattr->max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	if (cdata->padding) {
		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
						      GFP_KERNEL);
		if (IS_ERR(ep->rep_padbuf))
			return PTR_ERR(ep->rep_padbuf);
	} else
		ep->rep_padbuf = NULL;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
	else if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
	sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
			      rpcrdma_cq_async_error_upcall, ep, &cq_attr);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
	recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
			      rpcrdma_cq_async_error_upcall, ep, &cq_attr);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		ib_destroy_cq(recvcq);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (devattr->max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
						devattr->max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(sendcq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_free_regbuf(ia, ep->rep_padbuf);

	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	rpcrdma_clean_cq(ep->rep_attr.send_cq);
	rc = ib_destroy_cq(ep->rep_attr.send_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);
		rpcrdma_flush_cqs(ep);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		write_lock(&ia->ri_qplock);
		old = ia->ri_id;
		ia->ri_id = id;
		write_unlock(&ia->ri_qplock);

		rdma_destroy_qp(old);
		rdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_flush_cqs(ep);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
}
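
/* Allocate one request descriptor, and (in rpcrdma_create_rep below)
 * one reply descriptor whose receive buffer is registered at the
 * inline receive threshold (cdata->inline_rsize). Both return
 * ERR_PTR() on failure.
 */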
static struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	req->rl_buffer = &r_xprt->rx_buf;
	return req;
}

static struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
					       GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_device = ia->ri_device;
	rep->rr_rxprt = r_xprt;
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}
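
/* Build the transport's buffer pool: the send/recv pointer arrays,
 * one rpcrdma_req and one rpcrdma_rep for each of rb_max_requests
 * slots, plus whatever the memory registration ops set up in
 * ia->ri_ops->ro_init(). On failure the partially built pool is
 * torn down via rpcrdma_buffer_destroy().
 */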
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	char *p;
	size_t len;
	int i, rc;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 * Send/recv buffers in req/rep need to be registered
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));

	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	rc = ia->ri_ops->ro_init(r_xprt);
	if (rc)
		goto out;

	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		buf->rb_send_bufs[i] = req;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		buf->rb_recv_bufs[i] = rep;
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}
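
/* Release a rep or req and its registered buffers. Both helpers
 * tolerate a NULL argument so rpcrdma_buffer_destroy() can unwind a
 * partially constructed pool.
 */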
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
	if (!rep)
		return;

	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
	kfree(rep);
}

static void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	if (!req)
		return;

	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
	kfree(req);
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	int i;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  MWs
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs)
			rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
		if (buf->rb_send_bufs)
			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
	}

	ia->ri_ops->ro_destroy(buf);

	kfree(buf->rb_pool);
}
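
/* rpcrdma_get_mw() takes an MW off the transport's rb_mws free list,
 * returning NULL (and logging an error) if the list is empty.
 * rpcrdma_put_mw() returns an MW to that list. Both are serialized
 * by buf->rb_mwlock.
 */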
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws)) {
		mw = list_first_entry(&buf->rb_mws,
				      struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
	}
	spin_unlock(&buf->rb_mwlock);

	if (!mw)
		pr_err("RPC: %s: no MWs available\n", __func__);
	return mw;
}

void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_mwlock);
	list_add_tail(&mw->mw_list, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
}
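
/* Return a req, and its attached reply buffer if any, to the pool.
 * Called with buffers->rb_lock held.
 */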
static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	buf->rb_send_bufs[--buf->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
		req->rl_reply = NULL;
	}
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);

	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;

	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	rpcrdma_buffer_put_sendbuf(req, buffers);
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */
void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
	dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
		seg->mr_offset,
		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
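
/* DMA-map a kmalloc'ed buffer and fill in @iov for use as a local
 * SGE. Prefer the device's local DMA lkey, then the transport's
 * "bind" DMA MR; only as a last resort register a one-entry physical
 * MR, which is returned via @mrp so the caller can release it.
 */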
static int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_device,
			va, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		return -ENOMEM;

	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

static int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *rb;
	int rc;

	rc = -ENOMEM;
	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		goto out;

	rb->rg_size = size;
	rb->rg_owner = NULL;
	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
				       &rb->rg_mr, &rb->rg_iov);
	if (rc)
		goto out_free;

	return rb;

out_free:
	kfree(rb);
out:
	return ERR_PTR(rc);
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	if (rb) {
		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
		kfree(rb);
	}
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_device,
					      req->rl_send_iov[3].addr,
					      req->rl_send_iov[3].length,
					      DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[1].addr,
				      req->rl_send_iov[1].length,
				      DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[0].addr,
				      req->rl_send_iov[0].length,
				      DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rdmab_length(rep->rr_rdmabuf),
				   DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}

/* How many chunk list items fit within our inline buffers?
 */
unsigned int
rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	int bytes, segments;

	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
	bytes -= RPCRDMA_HDRLEN_MIN;
	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
		pr_warn("RPC: %s: inline threshold too small\n",
			__func__);
		return 0;
	}

	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
	dprintk("RPC: %s: max chunk list size = %d segments\n",
		__func__, segments);
	return segments;
}