fmr_ops.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /*
  2. * Copyright (c) 2015 Oracle. All rights reserved.
  3. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  4. */
  5. /* Lightweight memory registration using Fast Memory Regions (FMR).
  6. * Referred to sometimes as MTHCAFMR mode.
  7. *
  8. * FMR uses synchronous memory registration and deregistration.
  9. * FMR registration is known to be fast, but FMR deregistration
  10. * can take tens of usecs to complete.
  11. */
  12. /* Normal operation
  13. *
  14. * A Memory Region is prepared for RDMA READ or WRITE using the
  15. * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
  16. * finished, the Memory Region is unmapped using the ib_unmap_fmr
  17. * verb (fmr_op_unmap).
  18. */
  19. #include "xprt_rdma.h"
  20. #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
  21. # define RPCDBG_FACILITY RPCDBG_TRANS
  22. #endif
  23. /* Maximum scatter/gather per FMR */
  24. #define RPCRDMA_MAX_FMR_SGES (64)
  25. /* Access mode of externally registered pages */
  26. enum {
  27. RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
  28. IB_ACCESS_REMOTE_READ,
  29. };
  30. bool
  31. fmr_is_supported(struct rpcrdma_ia *ia)
  32. {
  33. if (!ia->ri_device->alloc_fmr) {
  34. pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
  35. ia->ri_device->name);
  36. return false;
  37. }
  38. return true;
  39. }
  40. static int
  41. fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
  42. {
  43. static struct ib_fmr_attr fmr_attr = {
  44. .max_pages = RPCRDMA_MAX_FMR_SGES,
  45. .max_maps = 1,
  46. .page_shift = PAGE_SHIFT
  47. };
  48. mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
  49. sizeof(u64), GFP_KERNEL);
  50. if (!mw->fmr.fm_physaddrs)
  51. goto out_free;
  52. mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
  53. sizeof(*mw->mw_sg), GFP_KERNEL);
  54. if (!mw->mw_sg)
  55. goto out_free;
  56. sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
  57. mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
  58. &fmr_attr);
  59. if (IS_ERR(mw->fmr.fm_mr))
  60. goto out_fmr_err;
  61. return 0;
  62. out_fmr_err:
  63. dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
  64. PTR_ERR(mw->fmr.fm_mr));
  65. out_free:
  66. kfree(mw->mw_sg);
  67. kfree(mw->fmr.fm_physaddrs);
  68. return -ENOMEM;
  69. }
  70. static int
  71. __fmr_unmap(struct rpcrdma_mw *mw)
  72. {
  73. LIST_HEAD(l);
  74. int rc;
  75. list_add(&mw->fmr.fm_mr->list, &l);
  76. rc = ib_unmap_fmr(&l);
  77. list_del_init(&mw->fmr.fm_mr->list);
  78. return rc;
  79. }
  80. static void
  81. fmr_op_release_mr(struct rpcrdma_mw *r)
  82. {
  83. LIST_HEAD(unmap_list);
  84. int rc;
  85. /* Ensure MW is not on any rl_registered list */
  86. if (!list_empty(&r->mw_list))
  87. list_del(&r->mw_list);
  88. kfree(r->fmr.fm_physaddrs);
  89. kfree(r->mw_sg);
  90. /* In case this one was left mapped, try to unmap it
  91. * to prevent dealloc_fmr from failing with EBUSY
  92. */
  93. rc = __fmr_unmap(r);
  94. if (rc)
  95. pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
  96. r, rc);
  97. rc = ib_dealloc_fmr(r->fmr.fm_mr);
  98. if (rc)
  99. pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
  100. r, rc);
  101. kfree(r);
  102. }
  103. /* Reset of a single FMR.
  104. */
  105. static void
  106. fmr_op_recover_mr(struct rpcrdma_mw *mw)
  107. {
  108. struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
  109. int rc;
  110. /* ORDER: invalidate first */
  111. rc = __fmr_unmap(mw);
  112. /* ORDER: then DMA unmap */
  113. ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
  114. mw->mw_sg, mw->mw_nents, mw->mw_dir);
  115. if (rc)
  116. goto out_release;
  117. rpcrdma_put_mw(r_xprt, mw);
  118. r_xprt->rx_stats.mrs_recovered++;
  119. return;
  120. out_release:
  121. pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
  122. r_xprt->rx_stats.mrs_orphaned++;
  123. spin_lock(&r_xprt->rx_buf.rb_mwlock);
  124. list_del(&mw->mw_all);
  125. spin_unlock(&r_xprt->rx_buf.rb_mwlock);
  126. fmr_op_release_mr(mw);
  127. }
  128. static int
  129. fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
  130. struct rpcrdma_create_data_internal *cdata)
  131. {
  132. ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
  133. RPCRDMA_MAX_FMR_SGES);
  134. return 0;
  135. }
  136. /* FMR mode conveys up to 64 pages of payload per chunk segment.
  137. */
  138. static size_t
  139. fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
  140. {
  141. return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
  142. RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
  143. }
  144. /* Use the ib_map_phys_fmr() verb to register a memory region
  145. * for remote access via RDMA READ or RDMA WRITE.
  146. */
  147. static int
  148. fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
  149. int nsegs, bool writing, struct rpcrdma_mw **out)
  150. {
  151. struct rpcrdma_mr_seg *seg1 = seg;
  152. int len, pageoff, i, rc;
  153. struct rpcrdma_mw *mw;
  154. u64 *dma_pages;
  155. mw = rpcrdma_get_mw(r_xprt);
  156. if (!mw)
  157. return -ENOBUFS;
  158. pageoff = offset_in_page(seg1->mr_offset);
  159. seg1->mr_offset -= pageoff; /* start of page */
  160. seg1->mr_len += pageoff;
  161. len = -pageoff;
  162. if (nsegs > RPCRDMA_MAX_FMR_SGES)
  163. nsegs = RPCRDMA_MAX_FMR_SGES;
  164. for (i = 0; i < nsegs;) {
  165. if (seg->mr_page)
  166. sg_set_page(&mw->mw_sg[i],
  167. seg->mr_page,
  168. seg->mr_len,
  169. offset_in_page(seg->mr_offset));
  170. else
  171. sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
  172. seg->mr_len);
  173. len += seg->mr_len;
  174. ++seg;
  175. ++i;
  176. /* Check for holes */
  177. if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
  178. offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
  179. break;
  180. }
  181. mw->mw_nents = i;
  182. mw->mw_dir = rpcrdma_data_dir(writing);
  183. if (i == 0)
  184. goto out_dmamap_err;
  185. if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
  186. mw->mw_sg, mw->mw_nents, mw->mw_dir))
  187. goto out_dmamap_err;
  188. for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
  189. dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
  190. rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
  191. dma_pages[0]);
  192. if (rc)
  193. goto out_maperr;
  194. mw->mw_handle = mw->fmr.fm_mr->rkey;
  195. mw->mw_length = len;
  196. mw->mw_offset = dma_pages[0] + pageoff;
  197. *out = mw;
  198. return mw->mw_nents;
  199. out_dmamap_err:
  200. pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
  201. mw->mw_sg, mw->mw_nents);
  202. rpcrdma_defer_mr_recovery(mw);
  203. return -EIO;
  204. out_maperr:
  205. pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
  206. len, (unsigned long long)dma_pages[0],
  207. pageoff, mw->mw_nents, rc);
  208. rpcrdma_defer_mr_recovery(mw);
  209. return -EIO;
  210. }
  211. /* Invalidate all memory regions that were registered for "req".
  212. *
  213. * Sleeps until it is safe for the host CPU to access the
  214. * previously mapped memory regions.
  215. *
  216. * Caller ensures that req->rl_registered is not empty.
  217. */
  218. static void
  219. fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
  220. {
  221. struct rpcrdma_mw *mw, *tmp;
  222. LIST_HEAD(unmap_list);
  223. int rc;
  224. dprintk("RPC: %s: req %p\n", __func__, req);
  225. /* ORDER: Invalidate all of the req's MRs first
  226. *
  227. * ib_unmap_fmr() is slow, so use a single call instead
  228. * of one call per mapped FMR.
  229. */
  230. list_for_each_entry(mw, &req->rl_registered, mw_list)
  231. list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
  232. r_xprt->rx_stats.local_inv_needed++;
  233. rc = ib_unmap_fmr(&unmap_list);
  234. if (rc)
  235. goto out_reset;
  236. /* ORDER: Now DMA unmap all of the req's MRs, and return
  237. * them to the free MW list.
  238. */
  239. list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
  240. list_del_init(&mw->mw_list);
  241. list_del_init(&mw->fmr.fm_mr->list);
  242. ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
  243. mw->mw_sg, mw->mw_nents, mw->mw_dir);
  244. rpcrdma_put_mw(r_xprt, mw);
  245. }
  246. return;
  247. out_reset:
  248. pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
  249. list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
  250. list_del_init(&mw->fmr.fm_mr->list);
  251. fmr_op_recover_mr(mw);
  252. }
  253. }
  254. /* Use a slow, safe mechanism to invalidate all memory regions
  255. * that were registered for "req".
  256. */
  257. static void
  258. fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
  259. bool sync)
  260. {
  261. struct rpcrdma_mw *mw;
  262. while (!list_empty(&req->rl_registered)) {
  263. mw = list_first_entry(&req->rl_registered,
  264. struct rpcrdma_mw, mw_list);
  265. list_del_init(&mw->mw_list);
  266. if (sync)
  267. fmr_op_recover_mr(mw);
  268. else
  269. rpcrdma_defer_mr_recovery(mw);
  270. }
  271. }
  272. const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
  273. .ro_map = fmr_op_map,
  274. .ro_unmap_sync = fmr_op_unmap_sync,
  275. .ro_unmap_safe = fmr_op_unmap_safe,
  276. .ro_recover_mr = fmr_op_recover_mr,
  277. .ro_open = fmr_op_open,
  278. .ro_maxpages = fmr_op_maxpages,
  279. .ro_init_mr = fmr_op_init_mr,
  280. .ro_release_mr = fmr_op_release_mr,
  281. .ro_displayname = "fmr",
  282. .ro_send_w_inv_ok = 0,
  283. };