rdma.c

/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow up to a page of inline data to go with the SQE
 */
#define NVMET_RDMA_INLINE_DATA_SIZE	PAGE_SIZE
struct nvmet_rdma_cmd {
	struct ib_sge sge[2];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg;
	struct page *inline_page;
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
	NVMET_RDMA_IN_DEVICE_REMOVAL,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);
static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);

static struct nvmet_fabrics_ops nvmet_rdma_ops;

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.data_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.data_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}
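
/*
 * Grab a response context from the queue's preallocated pool.  Under
 * memory pressure the pool can be exhausted (e.g. while responses sit on
 * the wait lists), in which case we fall back to a dynamic allocation and
 * mark it so that nvmet_rdma_put_rsp() frees it instead of recycling it.
 */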
static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
				struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}
static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
{
	struct scatterlist *sg;
	int count;

	if (!sgl || !nents)
		return;

	for_each_sg(sgl, sg, nents, count)
		__free_page(sg_page(sg));
	kfree(sgl);
}

static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
		u32 length)
{
	struct scatterlist *sg;
	struct page *page;
	unsigned int nent;
	int i = 0;

	nent = DIV_ROUND_UP(length, PAGE_SIZE);
	sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
	if (!sg)
		goto out;

	sg_init_table(sg, nent);

	while (length) {
		u32 page_len = min_t(u32, length, PAGE_SIZE);

		page = alloc_page(GFP_KERNEL);
		if (!page)
			goto out_free_pages;

		sg_set_page(&sg[i], page, page_len, 0);
		length -= page_len;
		i++;
	}
	*sgl = sg;
	*nents = nent;
	return 0;

out_free_pages:
	while (i > 0) {
		i--;
		__free_page(sg_page(&sg[i]));
	}
	kfree(sg);
out:
	return NVME_SC_INTERNAL;
}
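
/*
 * Set up one RDMA RECV command buffer: SGE 0 always covers the 64-byte
 * NVMe command capsule; for I/O queues SGE 1 additionally maps one page
 * for inline write data (NVMET_RDMA_INLINE_DATA_SIZE).
 */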
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin) {
		c->inline_page = alloc_pages(GFP_KERNEL,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
		if (!c->inline_page)
			goto out_unmap_cmd;
		c->sge[1].addr = ib_dma_map_page(ndev->device,
				c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
				DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
			goto out_free_inline_page;
		c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
		c->sge[1].lkey = ndev->pd->local_dma_lkey;
	}

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : 2;

	return 0;

out_free_inline_page:
	if (!admin) {
		__free_pages(c->inline_page,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
	}
out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin) {
		ib_dma_unmap_page(ndev->device, c->sge[1].addr,
				NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
		__free_pages(c->inline_page,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
	}
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}
static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	struct ib_recv_wr *bad_wr;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
	return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
}
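
/*
 * Commands that could not be started because the send queue had no free
 * work requests sit on rsp_wr_wait_list.  Retry them in order once
 * capacity is returned, stopping at the first one that still does not fit.
 */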
static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != &rsp->cmd->inline_sg)
		nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}
static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * The controller is not set up yet (e.g. an admin connect
		 * error), so just disconnect and clean up the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr, *bad_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	rsp->req.execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	sg_init_table(&rsp->cmd->inline_sg, 1);
	sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
	rsp->req.sg = &rsp->cmd->inline_sg;
	rsp->req.sg_cnt = 1;
}
static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	return 0;
}
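
/*
 * Keyed SGL: allocate a local scatterlist for the transfer length and
 * build an rdma_rw context against the host's (addr, rkey).  The number
 * of work requests the context needs is added to n_rdma so that
 * nvmet_rdma_execute_command() can account for them on the send queue.
 */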
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 len = get_unaligned_le24(sgl->length);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;
	u16 status;

	/* no data command? */
	if (!len)
		return 0;

	status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
			len);
	if (status)
		return status;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		return NVME_SC_INTERNAL;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}
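
/*
 * Reserve one send WR for the response plus n_rdma WRs for any RDMA
 * READ/WRITE work.  If the send queue cannot take them right now, give
 * the credits back and return false so the caller parks the command on
 * rsp_wr_wait_list.
 */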
static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		rsp->req.execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}
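
/*
 * RECV completion: a new command capsule arrived.  Pair it with a
 * response context and either handle it immediately (queue is live),
 * defer it until the queue finishes connecting, or drop it if the
 * queue is going away.
 */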
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}
static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */
	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 2;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++)
		nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);

	return 0;

out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_rdma_device *ndev;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}
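
/*
 * Create the completion queue and RC queue pair for this nvmet queue.
 * The CQ is sized for RECV + RDMA READ/WRITE + SEND completions; the
 * send and receive queues each reserve one extra slot for the drain WR.
 */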
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 2;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
		}
	}

out:
	return ret;

err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	ib_drain_qp(queue->cm_id->qp);
	rdma_destroy_qp(queue->cm_id);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_info("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);
	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);

	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct rdma_cm_id *cm_id = queue->cm_id;
	struct nvmet_rdma_device *dev = queue->dev;
	enum nvmet_rdma_queue_state state = queue->state;

	nvmet_rdma_free_queue(queue);

	if (state != NVMET_RDMA_IN_DEVICE_REMOVAL)
		rdma_destroy_id(cm_id);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}
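
/*
 * Validate the NVMe/RDMA CM connect private data and derive the queue
 * sizes from it: the host's submission queue maps to our receive queue
 * and its receive queue maps to our send queue.
 */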
static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}
static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret)
		goto release_queue;

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

release_queue:
	nvmet_rdma_free_queue(queue);
put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}
static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}
static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		/* fall through */
	case NVMET_RDMA_IN_DEVICE_REMOVAL:
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}
/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id: rdma_cm id, used for nvmet port
 * @queue: nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. Use an atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}
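
/*
 * Single rdma_cm callback for both the listener and per-queue cm_ids:
 * dispatch connection, establishment, teardown and device-removal
 * events to the handlers above.
 */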
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		/*
		 * We might end up here when we already freed the qp
		 * which means queue release sequence is in progress,
		 * so don't get in the way...
		 */
		if (queue)
			nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}
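
/*
 * Bring up a listener for an nvmet port: translate the configured
 * traddr/trsvcid into a socket address, create an rdma_cm id, bind it
 * and start listening for NVMe/RDMA connect requests.
 */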
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.sqe_inline_size	= NVMET_RDMA_INLINE_DATA_SIZE,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
};
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;

	/* Device is being removed, delete all queues using this device */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	struct nvmet_rdma_queue *queue;

	nvmet_unregister_transport(&nvmet_rdma_ops);

	flush_scheduled_work();

	mutex_lock(&nvmet_rdma_queue_mutex);
	while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
			struct nvmet_rdma_queue, queue_list))) {
		list_del_init(&queue->queue_list);

		mutex_unlock(&nvmet_rdma_queue_mutex);
		__nvmet_rdma_queue_disconnect(queue);
		mutex_lock(&nvmet_rdma_queue_mutex);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
	ib_unregister_client(&nvmet_rdma_ib_client);
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */