user_sdma.c

/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req,
			       unsigned maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

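/*
 * Called by the SDMA engine when a txreq cannot make progress: park the
 * packet queue on the engine's dmawait list and report -EBUSY, or return
 * -EAGAIN if the engine has since advanced and the send should be retried.
 */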
static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;

	write_seqlock(&dev->iowait_lock);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list))
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	write_sequnlock(&dev->iowait_lock);
	return -EBUSY;
eagain:
	write_sequnlock(&dev->iowait_lock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

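/*
 * Allocate and initialize the per-(sub)context packet queue and completion
 * queue used for user SDMA, including the txreq slab cache and the MMU
 * rb-tree handler that caches pinned user pages.
 */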
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

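/*
 * Tear down the packet and completion queues for a file descriptor: detach
 * the packet queue under the SRCU lock, wait for all in-flight requests to
 * drain, then free the associated caches and memory.
 */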
int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}

	return 0;
}

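/*
 * Map a DLID to a small selector value (assigned on first use of the hash
 * bucket and then reused) that is later combined with the context numbers
 * and fed to sdma_select_user_engine().
 */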
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY))
		goto free_req;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

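/*
 * Map the next chunk of the current user iovec into the txreq: at most one
 * page (and at most fragsize bytes) is added per call, and the queued/sent
 * counters and iovec offset are advanced accordingly.
 */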
static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

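/*
 * Build and submit up to @maxpkts packets for @req: allocate a txreq per
 * packet, attach the (possibly AHG-modified) header and payload pages, and
 * hand the accumulated list to the SDMA engine via sdma_send_txlist().
 */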
static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
{
	int ret = 0, count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, RxDmaDataFifoRdUncErr is
			 * not reported. Instead, RHF.EccErr is set if the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

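/*
 * Pin the user pages backing @iovec that are not already held by @node,
 * evicting unused cache entries first if the pinned-page limit would be
 * exceeded. Returns the number of newly pinned pages or a negative errno.
 */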
static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					 (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

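/*
 * Look up @iovec in the MMU rb-tree cache. On an exact hit the cached node
 * is reused; otherwise the missing pages are pinned via pin_sdma_pages()
 * and the new or extended node is (re)inserted into the cache.
 */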
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

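/*
 * Build the full per-packet header for a non-AHG txreq: fix up the PBC/LRH
 * lengths, advance BTH.PSN and KDETH.Offset, and refresh the TID fields for
 * expected receives before adding the header to the descriptor list.
 */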
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

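/*
 * Like set_txreq_header(), but for AHG packets: instead of rewriting the
 * whole header, build the list of AHG field updates (lengths, PSN, KDETH
 * offset/TID) and pass it to the hardware via sdma_txinit_ahg().
 */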
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
			ahg, idx, array_size, 7, 0, 16,
			((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
			 ((req->tidoffset >> omfactor)
			  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete? We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

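/*
 * Release all resources held by a request: clean up any unsubmitted txreqs,
 * drop (or unpin, if @unpin is true) the cached page nodes for each data
 * iovec, free the TID array and release the completion-ring slot.
 */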
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

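/*
 * Publish the final state of a request in the user-visible completion ring.
 * The error code is written before the status so user space never observes
 * an ERROR status with a stale errcode.
 */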
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}