xdp_umem.c

// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048
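
/* Add a TX-capable AF_XDP socket to the umem's RCU-protected socket list. */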
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}
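
/* Remove a TX-bound socket from the umem's RCU-protected socket list. */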
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}
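
/* Query the driver, under RTNL, for a umem already bound to this queue:
 * returns 0 when none is set, non-zero when one is, or a negative errno
 * from the driver.
 */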
int xdp_umem_query(struct net_device *dev, u16 queue_id)
{
	struct netdev_bpf bpf;

	ASSERT_RTNL();

	memset(&bpf, 0, sizeof(bpf));
	bpf.command = XDP_QUERY_XSK_UMEM;
	bpf.xsk.queue_id = queue_id;

	if (!dev->netdev_ops->ndo_bpf)
		return 0;
	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
}
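
/* Bind the umem to a device queue, preferring zero-copy when the driver
 * implements ndo_bpf and ndo_xsk_async_xmit; otherwise fall back to copy
 * mode unless XDP_ZEROCOPY was forced.
 */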
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u32 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (force_copy)
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
		return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */

	bpf.command = XDP_QUERY_XSK_UMEM;

	rtnl_lock();
	err = xdp_umem_query(dev, queue_id);
	if (err) {
		err = err < 0 ? -EOPNOTSUPP : -EBUSY;
		goto err_rtnl_unlock;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_rtnl_unlock;

	rtnl_unlock();

	dev_hold(dev);
	umem->dev = dev;
	umem->queue_id = queue_id;
	umem->zc = true;
	return 0;

err_rtnl_unlock:
	rtnl_unlock();
	return force_zc ? err : 0; /* fail or fallback */
}
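
/* Detach the umem from its device queue and drop the device reference. */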
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->dev) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");

		dev_put(umem->dev);
		umem->dev = NULL;
	}
}
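
/* Release the pinned user pages backing the umem. */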
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}
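
/* Return the pinned pages to the owning user's locked-memory accounting. */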
static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}
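
/* Tear down the umem: detach from the device, destroy the fill and
 * completion queues, unpin and unaccount the pages, then free the umem.
 */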
static void xdp_umem_release(struct xdp_umem *umem)
{
	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xdp_umem_unpin_pages(umem);

	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
	kfree(umem);
}
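
/* Deferred release, run from the workqueue once the last reference is gone. */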
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}
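
/* Drop a reference; the final put schedules the release on a workqueue. */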
void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}
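
/* Pin the user memory region with get_user_pages() so the pages stay
 * resident; on a partial pin, release what was pinned and report -ENOMEM.
 */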
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}
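
/* Charge the pinned pages against the user's RLIMIT_MEMLOCK, unless the
 * caller has CAP_IPC_LOCK.
 */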
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}
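
/* Validate the registration request (chunk size, alignment, headroom),
 * then account, pin and map the user pages.
 */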
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->address = (unsigned long)addr;
	umem->props.chunk_mask = ~((u64)chunk_size - 1);
	umem->props.size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		return err;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_pin;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_account:
	xdp_umem_unaccount_pages(umem);
	return err;
}
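
/* Allocate a umem and register the user memory described by @mr. */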
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}
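
/* A umem is usable only once both the fill and completion rings exist. */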
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}