object.c

/*
 * Copyright (c) 2017 Richard Braun.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * This implementation is based on the paper "A lockless pagecache in Linux"
 * by Nick Piggin. It allows looking up pages without contention on VM objects.
 */
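
/*
 * Concretely, in the code below, readers (vm_object_lookup) walk the page
 * radix tree inside an RCU read-side section and take a page reference with
 * vm_page_tryref, retrying if the reference cannot be acquired, while
 * writers (insertion, replacement and removal) serialize on the per-object
 * mutex. Object destruction itself is deferred through RCU.
 */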

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <kern/capability.h>
#include <kern/init.h>
#include <kern/kmem.h>
#include <kern/list.h>
#include <kern/mutex.h>
#include <kern/rcu.h>
#include <kern/unwind.h>
#include <kern/user.h>

#include <vm/object.h>
#include <vm/page.h>
#include <vm/rset.h>

#include <machine/page.h>

static struct kmem_cache vm_object_cache;
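
/*
 * Per-call state used when copying a page out to user space: the pmap
 * window (and its backing storage) used to map the page, the page itself,
 * and whether laundering was started on it.
 */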
struct vm_object_copy_data
{
  struct pmap_window wstore;
  struct pmap_window *window;
  struct vm_page *page;
  int washed;
};

static int __init
vm_object_bootstrap (void)
{
  return (0);
}

INIT_OP_DEFINE (vm_object_bootstrap,
                INIT_OP_DEP (mutex_setup, true),
                INIT_OP_DEP (rdxtree_setup, true),
                INIT_OP_DEP (vm_page_setup, true));

static int __init
vm_object_setup (void)
{
  kmem_cache_init (&vm_object_cache, "vm_object",
                   sizeof (struct vm_object), 0, NULL, 0);
  return (0);
}

INIT_OP_DEFINE (vm_object_setup,
                INIT_OP_DEP (kmem_setup, true));
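
/*
 * Initialize an object. For external objects, the context is the capability
 * channel backing the object; otherwise, it is the page_get pager callback
 * used to fill pages on demand.
 */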
static void
vm_object_init (struct vm_object *object, uint32_t flags, void *ctx)
{
  mutex_init (&object->lock);
  rdxtree_init (&object->pages, RDXTREE_ALLOC_SLEEP);
  object->nr_pages = 0;
  object->refcount = 1;
  object->flags = flags;

  if (flags & VM_OBJECT_EXTERNAL)
    object->channel = (struct cap_channel *)ctx;
  else
    object->page_get = (typeof (object->page_get))ctx;
}

int
vm_object_create (struct vm_object **objp, uint32_t flags, void *ctx)
{
  struct vm_object *ret = kmem_cache_alloc (&vm_object_cache);
  if (! ret)
    return (ENOMEM);

  vm_object_init (ret, flags, ctx);
  *objp = ret;
  return (0);
}
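
/*
 * Object destruction. The object may still be visible to lockless readers,
 * so the release of its memory is deferred through RCU: the embedded work
 * item returns the object to its cache once a grace period has elapsed.
 * External objects also drop their reference on the backing channel.
 */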
static void
vm_object_work_fini (struct work *work)
{
  kmem_cache_free (&vm_object_cache, structof (work, struct vm_object, work));
}

void
vm_object_destroy (struct vm_object *object)
{
  assert (object->nr_pages == 0);
  rdxtree_remove_all (&object->pages);

  if (object->flags & VM_OBJECT_EXTERNAL)
    cap_base_rel (object->channel);

  work_init (&object->work, vm_object_work_fini);
  rcu_defer (&object->work);
}
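
/*
 * Insert a page at the given offset, or replace the page expected to be
 * there. The page is first atomically claimed for this object; EAGAIN is
 * returned if it already belongs to another one. When a replacement drops
 * the previous page's last reference, that page is freed once the object
 * lock has been released.
 */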
int
vm_object_swap (struct vm_object *object, struct vm_page *page,
                uint64_t offset, struct vm_page *expected)
{
  assert (vm_page_aligned (offset));
  assert (vm_page_referenced (page));

  if (!atomic_cas_bool_acq (&page->object, NULL, object))
    // Page belongs to a different object.
    return (EAGAIN);

  mutex_lock (&object->lock);

  struct vm_page *prev = NULL;
  void **slot;
  int error = rdxtree_insert_slot (&object->pages, vm_page_btop (offset),
                                   page, &slot);
  if (error)
    {
      if (error != EBUSY || atomic_load_rlx (slot) != expected)
        goto skip;

      /*
       * Replace the page slot. Also, if this is the page's last
       * reference, free it after the critical section.
       */
      prev = rdxtree_replace_slot (slot, page);
      if (!vm_page_unref_nofree (prev))
        prev = NULL;
    }

  page->offset = offset;
  ++object->nr_pages;
  assert (object->nr_pages != 0);
  vm_object_ref (object);
  mutex_unlock (&object->lock);

  if (prev)
    vm_page_free (prev, 0, VM_PAGE_SLEEP);

  return (0);

skip:
  mutex_unlock (&object->lock);
  return (error);
}
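
/*
 * Remove the pages in the range [start, end). A page is only detached and
 * freed if the reference dropped here was its last one and it is either
 * clean or the object does not flush its pages; one object reference is
 * then dropped per detached page.
 */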
void
vm_object_remove (struct vm_object *object, uint64_t start, uint64_t end)
{
  assert (vm_page_aligned (start));
  assert (vm_page_aligned (end));
  assert (start <= end);

  struct list pages;
  list_init (&pages);
  uint32_t cnt = 0, no_flush = !(object->flags & VM_OBJECT_FLUSHES);

  {
    struct rdxtree_iter it;
    struct vm_page *page = NULL;

    rdxtree_iter_init (&it);
    MUTEX_GUARD (&object->lock);

    for (uint64_t offset = start; offset < end; offset += PAGE_SIZE)
      {
        it.key = vm_page_btop (offset);
        void **slot = rdxtree_lookup_common (&object->pages, it.key, true,
                                             &it.node, &it.index);
        if (slot)
          {
            page = atomic_load_rlx (slot);
            break;
          }
      }

    if (! page)
      return;

    do
      {
        int idx = it.index;
        void *node = it.node;
        struct vm_page *next = rdxtree_walk (&object->pages, &it);

        if (vm_page_unref_nofree (page) &&
            (page->dirty == VM_PAGE_CLEAN || no_flush))
          {
            rdxtree_remove_node_idx (&object->pages, node, idx);
            vm_page_unlink (page);
            list_insert_tail (&pages, &page->node);
            ++cnt;
          }

        page = next;
      }
    while (page && page->offset < end);

    assert (object->nr_pages >= cnt);
    object->nr_pages -= cnt;
  }

  vm_object_unref_many (object, cnt);
  vm_page_list_free (&pages);
}
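
/*
 * Detach a single page from the object if it is still present at its
 * recorded offset, dropping the object reference that the page held.
 */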
void
vm_object_detach (struct vm_object *object, struct vm_page *page)
{
  MUTEX_GUARD (&object->lock);

  void *node;
  int idx;
  void **slot = rdxtree_lookup_common (&object->pages,
                                       vm_page_btop (page->offset), true,
                                       &node, &idx);
  if (!slot || atomic_load_rlx (slot) != page)
    return;

  rdxtree_remove_node_idx (&object->pages, node, idx);
  --object->nr_pages;
  vm_object_unref (object);
}
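
/*
 * Lockless lookup: under RCU, find the page at the given offset and try to
 * take a reference on it, retrying if the attempt fails because the page is
 * being concurrently released. Returns NULL if no page is present.
 */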
struct vm_page*
vm_object_lookup (struct vm_object *object, uint64_t offset)
{
  RCU_GUARD ();
  while (1)
    {
      struct vm_page *page = rdxtree_lookup (&object->pages,
                                             vm_page_btop (offset));
      if (!page || vm_page_tryref (page) == 0)
        return (page);
    }
}
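
/*
 * Anonymous objects are zero-filled on demand: the pager callback simply
 * clears the destination buffer and reports the number of pages produced.
 */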
static int
vm_object_anon_pager_get (struct vm_object *ap __unused, uint64_t off __unused,
                          size_t size, int prot __unused, void *dst)
{
  memset (dst, 0, size);
  return (size >> PAGE_SHIFT);
}

int
vm_object_anon_create (struct vm_object **outp)
{
  return (vm_object_create (outp, 0, vm_object_anon_pager_get));
}
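
/*
 * Illustrative sketch only, not part of the original file: using the
 * functions above, a caller holding a referenced page that no object owns
 * yet (obtained from the vm_page allocator, outside this file) could do
 * roughly:
 *
 *   struct vm_object *obj;
 *   int error = vm_object_anon_create (&obj);
 *   if (! error)
 *     {
 *       error = vm_object_swap (obj, page, 0, NULL);
 *       struct vm_page *p2 = vm_object_lookup (obj, 0);   // Takes a reference.
 *       if (p2)
 *         vm_page_unref (p2);
 *     }
 */

/*
 * Write the offsets of the object's dirty pages into the user supplied
 * buffer, up to offset_cnt entries. Returns the number of offsets written,
 * or a negative error code if the user memory is invalid or faults.
 */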
ssize_t
vm_object_list_dirty (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);
  RCU_GUARD ();

  if (unlikely (error))
    return (-error);

  struct cap_page_info pg = *upg;
  void *out = pg.offsets;
  uint32_t cnt = pg.offset_cnt;

  if (! cnt)
    return (0);
  else if (!user_check_range (out, cnt * sizeof (uint64_t)))
    return (-EFAULT);

  struct rdxtree_iter it;
  struct vm_page *page;
  rdxtree_for_each (&obj->pages, &it, page)
    {
      if (page->dirty == VM_PAGE_CLEAN)
        continue;

      ((union user_ua *)out)->u8 = page->offset;
      if (--cnt == 0)
        break;

      out = (char *)out + sizeof (uint64_t);
    }

  return ((ssize_t)(pg.offset_cnt - cnt));
}
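
/*
 * Release the resources used when copying a single page: unmap the pmap
 * window, end the laundering process if it was started (unless the copy
 * failed), and drop the page reference taken by the lookup.
 */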
static void
vm_object_copy_data_fini (struct vm_object_copy_data *dp, bool err)
{
  if (dp->window)
    {
      pmap_window_put (dp->window);
      dp->window = NULL;
    }

  if (dp->washed && !err)
    vm_page_wash_end (dp->page);

  vm_page_unref (dp->page);
}
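
/*
 * Copy a single page into the user provided iovecs, reading its contents
 * through a temporary pmap window. If the page is dirty, the laundering
 * process is started first and the page is marked read-only. Returns the
 * number of bytes copied, or a negative error code from the iovec iterator.
 */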
static ssize_t
vm_object_copy_single_page (struct vm_object_copy_data *dp,
                            struct ipc_iov_iter *it, struct iovec *iov)
{
  dp->washed = 0;
  if (dp->page->dirty != VM_PAGE_CLEAN)
    { // Begin the page laundering process and mark it as read-only.
      vm_page_wash_begin (dp->page);
      vm_rset_mark_ro (dp->page);
      dp->washed = 1;
    }

  // Get the window to perform the copy.
  dp->window = pmap_window_load (0, &dp->wstore);
  pmap_window_set (dp->window, vm_page_to_pa (dp->page));
  const char *src = pmap_window_va (dp->window);
  ssize_t ret = 0;

  // Copy the page into the user buffer.
  while (1)
    {
      ssize_t tmp = MIN (PAGE_SIZE - ret, (ssize_t)iov->iov_len);
      memcpy (iov->iov_base, src, tmp);
      src += tmp, ret += tmp;
      iovec_adv (iov, tmp);

      if (ret == PAGE_SIZE)
        break;

      iov = ipc_iov_iter_usrnext (it, &ret);
      if (! iov)
        break;
    }

  // Cleanup.
  vm_object_copy_data_fini (dp, ret < 0);
  return (ret);
}
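
/*
 * Copy the pages named by the user's offset array into the user's iovecs.
 * Offsets of pages that are not resident are written back with their low
 * bit set; with CAP_PAGES_ADV_SKIP, up to a page of the current iovec is
 * skipped for them as well. Returns the total number of bytes copied or a
 * negative error code.
 */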
ssize_t
vm_object_copy_pages (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct vm_object_copy_data data = { .window = NULL, .page = NULL };
  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);

  if (unlikely (error))
    {
      vm_object_copy_data_fini (&data, true);
      return (-error);
    }

  struct cap_page_info pg = *upg;
  void *offs = pg.offsets;

  if (!user_check_range (offs, pg.offset_cnt * sizeof (uint64_t)) ||
      !P2ALIGNED ((uintptr_t)pg.iovs, alignof (struct iovec)))
    return (-EFAULT);

  ssize_t ret = 0;
  struct ipc_iov_iter it;
  ipc_iov_iter_init (&it, pg.iovs, pg.iov_cnt);

  for (uint32_t i = 0; i < pg.offset_cnt; ++i)
    {
      _Auto uptr = (union user_ua *)offs;
      uint64_t offset = uptr->u8;

      data.page = vm_object_lookup (obj, offset);
      offs = (char *)offs + sizeof (uint64_t);
      _Auto dv = ipc_iov_iter_usrnext (&it, &ret);

      if (! data.page)
        {
          uptr->u8 = offset | 1;
          if ((pg.flags & CAP_PAGES_ADV_SKIP) && dv)
            iovec_adv (dv, MIN (PAGE_SIZE, dv->iov_len));

          continue;
        }
      else if (! dv)
        break;

      ssize_t tmp = vm_object_copy_single_page (&data, &it, dv);
      if (tmp < 0)
        return (tmp);

      ret += tmp;
    }

  return (ret);
}
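
/*
 * Gather the offsets of dirty pages into the user's buffer, start
 * laundering them (marking them read-only), and map a read-only range
 * covering the first through last reported page into the caller's address
 * space. The updated cap_page_info, including the mapping entry and the
 * CAP_PAGES_MORE flag when more dirty pages remain, is copied back to user
 * space. Returns the number of pages reported or a negative error code.
 */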
int
vm_object_map_dirty (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);

  if (unlikely (error))
    {
      rcu_read_leave ();
      return (-error);
    }

  rcu_read_enter ();
  _Auto pg = *upg;

  if (!pg.offset_cnt ||
      !user_check_range (pg.offsets, pg.offset_cnt * sizeof (uint64_t)))
    {
      rcu_read_leave ();
      return (pg.offset_cnt ? -EFAULT : 0);
    }

  struct rdxtree_iter it;
  struct vm_page *page;
  int ret = 0;
  void *out = pg.offsets;
  uint32_t cnt = pg.offset_cnt;
  uint64_t first, last;

  rdxtree_for_each (&obj->pages, &it, page)
    {
      if (page->dirty != VM_PAGE_DIRTY)
        continue;
      else if (--cnt == (uint32_t)-1)
        {
          pg.flags |= CAP_PAGES_MORE;
          break;
        }

      ((union user_ua *)out)->u8 = page->offset;
      out = (char *)out + sizeof (uint64_t);

      if (! ret)
        first = page->offset;

      last = page->offset;
      vm_page_wash_begin (page);
      vm_rset_mark_ro (page);
      ++ret;
    }

  rcu_read_leave ();
  if (! ret)
    return (ret);

  int map_flags = VM_MAP_FLAGS (VM_PROT_READ, VM_PROT_READ, VM_INHERIT_NONE,
                                VM_ADV_DEFAULT, VM_MAP_CLEAN);

  pg.vme.size = (size_t)(last - first) + PAGE_SIZE;
  pg.vme.prot = pg.vme.max_prot = VM_PROT_READ;

  int rv = vm_map_enter (vm_map_self (), &pg.vme.addr, pg.vme.size,
                         map_flags, obj, first);
  if (rv != 0 || (rv = user_copy_to (upg, &pg, sizeof (pg))) != 0)
    ret = -rv;

  return (ret);
}