123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495 |
- /*
- * Copyright (c) 2017 Richard Braun.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- *
- * This implementation is based on the paper "A lockless pagecache in Linux"
- * by Nick Piggin. It allows looking up pages without contention on VM objects.
- */
- #include <assert.h>
- #include <errno.h>
- #include <stddef.h>
- #include <stdint.h>
- #include <kern/capability.h>
- #include <kern/init.h>
- #include <kern/kmem.h>
- #include <kern/list.h>
- #include <kern/mutex.h>
- #include <kern/rcu.h>
- #include <kern/unwind.h>
- #include <kern/user.h>
- #include <vm/object.h>
- #include <vm/page.h>
- #include <vm/rset.h>
- #include <machine/page.h>
- static struct kmem_cache vm_object_cache;
/*
 * Per-call scratch state for copying a page's contents to user space.
 * Kept in one structure so the unwind fixup can release everything
 * acquired so far if a user-memory fault occurs mid-copy.
 */
struct vm_object_copy_data
{
  struct pmap_window wstore;    // Backing storage for the window below.
  struct pmap_window *window;   // Active mapping window, NULL when none held.
  struct vm_page *page;         // Page currently being copied.
  int washed;                   // Nonzero if laundering was begun on the page.
};
/*
 * Early bootstrap step for the VM object module.
 * Nothing to do yet; this op exists to anchor the dependencies below
 * so later consumers can depend on vm_object_bootstrap.
 */
static int __init
vm_object_bootstrap (void)
{
  return (0);
}

INIT_OP_DEFINE (vm_object_bootstrap,
                INIT_OP_DEP (mutex_setup, true),
                INIT_OP_DEP (rdxtree_setup, true),
                INIT_OP_DEP (vm_page_setup, true));
/*
 * Late setup step: create the slab cache from which VM objects
 * are allocated.
 */
static int __init
vm_object_setup (void)
{
  kmem_cache_init (&vm_object_cache, "vm_object",
                   sizeof (struct vm_object), 0, NULL, 0);
  return (0);
}

INIT_OP_DEFINE (vm_object_setup,
                INIT_OP_DEP (kmem_setup, true));
- static void
- vm_object_init (struct vm_object *object, uint32_t flags, void *ctx)
- {
- mutex_init (&object->lock);
- rdxtree_init (&object->pages, RDXTREE_ALLOC_SLEEP);
- object->nr_pages = 0;
- object->refcount = 1;
- object->flags = flags;
- if (flags & VM_OBJECT_EXTERNAL)
- object->channel = (struct cap_channel *)ctx;
- else
- object->page_get = (typeof (object->page_get))ctx;
- }
- int
- vm_object_create (struct vm_object **objp, uint32_t flags, void *ctx)
- {
- struct vm_object *ret = kmem_cache_alloc (&vm_object_cache);
- if (! ret)
- return (ENOMEM);
- vm_object_init (ret, flags, ctx);
- *objp = ret;
- return (0);
- }
- static void
- vm_object_work_fini (struct work *work)
- {
- kmem_cache_free (&vm_object_cache, structof (work, struct vm_object, work));
- }
/*
 * Destroy a VM object whose last reference has been dropped.
 *
 * The page tree must already be empty (only the tree nodes remain to
 * be torn down). External objects release their channel reference.
 * The memory itself is reclaimed through an RCU-deferred work item so
 * concurrent lockless lookups may still safely dereference the object.
 */
void
vm_object_destroy (struct vm_object *object)
{
  assert (object->nr_pages == 0);
  rdxtree_remove_all (&object->pages);

  if (object->flags & VM_OBJECT_EXTERNAL)
    cap_base_rel (object->channel);

  work_init (&object->work, vm_object_work_fini);
  rcu_defer (&object->work);
}
/*
 * Insert a page into an object at the given offset, optionally
 * replacing an existing one.
 *
 * The caller must hold a reference on the page, and the page must not
 * yet belong to an object. If a page is already present at the offset,
 * it is replaced only when it matches 'expected'; otherwise the
 * insertion error (e.g. EBUSY) is returned. Returns EAGAIN when the
 * page is already owned by another object.
 */
int
vm_object_swap (struct vm_object *object, struct vm_page *page,
                uint64_t offset, struct vm_page *expected)
{
  assert (vm_page_aligned (offset));
  assert (vm_page_referenced (page));

  // Claim ownership of the page before linking it into the tree.
  if (!atomic_cas_bool_acq (&page->object, NULL, object))
    // Page belongs to a different object.
    return (EAGAIN);

  mutex_lock (&object->lock);
  struct vm_page *prev = NULL;
  void **slot;
  int error = rdxtree_insert_slot (&object->pages, vm_page_btop (offset),
                                   page, &slot);
  if (error)
    {
      if (error != EBUSY || atomic_load_rlx (slot) != expected)
        // NOTE(review): page->object stays set here - presumably the
        // caller undoes the claim on error; confirm against callers.
        goto skip;

      /*
       * Replace the page slot. Also, if this is the page's last
       * reference, free it after the critical section.
       */
      prev = rdxtree_replace_slot (slot, page);
      if (!vm_page_unref_nofree (prev))
        prev = NULL;
    }

  // NOTE(review): nr_pages is incremented on the replacement path too -
  // confirm this pairs correctly with the decrement on removal.
  page->offset = offset;
  ++object->nr_pages;
  assert (object->nr_pages != 0);
  vm_object_ref (object);
  mutex_unlock (&object->lock);

  // Free the replaced page outside the critical section.
  if (prev)
    vm_page_free (prev, 0, VM_PAGE_SLEEP);

  return (0);

skip:
  mutex_unlock (&object->lock);
  return (error);
}
/*
 * Remove the pages of an object in the range [start, end).
 *
 * A page is unlinked only when this call drops its last reference and
 * it doesn't need flushing first (clean, or the object doesn't flush).
 * Unlinked pages are collected on a local list and freed in bulk after
 * the object lock is released, along with the object references they
 * held.
 */
void
vm_object_remove (struct vm_object *object, uint64_t start, uint64_t end)
{
  assert (vm_page_aligned (start));
  assert (vm_page_aligned (end));
  assert (start <= end);

  struct list pages;
  list_init (&pages);
  uint32_t cnt = 0, no_flush = !(object->flags & VM_OBJECT_FLUSHES);

  {
    struct rdxtree_iter it;
    struct vm_page *page = NULL;
    rdxtree_iter_init (&it);
    MUTEX_GUARD (&object->lock);

    // Position the iterator on the first resident page at or after start.
    for (uint64_t offset = start; offset < end; offset += PAGE_SIZE)
      {
        it.key = vm_page_btop (offset);
        void **slot = rdxtree_lookup_common (&object->pages, it.key, true,
                                             &it.node, &it.index);
        if (slot)
          {
            page = atomic_load_rlx (slot);
            break;
          }
      }

    if (! page)
      return;

    do
      {
        // Capture the slot location before the iterator advances past it.
        int idx = it.index;
        void *node = it.node;
        struct vm_page *next = rdxtree_walk (&object->pages, &it);

        if (vm_page_unref_nofree (page) &&
            (page->dirty == VM_PAGE_CLEAN || no_flush))
          {
            rdxtree_remove_node_idx (&object->pages, node, idx);
            vm_page_unlink (page);
            list_insert_tail (&pages, &page->node);
            ++cnt;
          }

        page = next;
      }
    while (page && page->offset < end);

    assert (object->nr_pages >= cnt);
    object->nr_pages -= cnt;
  }

  // Past the lock: drop the per-page object references and free pages.
  vm_object_unref_many (object, cnt);
  vm_page_list_free (&pages);
}
- void
- vm_object_detach (struct vm_object *object, struct vm_page *page)
- {
- MUTEX_GUARD (&object->lock);
- void *node;
- int idx;
- void **slot = rdxtree_lookup_common (&object->pages,
- vm_page_btop (page->offset), true,
- &node, &idx);
- if (!slot || atomic_load_rlx (slot) != page)
- return;
- rdxtree_remove_node_idx (&object->pages, node, idx);
- --object->nr_pages;
- vm_object_unref (object);
- }
- struct vm_page*
- vm_object_lookup (struct vm_object *object, uint64_t offset)
- {
- RCU_GUARD ();
- while (1)
- {
- struct vm_page *page = rdxtree_lookup (&object->pages,
- vm_page_btop (offset));
- if (!page || vm_page_tryref (page) == 0)
- return (page);
- }
- }
- static int
- vm_object_anon_pager_get (struct vm_object *ap __unused, uint64_t off __unused,
- size_t size, int prot __unused, void *dst)
- {
- memset (dst, 0, size);
- return (size >> PAGE_SHIFT);
- }
- int
- vm_object_anon_create (struct vm_object **outp)
- {
- return (vm_object_create (outp, 0, vm_object_anon_pager_get));
- }
/*
 * Write the offsets of an object's dirty pages into the user buffer
 * described by 'upg'.
 *
 * Returns the number of offsets written, or a negative error code
 * (-EFAULT for invalid user memory). The walk runs under RCU, and a
 * user-memory fault during it is recovered through the unwind fixup,
 * returning the fault error.
 */
ssize_t
vm_object_list_dirty (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);
  RCU_GUARD ();

  if (unlikely (error))
    return (-error);

  struct cap_page_info pg = *upg;
  void *out = pg.offsets;
  uint32_t cnt = pg.offset_cnt;

  if (! cnt)
    return (0);
  else if (!user_check_range (out, cnt * sizeof (uint64_t)))
    return (-EFAULT);

  struct rdxtree_iter it;
  struct vm_page *page;
  rdxtree_for_each (&obj->pages, &it, page)
    {
      if (page->dirty == VM_PAGE_CLEAN)
        continue;

      // Store through the user-access union; may fault into the fixup.
      ((union user_ua *)out)->u8 = page->offset;
      if (--cnt == 0)
        break;

      out = (char *)out + sizeof (uint64_t);
    }

  // Number of entries actually written.
  return ((ssize_t)(pg.offset_cnt - cnt));
}
- static void
- vm_object_copy_data_fini (struct vm_object_copy_data *dp, bool err)
- {
- if (dp->window)
- {
- pmap_window_put (dp->window);
- dp->window = NULL;
- }
- if (dp->washed && !err)
- vm_page_wash_end (dp->page);
- vm_page_unref (dp->page);
- }
/*
 * Copy one page's contents into the user buffers described by the iov
 * iterator, starting with 'iov'.
 *
 * If the page is dirty, laundering is begun and the page is marked
 * read-only first, so writes made during the copy are detected.
 * Returns the number of bytes copied (at most PAGE_SIZE, possibly
 * spread over several iovecs), or a negative error from the iterator.
 * Releases the copy state (window, wash, page reference) before
 * returning.
 */
static ssize_t
vm_object_copy_single_page (struct vm_object_copy_data *dp,
                            struct ipc_iov_iter *it, struct iovec *iov)
{
  dp->washed = 0;
  if (dp->page->dirty != VM_PAGE_CLEAN)
    { // Begin the page laundering process and mark it as read-only.
      vm_page_wash_begin (dp->page);
      vm_rset_mark_ro (dp->page);
      dp->washed = 1;
    }

  // Get the window to perform the copy.
  dp->window = pmap_window_load (0, &dp->wstore);
  pmap_window_set (dp->window, vm_page_to_pa (dp->page));
  const char *src = pmap_window_va (dp->window);
  ssize_t ret = 0;

  // Copy the page into the user buffer.
  while (1)
    {
      // Copy no more than what remains of the page and of this iovec.
      ssize_t tmp = MIN (PAGE_SIZE - ret, (ssize_t)iov->iov_len);
      memcpy (iov->iov_base, src, tmp);
      src += tmp, ret += tmp;
      iovec_adv (iov, tmp);

      if (ret == PAGE_SIZE)
        break;

      // Advance to the next user iovec; may set ret to a negative error.
      iov = ipc_iov_iter_usrnext (it, &ret);
      if (! iov)
        break;
    }

  // Cleanup.
  vm_object_copy_data_fini (dp, ret < 0);
  return (ret);
}
/*
 * Copy the contents of the pages whose offsets are listed in 'upg'
 * into the user iovecs it describes.
 *
 * Offsets of pages that aren't resident get their low bit set in the
 * user array so the caller can tell them apart; with
 * CAP_PAGES_ADV_SKIP, the current iovec is also advanced by up to a
 * page for such holes. Returns the total number of bytes copied or a
 * negative error code. A user-memory fault unwinds through the fixup,
 * which releases any copy state acquired so far.
 */
ssize_t
vm_object_copy_pages (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct vm_object_copy_data data = { .window = NULL, .page = NULL };
  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);

  if (unlikely (error))
    {
      // Fault recovery: drop whatever the copy loop had acquired.
      vm_object_copy_data_fini (&data, true);
      return (-error);
    }

  struct cap_page_info pg = *upg;
  void *offs = pg.offsets;

  if (!user_check_range (offs, pg.offset_cnt * sizeof (uint64_t)) ||
      !P2ALIGNED ((uintptr_t)pg.iovs, alignof (struct iovec)))
    return (-EFAULT);

  ssize_t ret = 0;
  struct ipc_iov_iter it;
  ipc_iov_iter_init (&it, pg.iovs, pg.iov_cnt);

  for (uint32_t i = 0; i < pg.offset_cnt; ++i)
    {
      _Auto uptr = (union user_ua *)offs;
      uint64_t offset = uptr->u8;
      data.page = vm_object_lookup (obj, offset);
      offs = (char *)offs + sizeof (uint64_t);
      _Auto dv = ipc_iov_iter_usrnext (&it, &ret);

      if (! data.page)
        {
          // Not resident: flag the offset for the caller.
          uptr->u8 = offset | 1;
          if ((pg.flags & CAP_PAGES_ADV_SKIP) && dv)
            iovec_adv (dv, MIN (PAGE_SIZE, dv->iov_len));
          continue;
        }
      else if (! dv)
        break;

      ssize_t tmp = vm_object_copy_single_page (&data, &it, dv);
      if (tmp < 0)
        return (tmp);

      ret += tmp;
    }

  return (ret);
}
/*
 * Collect an object's dirty pages, begin laundering them, and map the
 * covering range read-only into the caller's address space.
 *
 * The offsets of the selected pages are written to the user buffer in
 * 'upg', and the resulting mapping parameters are copied back to it.
 * Returns the number of pages processed, 0 if none were dirty, or a
 * negative error code. CAP_PAGES_MORE is set when more dirty pages
 * remain than the buffer could hold.
 *
 * NOTE(review): the fixup is saved before rcu_read_enter but its error
 * path calls rcu_read_leave - this assumes faults can only occur
 * inside the read-side section (all user accesses happen there);
 * confirm user_copy_to below handles its own faults.
 */
int
vm_object_map_dirty (struct vm_object *obj, struct cap_page_info *upg)
{
  if (!user_check_range (upg, sizeof (*upg)))
    return (-EFAULT);

  struct unw_fixup fixup;
  int error = unw_fixup_save (&fixup);

  if (unlikely (error))
    {
      rcu_read_leave ();
      return (-error);
    }

  rcu_read_enter ();
  _Auto pg = *upg;

  if (!pg.offset_cnt ||
      !user_check_range (pg.offsets, pg.offset_cnt * sizeof (uint64_t)))
    {
      rcu_read_leave ();
      return (pg.offset_cnt ? -EFAULT : 0);
    }

  struct rdxtree_iter it;
  struct vm_page *page;
  int ret = 0;
  void *out = pg.offsets;
  uint32_t cnt = pg.offset_cnt;
  uint64_t first, last;   // Offsets of the first/last selected pages.

  rdxtree_for_each (&obj->pages, &it, page)
    {
      if (page->dirty != VM_PAGE_DIRTY)
        continue;
      else if (--cnt == (uint32_t)-1)
        {
          // Buffer exhausted; tell the caller more pages remain.
          pg.flags |= CAP_PAGES_MORE;
          break;
        }

      ((union user_ua *)out)->u8 = page->offset;
      out = (char *)out + sizeof (uint64_t);

      if (! ret)
        first = page->offset;

      last = page->offset;

      // Start laundering and write-protect the page before mapping it.
      vm_page_wash_begin (page);
      vm_rset_mark_ro (page);
      ++ret;
    }

  rcu_read_leave ();
  if (! ret)
    return (ret);

  int map_flags = VM_MAP_FLAGS (VM_PROT_READ, VM_PROT_READ, VM_INHERIT_NONE,
                                VM_ADV_DEFAULT, VM_MAP_CLEAN);

  // Map the whole [first, last] page range read-only into the caller.
  pg.vme.size = (size_t)(last - first) + PAGE_SIZE;
  pg.vme.prot = pg.vme.max_prot = VM_PROT_READ;
  int rv = vm_map_enter (vm_map_self (), &pg.vme.addr, pg.vme.size,
                         map_flags, obj, first);

  // Report mapping or copy-back failure as a negative error.
  if (rv != 0 || (rv = user_copy_to (upg, &pg, sizeof (pg))) != 0)
    ret = -rv;

  return (ret);
}
|