vm_glue.c
/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
#include <sys/cdefs.h>

#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
#include "opt_kstack_usage_prof.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/domainset.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/shm.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>
#include <sys/vmem.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_domainset.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_phys.h>

#include <machine/cpu.h>
#if VM_NRESERVLEVEL > 1
#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \
    PAGE_SHIFT)
#elif VM_NRESERVLEVEL > 0
#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
#else
#define KVA_KSTACK_QUANTUM_SHIFT (8 + PAGE_SHIFT)
#endif
#define KVA_KSTACK_QUANTUM (1ul << KVA_KSTACK_QUANTUM_SHIFT)
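
/*
 * KVA_KSTACK_QUANTUM is the alignment and base granularity used when the
 * per-domain kstack arenas import KVA from the kernel arenas (see
 * vm_thread_kstack_import_quantum() below).  It corresponds to the largest
 * superpage reservation size when reservations are configured, and to 256
 * pages otherwise.
 */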

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  In most cases
 * just checking the vm_map_entry is sufficient within the kernel's address
 * space.
 */
bool
kernacc(void *addr, int len, int rw)
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));

	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr)
		return (false);

	prot = rw;
	saddr = trunc_page((vm_offset_t)addr);
	eaddr = round_page((vm_offset_t)addr + len);
	vm_map_lock_read(kernel_map);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	vm_map_unlock_read(kernel_map);
	return (rv == TRUE);
}

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  vmapbuf(),
 * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
 * used in conjunction with this call.
 */
bool
useracc(void *addr, int len, int rw)
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
	prot = rw;
	map = &curproc->p_vmspace->vm_map;
	if ((vm_offset_t)addr + len > vm_map_max(map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (false);
	}
	vm_map_lock_read(map);
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page((vm_offset_t)addr + len), prot);
	vm_map_unlock_read(map);
	return (rv == TRUE);
}

int
vslock(void *addr, size_t len)
{
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;

	last = (vm_offset_t)addr + len;
	start = trunc_page((vm_offset_t)addr);
	end = round_page(last);
	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
	if (error == KERN_SUCCESS) {
		curthread->td_vslock_sz += len;
		return (0);
	}

	/*
	 * Return EFAULT on error to match copy{in,out}() behaviour
	 * rather than returning ENOMEM like mlock() would.
	 */
	return (EFAULT);
}

void
vsunlock(void *addr, size_t len)
{
	/* Rely on the parameter sanity checks performed by vslock(). */
	MPASS(curthread->td_vslock_sz >= len);
	curthread->td_vslock_sz -= len;
	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
}

/*
 * Pin the page contained within the given object at the given offset.  If the
 * page is not resident, allocate and load it using the given object's pager.
 * Return the pinned page if successful; otherwise, return NULL.
 */
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
{
	vm_page_t m;
	vm_pindex_t pindex;

	pindex = OFF_TO_IDX(offset);
	(void)vm_page_grab_valid_unlocked(&m, object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
	return (m);
}

/*
 * Return a CPU private mapping to the page at the given offset within the
 * given object.  The page is pinned before it is mapped.
 */
struct sf_buf *
vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
{
	vm_page_t m;

	m = vm_imgact_hold_page(object, offset);
	if (m == NULL)
		return (NULL);
	sched_pin();
	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
}

/*
 * Destroy the given CPU private mapping and unpin the page that it mapped.
 */
void
vm_imgact_unmap_page(struct sf_buf *sf)
{
	vm_page_t m;

	m = sf_buf_page(sf);
	sf_buf_free(sf);
	sched_unpin();
	vm_page_unwire(m, PQ_ACTIVE);
}

void
vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
{
	pmap_sync_icache(map->pmap, va, sz);
}

static vm_object_t kstack_object;
static vm_object_t kstack_alt_object;
static uma_zone_t kstack_cache;
static int kstack_cache_size;
static vmem_t *vmd_kstack_arena[MAXMEMDOM];
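
/*
 * Apply a runtime change of vm.kstack_cache_size to the UMA cache zone
 * that holds free kernel stacks.
 */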
static int
sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS)
{
	int error, oldsize;

	oldsize = kstack_cache_size;
	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0 && req->newptr && oldsize != kstack_cache_size)
		uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
	return (error);
}
SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &kstack_cache_size, 0,
    sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks");

/*
 * Allocate a virtual address range from a domain kstack arena, following
 * the specified NUMA policy.
 */
static vm_offset_t
vm_thread_alloc_kstack_kva(vm_size_t size, int domain)
{
#ifndef __ILP32__
	int rv;
	vmem_t *arena;
	vm_offset_t addr = 0;

	size = round_page(size);
	/* Allocate from the kernel arena for non-standard kstack sizes. */
	if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
		arena = vm_dom[domain].vmd_kernel_arena;
	} else {
		arena = vmd_kstack_arena[domain];
	}

	rv = vmem_alloc(arena, size, M_BESTFIT | M_NOWAIT, &addr);
	if (rv == ENOMEM)
		return (0);
	KASSERT(atop(addr - VM_MIN_KERNEL_ADDRESS) %
	    (kstack_pages + KSTACK_GUARD_PAGES) == 0,
	    ("%s: allocated kstack KVA not aligned to multiple of kstack size",
	    __func__));

	return (addr);
#else
	return (kva_alloc(size));
#endif
}

/*
 * Release a region of kernel virtual memory
 * allocated from the kstack arena.
 */
static __noinline void
vm_thread_free_kstack_kva(vm_offset_t addr, vm_size_t size, int domain)
{
	vmem_t *arena;

	size = round_page(size);
#ifdef __ILP32__
	arena = kernel_arena;
#else
	arena = vmd_kstack_arena[domain];
	if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
		arena = vm_dom[domain].vmd_kernel_arena;
	}
#endif
	vmem_free(arena, addr, size);
}

static vmem_size_t
vm_thread_kstack_import_quantum(void)
{
#ifndef __ILP32__
	/*
	 * The kstack_quantum is larger than KVA_QUANTUM to account
	 * for holes induced by guard pages.
	 */
	return (KVA_KSTACK_QUANTUM * (kstack_pages + KSTACK_GUARD_PAGES));
#else
	return (KVA_KSTACK_QUANTUM);
#endif
}

/*
 * Import KVA from a parent arena into the kstack arena.  Imports must be
 * a multiple of kernel stack pages + guard pages in size.
 *
 * Kstack VA allocations need to be aligned so that the linear KVA pindex
 * is divisible by the total number of kstack VA pages.  This is necessary to
 * make vm_kstack_pindex work properly.
 *
 * We import a multiple of KVA_KSTACK_QUANTUM-sized region from the parent
 * arena.  The actual size used by the kstack arena is one kstack smaller to
 * allow for the necessary alignment adjustments to be made.
 */
static int
vm_thread_kstack_arena_import(void *arena, vmem_size_t size, int flags,
    vmem_addr_t *addrp)
{
	int error, rem;
	size_t kpages = kstack_pages + KSTACK_GUARD_PAGES;

	KASSERT(atop(size) % kpages == 0,
	    ("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
	    (intmax_t)size, (int)kpages));

	error = vmem_xalloc(arena, vm_thread_kstack_import_quantum(),
	    KVA_KSTACK_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags,
	    addrp);
	if (error) {
		return (error);
	}

	rem = atop(*addrp - VM_MIN_KERNEL_ADDRESS) % kpages;
	if (rem != 0) {
		/* Bump addr to next aligned address */
		*addrp = *addrp + (kpages - rem) * PAGE_SIZE;
	}

	return (0);
}

/*
 * Release KVA from a parent arena into the kstack arena.  Released imports
 * must be a multiple of kernel stack pages + guard pages in size.
 */
static void
vm_thread_kstack_arena_release(void *arena, vmem_addr_t addr, vmem_size_t size)
{
	int rem;
	size_t kpages __diagused = kstack_pages + KSTACK_GUARD_PAGES;

	KASSERT(size % kpages == 0,
	    ("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
	    (intmax_t)size, (int)kpages));

	KASSERT((addr - VM_MIN_KERNEL_ADDRESS) % kpages == 0,
	    ("%s: Address %p is not properly aligned (%p)", __func__,
	    (void *)addr, (void *)VM_MIN_KERNEL_ADDRESS));
	/*
	 * If the address is not KVA_KSTACK_QUANTUM-aligned we have to
	 * decrement it to account for the shift applied in
	 * vm_thread_kstack_arena_import.
	 */
	rem = addr % KVA_KSTACK_QUANTUM;
	if (rem) {
		KASSERT(rem <= ptoa(kpages),
		    ("%s: rem > kpages (%d), (%d)", __func__, rem,
		    (int)kpages));
		addr -= rem;
	}
	vmem_xfree(arena, addr, vm_thread_kstack_import_quantum());
}

/*
 * Create the kernel stack for a new thread.
 */
static vm_offset_t
vm_thread_stack_create(struct domainset *ds, int pages)
{
	vm_page_t ma[KSTACK_MAX_PAGES];
	struct vm_domainset_iter di;
	int req = VM_ALLOC_NORMAL;
	vm_object_t obj;
	vm_offset_t ks;
	int domain, i;

	obj = vm_thread_kstack_size_to_obj(pages);
	if (vm_ndomains > 1)
		obj->domain.dr_policy = ds;
	vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
	do {
		/*
		 * Get a kernel virtual address for this thread's kstack.
		 */
		ks = vm_thread_alloc_kstack_kva(ptoa(pages + KSTACK_GUARD_PAGES),
		    domain);
		if (ks == 0)
			continue;
		ks += ptoa(KSTACK_GUARD_PAGES);

		/*
		 * Allocate physical pages to back the stack.
		 */
		if (vm_thread_stack_back(ks, ma, pages, req, domain) != 0) {
			vm_thread_free_kstack_kva(ks - ptoa(KSTACK_GUARD_PAGES),
			    ptoa(pages + KSTACK_GUARD_PAGES), domain);
			continue;
		}
		if (KSTACK_GUARD_PAGES != 0) {
			pmap_qremove(ks - ptoa(KSTACK_GUARD_PAGES),
			    KSTACK_GUARD_PAGES);
		}
		for (i = 0; i < pages; i++)
			vm_page_valid(ma[i]);
		pmap_qenter(ks, ma, pages);
		return (ks);
	} while (vm_domainset_iter_page(&di, obj, &domain) == 0);

	return (0);
}
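
/*
 * Unmap a kernel stack, free the physical pages backing it, and return its
 * KVA (including the preceding guard pages) to the owning domain's arena.
 */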
static __noinline void
vm_thread_stack_dispose(vm_offset_t ks, int pages)
{
	vm_page_t m;
	vm_pindex_t pindex;
	int i, domain;
	vm_object_t obj = vm_thread_kstack_size_to_obj(pages);

	pindex = vm_kstack_pindex(ks, pages);
	domain = vm_phys_domain(vtophys(ks));
	pmap_qremove(ks, pages);
	VM_OBJECT_WLOCK(obj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(obj, pindex + i);
		if (m == NULL)
			panic("%s: kstack already missing?", __func__);
		KASSERT(vm_page_domain(m) == domain,
		    ("%s: page %p domain mismatch, expected %d got %d",
		    __func__, m, domain, vm_page_domain(m)));
		vm_page_xbusy_claim(m);
		vm_page_unwire_noq(m);
		vm_page_free(m);
	}
	VM_OBJECT_WUNLOCK(obj);
	kasan_mark((void *)ks, ptoa(pages), ptoa(pages), 0);
	vm_thread_free_kstack_kva(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
	    ptoa(pages + KSTACK_GUARD_PAGES), domain);
}

/*
 * Allocate the kernel stack for a new thread.
 */
int
vm_thread_new(struct thread *td, int pages)
{
	vm_offset_t ks;
	u_short ks_domain;

	/* Bounds check */
	if (pages <= 1)
		pages = kstack_pages;
	else if (pages > KSTACK_MAX_PAGES)
		pages = KSTACK_MAX_PAGES;

	ks = 0;
	if (pages == kstack_pages && kstack_cache != NULL)
		ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT);

	/*
	 * Ensure that kstack objects can draw pages from any memory
	 * domain.  Otherwise a local memory shortage can block a process
	 * swap-in.
	 */
	if (ks == 0)
		ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)),
		    pages);
	if (ks == 0)
		return (0);

	ks_domain = vm_phys_domain(vtophys(ks));
	KASSERT(ks_domain >= 0 && ks_domain < vm_ndomains,
	    ("%s: invalid domain for kstack %p", __func__, (void *)ks));
	td->td_kstack = ks;
	td->td_kstack_pages = pages;
	td->td_kstack_domain = ks_domain;
	return (1);
}

/*
 * Dispose of a thread's kernel stack.
 */
void
vm_thread_dispose(struct thread *td)
{
	vm_offset_t ks;
	int pages;

	pages = td->td_kstack_pages;
	ks = td->td_kstack;
	td->td_kstack = 0;
	td->td_kstack_pages = 0;
	td->td_kstack_domain = MAXMEMDOM;
	if (pages == kstack_pages) {
		kasan_mark((void *)ks, 0, ptoa(pages), KASAN_KSTACK_FREED);
		uma_zfree(kstack_cache, (void *)ks);
	} else {
		vm_thread_stack_dispose(ks, pages);
	}
}

/*
 * Calculate kstack pindex.
 *
 * Uses a non-identity mapping if guard pages are
 * active to avoid pindex holes in the kstack object.
 */
vm_pindex_t
vm_kstack_pindex(vm_offset_t ks, int kpages)
{
	vm_pindex_t pindex = atop(ks - VM_MIN_KERNEL_ADDRESS);

#ifdef __ILP32__
	return (pindex);
#else
	/*
	 * Return the linear pindex if guard pages aren't active or if we are
	 * allocating a non-standard kstack size.
	 */
	if (KSTACK_GUARD_PAGES == 0 || kpages != kstack_pages) {
		return (pindex);
	}
	KASSERT(pindex % (kpages + KSTACK_GUARD_PAGES) >= KSTACK_GUARD_PAGES,
	    ("%s: Attempting to calculate kstack guard page pindex", __func__));
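
	/*
	 * Subtract the guard pages that precede this stack and every earlier
	 * stack so that object pindexes stay densely packed.  For example,
	 * with kpages == 4 and KSTACK_GUARD_PAGES == 1, linear pindexes
	 * 1-4, 6-9, 11-14, ... map to object pindexes 0-3, 4-7, 8-11, ...
	 */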
	return (pindex -
	    (pindex / (kpages + KSTACK_GUARD_PAGES) + 1) * KSTACK_GUARD_PAGES);
#endif
}

/*
 * Allocate physical pages, following the specified NUMA policy, to back a
 * kernel stack.
 */
int
vm_thread_stack_back(vm_offset_t ks, vm_page_t ma[], int npages, int req_class,
    int domain)
{
	vm_object_t obj = vm_thread_kstack_size_to_obj(npages);
	vm_pindex_t pindex;
	vm_page_t m;
	int n;

	pindex = vm_kstack_pindex(ks, npages);

	VM_OBJECT_WLOCK(obj);
	for (n = 0; n < npages;) {
		m = vm_page_grab(obj, pindex + n,
		    VM_ALLOC_NOCREAT | VM_ALLOC_WIRED);
		if (m == NULL) {
			m = vm_page_alloc_domain(obj, pindex + n, domain,
			    req_class | VM_ALLOC_WIRED);
		}
		if (m == NULL)
			break;
		ma[n++] = m;
	}
	if (n < npages)
		goto cleanup;
	VM_OBJECT_WUNLOCK(obj);

	return (0);
cleanup:
	for (int i = 0; i < n; i++) {
		m = ma[i];
		(void)vm_page_unwire_noq(m);
		vm_page_free(m);
	}
	VM_OBJECT_WUNLOCK(obj);

	return (ENOMEM);
}
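
/*
 * Default-sized kernel stacks are backed by kstack_object; stacks of any
 * other size use kstack_alt_object.
 */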
vm_object_t
vm_thread_kstack_size_to_obj(int npages)
{
	return (npages == kstack_pages ? kstack_object : kstack_alt_object);
}
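
/*
 * UMA import callback: populate the kstack cache with up to cnt
 * default-sized stacks, allocated according to the requested NUMA domain.
 */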
static int
kstack_import(void *arg, void **store, int cnt, int domain, int flags)
{
	struct domainset *ds;
	int i;

	if (domain == UMA_ANYDOMAIN)
		ds = DOMAINSET_RR();
	else
		ds = DOMAINSET_PREF(domain);

	for (i = 0; i < cnt; i++) {
		store[i] = (void *)vm_thread_stack_create(ds, kstack_pages);
		if (store[i] == NULL)
			break;
	}
	return (i);
}
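
/*
 * UMA release callback: free stacks that are evicted from the kstack cache.
 */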
static void
kstack_release(void *arg, void **store, int cnt)
{
	vm_offset_t ks;
	int i;

	for (i = 0; i < cnt; i++) {
		ks = (vm_offset_t)store[i];
		vm_thread_stack_dispose(ks, kstack_pages);
	}
}
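
/*
 * Boot-time setup: create the kstack objects, the UMA cache of free stacks,
 * and a kstack arena for each memory domain.
 */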
static void
kstack_cache_init(void *null)
{
	vm_size_t kstack_quantum;
	int domain;

	kstack_object = vm_object_allocate(OBJT_SWAP,
	    atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));
	kstack_cache = uma_zcache_create("kstack_cache",
	    kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL,
	    kstack_import, kstack_release, NULL,
	    UMA_ZONE_FIRSTTOUCH);
	kstack_cache_size = imax(128, mp_ncpus * 4);
	uma_zone_set_maxcache(kstack_cache, kstack_cache_size);

	kstack_alt_object = vm_object_allocate(OBJT_SWAP,
	    atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));

	kstack_quantum = vm_thread_kstack_import_quantum();
	/*
	 * Reduce size used by the kstack arena to allow for
	 * alignment adjustments in vm_thread_kstack_arena_import.
	 */
	kstack_quantum -= (kstack_pages + KSTACK_GUARD_PAGES) * PAGE_SIZE;
	/*
	 * Create the kstack_arena for each domain and set kernel_arena as
	 * parent.
	 */
	for (domain = 0; domain < vm_ndomains; domain++) {
		vmd_kstack_arena[domain] = vmem_create("kstack arena", 0, 0,
		    PAGE_SIZE, 0, M_WAITOK);
		KASSERT(vmd_kstack_arena[domain] != NULL,
		    ("%s: failed to create domain %d kstack_arena", __func__,
		    domain));
		vmem_set_import(vmd_kstack_arena[domain],
		    vm_thread_kstack_arena_import,
		    vm_thread_kstack_arena_release,
		    vm_dom[domain].vmd_kernel_arena, kstack_quantum);
	}
}
SYSINIT(vm_kstacks, SI_SUB_KMEM, SI_ORDER_ANY, kstack_cache_init, NULL);

#ifdef KSTACK_USAGE_PROF
/*
 * Track maximum stack used by a thread in kernel.
 */
static int max_kstack_used;
SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
    &max_kstack_used, 0,
    "Maximum stack depth used by a thread in kernel");

void
intr_prof_stack_use(struct thread *td, struct trapframe *frame)
{
	vm_offset_t stack_top;
	vm_offset_t current;
	int used, prev_used;

	/*
	 * Testing for interrupted kernel mode isn't strictly
	 * needed.  It optimizes the execution, since interrupts from
	 * usermode will have only the trap frame on the stack.
	 */
	if (TRAPF_USERMODE(frame))
		return;

	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
	current = (vm_offset_t)(uintptr_t)&stack_top;

	/*
	 * Try to detect if interrupt is using kernel thread stack.
	 * Hardware could use a dedicated stack for interrupt handling.
	 */
	if (stack_top <= current || current < td->td_kstack)
		return;

	used = stack_top - current;
	for (;;) {
		prev_used = max_kstack_used;
		if (prev_used >= used)
			break;
		if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
			break;
	}
}
#endif /* KSTACK_USAGE_PROF */

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
int
vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, int flags)
{
	struct proc *p1 = td->td_proc;
	struct domainset *dset;
	int error;

	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory, if it is shared: this converts
		 * memory shared amongst threads into copy-on-write
		 * memory that is private to this process.
		 */
		if ((flags & RFMEM) == 0) {
			error = vmspace_unshare(p1);
			if (error)
				return (error);
		}
		cpu_fork(td, p2, td2, flags);
		return (0);
	}

	if (flags & RFMEM) {
		p2->p_vmspace = p1->p_vmspace;
		refcount_acquire(&p1->p_vmspace->vm_refcnt);
	}
	dset = td2->td_domain.dr_policy;
	while (vm_page_count_severe_set(&dset->ds_mask)) {
		vm_wait_doms(&dset->ds_mask, 0);
	}

	if ((flags & RFMEM) == 0) {
		p2->p_vmspace = vm2;
		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(td, p2, td2, flags);
	return (0);
}

/*
 * Called after process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(struct proc *p)
{
	vmspace_exitfree(p);	/* and clean-out the vmspace */
}

void
kick_proc0(void)
{
	wakeup(&proc0);
}