vm_phys.c

  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause
  3. *
  4. * Copyright (c) 2002-2006 Rice University
  5. * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
  6. * All rights reserved.
  7. *
  8. * This software was developed for the FreeBSD Project by Alan L. Cox,
  9. * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  27. * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  28. * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  30. * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31. * POSSIBILITY OF SUCH DAMAGE.
  32. */
  33. /*
  34. * Physical memory system implementation
  35. *
  36. * Any external functions defined by this module are only to be used by the
  37. * virtual memory system.
  38. */
  39. #include <sys/cdefs.h>
  40. #include "opt_ddb.h"
  41. #include "opt_vm.h"
  42. #include <sys/param.h>
  43. #include <sys/systm.h>
  44. #include <sys/domainset.h>
  45. #include <sys/lock.h>
  46. #include <sys/kernel.h>
  47. #include <sys/kthread.h>
  48. #include <sys/malloc.h>
  49. #include <sys/mutex.h>
  50. #include <sys/proc.h>
  51. #include <sys/queue.h>
  52. #include <sys/rwlock.h>
  53. #include <sys/sbuf.h>
  54. #include <sys/sched.h>
  55. #include <sys/sysctl.h>
  56. #include <sys/tree.h>
  57. #include <sys/tslog.h>
  58. #include <sys/unistd.h>
  59. #include <sys/vmmeter.h>
  60. #include <ddb/ddb.h>
  61. #include <vm/vm.h>
  62. #include <vm/vm_extern.h>
  63. #include <vm/vm_param.h>
  64. #include <vm/vm_kern.h>
  65. #include <vm/vm_object.h>
  66. #include <vm/vm_page.h>
  67. #include <vm/vm_phys.h>
  68. #include <vm/vm_pagequeue.h>
  69. _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
  70. "Too many physsegs.");
  71. _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
  72. "vm_paddr_t too big for ffsll, flsll.");
  73. #ifdef NUMA
  74. struct mem_affinity __read_mostly *mem_affinity;
  75. int __read_mostly *mem_locality;
  76. static int numa_disabled;
  77. static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  78. "NUMA options");
  79. SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  80. &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
  81. #endif
  82. int __read_mostly vm_ndomains = 1;
  83. domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
  84. struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
  85. int __read_mostly vm_phys_nsegs;
  86. static struct vm_phys_seg vm_phys_early_segs[8];
  87. static int vm_phys_early_nsegs;
  88. struct vm_phys_fictitious_seg;
  89. static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
  90. struct vm_phys_fictitious_seg *);
  91. RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
  92. RB_INITIALIZER(&vm_phys_fictitious_tree);
  93. struct vm_phys_fictitious_seg {
  94. RB_ENTRY(vm_phys_fictitious_seg) node;
  95. /* Memory region data */
  96. vm_paddr_t start;
  97. vm_paddr_t end;
  98. vm_page_t first_page;
  99. };
  100. RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
  101. vm_phys_fictitious_cmp);
  102. static struct rwlock_padalign vm_phys_fictitious_reg_lock;
  103. MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
  104. static struct vm_freelist __aligned(CACHE_LINE_SIZE)
  105. vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
  106. [VM_NFREEORDER_MAX];
  107. static int __read_mostly vm_nfreelists;
  108. /*
  109. * These "avail lists" are globals used to communicate boot-time physical
  110. * memory layout to other parts of the kernel. Each physically contiguous
  111. * region of memory is defined by a start address at an even index and an
  112. * end address at the following odd index. Each list is terminated by a
  113. * pair of zero entries.
  114. *
  115. * dump_avail tells the dump code what regions to include in a crash dump, and
  116. * phys_avail is all of the remaining physical memory that is available for
  117. * the vm system.
  118. *
  119. * Initially dump_avail and phys_avail are identical. Boot time memory
  120. * allocations remove extents from phys_avail that may still be included
  121. * in dumps.
  122. */
  123. vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
  124. vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
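/*
 * Illustrative sketch (not compiled): the start/end pair layout described
 * above can be walked until the terminating pair of zero entries.  The
 * function name "phys_avail_total" is hypothetical and not part of this
 * file.
 */
#if 0
static vm_paddr_t
phys_avail_total(void)
{
	vm_paddr_t total;
	int i;

	total = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		total += phys_avail[i + 1] - phys_avail[i];
	return (total);
}
#endif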
  125. /*
  126. * Provides the mapping from VM_FREELIST_* to free list indices (flind).
  127. */
  128. static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
  129. static int __read_mostly vm_default_freepool;
  130. CTASSERT(VM_FREELIST_DEFAULT == 0);
  131. #ifdef VM_FREELIST_DMA32
  132. #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32)
  133. #endif
  134. /*
  135. * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
  136. * the ordering of the free list boundaries.
  137. */
  138. #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
  139. CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
  140. #endif
  141. static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
  142. SYSCTL_OID(_vm, OID_AUTO, phys_free,
  143. CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  144. sysctl_vm_phys_free, "A",
  145. "Phys Free Info");
  146. static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
  147. SYSCTL_OID(_vm, OID_AUTO, phys_segs,
  148. CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  149. sysctl_vm_phys_segs, "A",
  150. "Phys Seg Info");
  151. #ifdef NUMA
  152. static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
  153. SYSCTL_OID(_vm, OID_AUTO, phys_locality,
  154. CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  155. sysctl_vm_phys_locality, "A",
  156. "Phys Locality Info");
  157. #endif
  158. SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
  159. &vm_ndomains, 0, "Number of physical memory domains available.");
  160. static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
  161. static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
  162. static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
  163. int order, int tail);
  164. static bool __diagused
  165. vm_phys_pool_valid(int pool)
  166. {
  167. #ifdef VM_FREEPOOL_LAZYINIT
  168. if (pool == VM_FREEPOOL_LAZYINIT)
  169. return (false);
  170. #endif
  171. return (pool >= 0 && pool < VM_NFREEPOOL);
  172. }
  173. /*
  174. * Red-black tree helpers for vm fictitious range management.
  175. */
  176. static inline int
  177. vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
  178. struct vm_phys_fictitious_seg *range)
  179. {
  180. KASSERT(range->start != 0 && range->end != 0,
  181. ("Invalid range passed on search for vm_fictitious page"));
  182. if (p->start >= range->end)
  183. return (1);
  184. if (p->start < range->start)
  185. return (-1);
  186. return (0);
  187. }
  188. static int
  189. vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
  190. struct vm_phys_fictitious_seg *p2)
  191. {
  192. /* Check if this is a search for a page */
  193. if (p1->end == 0)
  194. return (vm_phys_fictitious_in_range(p1, p2));
  195. KASSERT(p2->end != 0,
  196. ("Invalid range passed as second parameter to vm fictitious comparison"));
  197. /* Searching to add a new range */
  198. if (p1->end <= p2->start)
  199. return (-1);
  200. if (p1->start >= p2->end)
  201. return (1);
  202. panic("Trying to add overlapping vm fictitious ranges:\n"
  203. "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
  204. (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
  205. }
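/*
 * Sketch of a point lookup against this comparator: a single page is
 * encoded as a degenerate range with end == 0, exactly as
 * vm_phys_fictitious_to_vm_page() does later in this file.  "pa" is
 * assumed to be supplied by the caller.
 */
#if 0
	struct vm_phys_fictitious_seg key, *seg;

	key.start = pa;
	key.end = 0;
	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &key);
	rw_runlock(&vm_phys_fictitious_reg_lock);
#endif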
  206. int
  207. vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
  208. vm_paddr_t high __numa_used)
  209. {
  210. #ifdef NUMA
  211. domainset_t mask;
  212. int i;
  213. if (vm_ndomains == 1 || mem_affinity == NULL)
  214. return (0);
  215. DOMAINSET_ZERO(&mask);
  216. /*
  217. * Check for any memory that overlaps low, high.
  218. */
  219. for (i = 0; mem_affinity[i].end != 0; i++)
  220. if (mem_affinity[i].start <= high &&
  221. mem_affinity[i].end >= low)
  222. DOMAINSET_SET(mem_affinity[i].domain, &mask);
  223. if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
  224. return (prefer);
  225. if (DOMAINSET_EMPTY(&mask))
  226. panic("vm_phys_domain_match: Impossible constraint");
  227. return (DOMAINSET_FFS(&mask) - 1);
  228. #else
  229. return (0);
  230. #endif
  231. }
  232. /*
  233. * Outputs the state of the physical memory allocator, specifically,
  234. * the amount of physical memory in each free list.
  235. */
  236. static int
  237. sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
  238. {
  239. struct sbuf sbuf;
  240. struct vm_freelist *fl;
  241. int dom, error, flind, oind, pind;
  242. error = sysctl_wire_old_buffer(req, 0);
  243. if (error != 0)
  244. return (error);
  245. sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
  246. for (dom = 0; dom < vm_ndomains; dom++) {
  247. sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
  248. for (flind = 0; flind < vm_nfreelists; flind++) {
  249. sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
  250. "\n ORDER (SIZE) | NUMBER"
  251. "\n ", flind);
  252. for (pind = 0; pind < VM_NFREEPOOL; pind++)
  253. sbuf_printf(&sbuf, " | POOL %d", pind);
  254. sbuf_printf(&sbuf, "\n-- ");
  255. for (pind = 0; pind < VM_NFREEPOOL; pind++)
  256. sbuf_printf(&sbuf, "-- -- ");
  257. sbuf_printf(&sbuf, "--\n");
  258. for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
  259. sbuf_printf(&sbuf, " %2d (%6dK)", oind,
  260. 1 << (PAGE_SHIFT - 10 + oind));
  261. for (pind = 0; pind < VM_NFREEPOOL; pind++) {
  262. fl = vm_phys_free_queues[dom][flind][pind];
  263. sbuf_printf(&sbuf, " | %6d",
  264. fl[oind].lcnt);
  265. }
  266. sbuf_printf(&sbuf, "\n");
  267. }
  268. }
  269. }
  270. error = sbuf_finish(&sbuf);
  271. sbuf_delete(&sbuf);
  272. return (error);
  273. }
  274. /*
  275. * Outputs the set of physical memory segments.
  276. */
  277. static int
  278. sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
  279. {
  280. struct sbuf sbuf;
  281. struct vm_phys_seg *seg;
  282. int error, segind;
  283. error = sysctl_wire_old_buffer(req, 0);
  284. if (error != 0)
  285. return (error);
  286. sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  287. for (segind = 0; segind < vm_phys_nsegs; segind++) {
  288. sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
  289. seg = &vm_phys_segs[segind];
  290. sbuf_printf(&sbuf, "start: %#jx\n",
  291. (uintmax_t)seg->start);
  292. sbuf_printf(&sbuf, "end: %#jx\n",
  293. (uintmax_t)seg->end);
  294. sbuf_printf(&sbuf, "domain: %d\n", seg->domain);
  295. sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
  296. }
  297. error = sbuf_finish(&sbuf);
  298. sbuf_delete(&sbuf);
  299. return (error);
  300. }
  301. /*
  302. * Return affinity, or -1 if there's no affinity information.
  303. */
  304. int
  305. vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
  306. {
  307. #ifdef NUMA
  308. if (mem_locality == NULL)
  309. return (-1);
  310. if (f >= vm_ndomains || t >= vm_ndomains)
  311. return (-1);
  312. return (mem_locality[f * vm_ndomains + t]);
  313. #else
  314. return (-1);
  315. #endif
  316. }
  317. #ifdef NUMA
  318. /*
  319. * Outputs the VM locality table.
  320. */
  321. static int
  322. sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
  323. {
  324. struct sbuf sbuf;
  325. int error, i, j;
  326. error = sysctl_wire_old_buffer(req, 0);
  327. if (error != 0)
  328. return (error);
  329. sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  330. sbuf_printf(&sbuf, "\n");
  331. for (i = 0; i < vm_ndomains; i++) {
  332. sbuf_printf(&sbuf, "%d: ", i);
  333. for (j = 0; j < vm_ndomains; j++) {
  334. sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
  335. }
  336. sbuf_printf(&sbuf, "\n");
  337. }
  338. error = sbuf_finish(&sbuf);
  339. sbuf_delete(&sbuf);
  340. return (error);
  341. }
  342. #endif
  343. static void
  344. vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
  345. {
  346. m->order = order;
  347. if (tail)
  348. TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
  349. else
  350. TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
  351. fl[order].lcnt++;
  352. }
  353. static void
  354. vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
  355. {
  356. TAILQ_REMOVE(&fl[order].pl, m, listq);
  357. fl[order].lcnt--;
  358. m->order = VM_NFREEORDER;
  359. }
  360. /*
  361. * Create a physical memory segment.
  362. */
  363. static void
  364. _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
  365. {
  366. struct vm_phys_seg *seg;
  367. KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
  368. ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
  369. KASSERT(domain >= 0 && domain < vm_ndomains,
  370. ("vm_phys_create_seg: invalid domain provided"));
  371. seg = &vm_phys_segs[vm_phys_nsegs++];
  372. while (seg > vm_phys_segs && (seg - 1)->start >= end) {
  373. *seg = *(seg - 1);
  374. seg--;
  375. }
  376. seg->start = start;
  377. seg->end = end;
  378. seg->domain = domain;
  379. }
  380. static void
  381. vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
  382. {
  383. #ifdef NUMA
  384. int i;
  385. if (mem_affinity == NULL) {
  386. _vm_phys_create_seg(start, end, 0);
  387. return;
  388. }
  389. for (i = 0;; i++) {
  390. if (mem_affinity[i].end == 0)
  391. panic("Reached end of affinity info");
  392. if (mem_affinity[i].end <= start)
  393. continue;
  394. if (mem_affinity[i].start > start)
  395. panic("No affinity info for start %jx",
  396. (uintmax_t)start);
  397. if (mem_affinity[i].end >= end) {
  398. _vm_phys_create_seg(start, end,
  399. mem_affinity[i].domain);
  400. break;
  401. }
  402. _vm_phys_create_seg(start, mem_affinity[i].end,
  403. mem_affinity[i].domain);
  404. start = mem_affinity[i].end;
  405. }
  406. #else
  407. _vm_phys_create_seg(start, end, 0);
  408. #endif
  409. }
  410. /*
  411. * Add a physical memory segment.
  412. */
  413. void
  414. vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
  415. {
  416. vm_paddr_t paddr;
  417. KASSERT((start & PAGE_MASK) == 0,
  418. ("vm_phys_define_seg: start is not page aligned"));
  419. KASSERT((end & PAGE_MASK) == 0,
  420. ("vm_phys_define_seg: end is not page aligned"));
  421. /*
  422. * Split the physical memory segment if it spans two or more free
  423. * list boundaries.
  424. */
  425. paddr = start;
  426. #ifdef VM_FREELIST_LOWMEM
  427. if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
  428. vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
  429. paddr = VM_LOWMEM_BOUNDARY;
  430. }
  431. #endif
  432. #ifdef VM_FREELIST_DMA32
  433. if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
  434. vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
  435. paddr = VM_DMA32_BOUNDARY;
  436. }
  437. #endif
  438. vm_phys_create_seg(paddr, end);
  439. }
  440. /*
  441. * Initialize the physical memory allocator.
  442. *
  443. * Requires that vm_page_array is initialized!
  444. */
  445. void
  446. vm_phys_init(void)
  447. {
  448. struct vm_freelist *fl;
  449. struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
  450. #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
  451. u_long npages;
  452. #endif
  453. int dom, flind, freelist, oind, pind, segind;
  454. /*
  455. * Compute the number of free lists, and generate the mapping from the
  456. * manifest constants VM_FREELIST_* to the free list indices.
  457. *
  458. * Initially, the entries of vm_freelist_to_flind[] are set to either
  459. * 0 or 1 to indicate which free lists should be created.
  460. */
  461. #ifdef VM_DMA32_NPAGES_THRESHOLD
  462. npages = 0;
  463. #endif
  464. for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
  465. seg = &vm_phys_segs[segind];
  466. #ifdef VM_FREELIST_LOWMEM
  467. if (seg->end <= VM_LOWMEM_BOUNDARY)
  468. vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
  469. else
  470. #endif
  471. #ifdef VM_FREELIST_DMA32
  472. if (
  473. #ifdef VM_DMA32_NPAGES_THRESHOLD
  474. /*
  475. * Create the DMA32 free list only if the amount of
  476. * physical memory above physical address 4G exceeds the
  477. * given threshold.
  478. */
  479. npages > VM_DMA32_NPAGES_THRESHOLD &&
  480. #endif
  481. seg->end <= VM_DMA32_BOUNDARY)
  482. vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
  483. else
  484. #endif
  485. {
  486. #ifdef VM_DMA32_NPAGES_THRESHOLD
  487. npages += atop(seg->end - seg->start);
  488. #endif
  489. vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
  490. }
  491. }
  492. /* Change each entry into a running total of the free lists. */
  493. for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
  494. vm_freelist_to_flind[freelist] +=
  495. vm_freelist_to_flind[freelist - 1];
  496. }
  497. vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
  498. KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
  499. /* Change each entry into a free list index. */
  500. for (freelist = 0; freelist < VM_NFREELIST; freelist++)
  501. vm_freelist_to_flind[freelist]--;
  502. /*
  503. * Initialize the first_page and free_queues fields of each physical
  504. * memory segment.
  505. */
  506. #ifdef VM_PHYSSEG_SPARSE
  507. npages = 0;
  508. #endif
  509. for (segind = 0; segind < vm_phys_nsegs; segind++) {
  510. seg = &vm_phys_segs[segind];
  511. #ifdef VM_PHYSSEG_SPARSE
  512. seg->first_page = &vm_page_array[npages];
  513. npages += atop(seg->end - seg->start);
  514. #else
  515. seg->first_page = PHYS_TO_VM_PAGE(seg->start);
  516. #endif
  517. #ifdef VM_FREELIST_LOWMEM
  518. if (seg->end <= VM_LOWMEM_BOUNDARY) {
  519. flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
  520. KASSERT(flind >= 0,
  521. ("vm_phys_init: LOWMEM flind < 0"));
  522. } else
  523. #endif
  524. #ifdef VM_FREELIST_DMA32
  525. if (seg->end <= VM_DMA32_BOUNDARY) {
  526. flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
  527. KASSERT(flind >= 0,
  528. ("vm_phys_init: DMA32 flind < 0"));
  529. } else
  530. #endif
  531. {
  532. flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
  533. KASSERT(flind >= 0,
  534. ("vm_phys_init: DEFAULT flind < 0"));
  535. }
  536. seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
  537. }
  538. /*
  539. * Coalesce physical memory segments that are contiguous and share the
  540. * same per-domain free queues.
  541. */
  542. prev_seg = vm_phys_segs;
  543. seg = &vm_phys_segs[1];
  544. end_seg = &vm_phys_segs[vm_phys_nsegs];
  545. while (seg < end_seg) {
  546. if (prev_seg->end == seg->start &&
  547. prev_seg->free_queues == seg->free_queues) {
  548. prev_seg->end = seg->end;
  549. KASSERT(prev_seg->domain == seg->domain,
  550. ("vm_phys_init: free queues cannot span domains"));
  551. vm_phys_nsegs--;
  552. end_seg--;
  553. for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
  554. *tmp_seg = *(tmp_seg + 1);
  555. } else {
  556. prev_seg = seg;
  557. seg++;
  558. }
  559. }
  560. /*
  561. * Initialize the free queues.
  562. */
  563. for (dom = 0; dom < vm_ndomains; dom++) {
  564. for (flind = 0; flind < vm_nfreelists; flind++) {
  565. for (pind = 0; pind < VM_NFREEPOOL; pind++) {
  566. fl = vm_phys_free_queues[dom][flind][pind];
  567. for (oind = 0; oind < VM_NFREEORDER; oind++)
  568. TAILQ_INIT(&fl[oind].pl);
  569. }
  570. }
  571. }
  572. #ifdef VM_FREEPOOL_LAZYINIT
  573. vm_default_freepool = VM_FREEPOOL_LAZYINIT;
  574. #else
  575. vm_default_freepool = VM_FREEPOOL_DEFAULT;
  576. #endif
  577. rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
  578. }
  579. /*
  580. * Register info about the NUMA topology of the system.
  581. *
  582. * Invoked by platform-dependent code prior to vm_phys_init().
  583. */
  584. void
  585. vm_phys_register_domains(int ndomains __numa_used,
  586. struct mem_affinity *affinity __numa_used, int *locality __numa_used)
  587. {
  588. #ifdef NUMA
  589. int i;
  590. /*
  591. * For now the only override value that we support is 1, which
  592. * effectively disables NUMA-awareness in the allocators.
  593. */
  594. TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
  595. if (numa_disabled)
  596. ndomains = 1;
  597. if (ndomains > 1) {
  598. vm_ndomains = ndomains;
  599. mem_affinity = affinity;
  600. mem_locality = locality;
  601. }
  602. for (i = 0; i < vm_ndomains; i++)
  603. DOMAINSET_SET(i, &all_domains);
  604. #endif
  605. }
  606. /*
  607. * Split a contiguous, power of two-sized set of physical pages.
  608. *
  609. * When this function is called by a page allocation function, the caller
  610. * should request insertion at the head unless the order [order, oind) queues
  611. * are known to be empty. The objective is to reduce the likelihood of
  612. * long-term fragmentation by promoting contemporaneous allocation and
  613. * (hopefully) deallocation.
  614. */
  615. static __inline void
  616. vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
  617. int tail)
  618. {
  619. vm_page_t m_buddy;
  620. while (oind > order) {
  621. oind--;
  622. m_buddy = &m[1 << oind];
  623. KASSERT(m_buddy->order == VM_NFREEORDER,
  624. ("vm_phys_split_pages: page %p has unexpected order %d",
  625. m_buddy, m_buddy->order));
  626. vm_freelist_add(fl, m_buddy, oind, tail);
  627. }
  628. }
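/*
 * Worked example of vm_phys_split_pages(): splitting an order-3 block
 * (8 pages at m) to satisfy an order-1 request first frees m[4] as an
 * order-2 buddy, then m[2] as an order-1 buddy, leaving [m, m + 2) as
 * the order-1 run for the caller.
 */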
  629. static void
  630. vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
  631. {
  632. KASSERT(order >= 0 && order < VM_NFREEORDER,
  633. ("%s: invalid order %d", __func__, order));
  634. vm_freelist_add(fl, m, order, tail);
  635. #ifdef VM_FREEPOOL_LAZYINIT
  636. if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
  637. vm_page_t m_next;
  638. vm_paddr_t pa;
  639. int npages;
  640. npages = 1 << order;
  641. m_next = m + npages;
  642. pa = m->phys_addr + ptoa(npages);
  643. if (pa < vm_phys_segs[m->segind].end) {
  644. vm_page_init_page(m_next, pa, m->segind,
  645. VM_FREEPOOL_LAZYINIT);
  646. }
  647. }
  648. #endif
  649. }
  650. /*
  651. * Add the physical pages [m, m + npages) at the beginning of a power-of-two
  652. * aligned and sized set to the specified free list.
  653. *
  654. * When this function is called by a page allocation function, the caller
  655. * should request insertion at the head unless the lower-order queues are
  656. * known to be empty. The objective is to reduce the likelihood of long-
  657. * term fragmentation by promoting contemporaneous allocation and (hopefully)
  658. * deallocation.
  659. *
  660. * The physical page m's buddy must not be free.
  661. */
  662. static void
  663. vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
  664. {
  665. int order;
  666. KASSERT(npages == 0 ||
  667. (VM_PAGE_TO_PHYS(m) &
  668. ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
  669. ("%s: page %p and npages %u are misaligned",
  670. __func__, m, npages));
  671. while (npages > 0) {
  672. KASSERT(m->order == VM_NFREEORDER,
  673. ("%s: page %p has unexpected order %d",
  674. __func__, m, m->order));
  675. order = ilog2(npages);
  676. KASSERT(order < VM_NFREEORDER,
  677. ("%s: order %d is out of range", __func__, order));
  678. vm_phys_enq_chunk(fl, m, order, tail);
  679. m += 1 << order;
  680. npages -= 1 << order;
  681. }
  682. }
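/*
 * Worked example of vm_phys_enq_beg(): for npages = 13 at the start of a
 * suitably aligned block, ilog2() selects chunks of 8, 4, and 1 pages,
 * enqueued in decreasing size order.
 */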
  683. /*
  684. * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
  685. * and sized set to the specified free list.
  686. *
  687. * When this function is called by a page allocation function, the caller
  688. * should request insertion at the head unless the lower-order queues are
  689. * known to be empty. The objective is to reduce the likelihood of long-
  690. * term fragmentation by promoting contemporaneous allocation and (hopefully)
  691. * deallocation.
  692. *
  693. * If npages is zero, this function does nothing and ignores the physical page
  694. * parameter m. Otherwise, the physical page m's buddy must not be free.
  695. */
  696. static vm_page_t
  697. vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
  698. {
  699. int order;
  700. KASSERT(npages == 0 ||
  701. ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
  702. ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
  703. ("vm_phys_enq_range: page %p and npages %u are misaligned",
  704. m, npages));
  705. while (npages > 0) {
  706. KASSERT(m->order == VM_NFREEORDER,
  707. ("vm_phys_enq_range: page %p has unexpected order %d",
  708. m, m->order));
  709. order = ffs(npages) - 1;
  710. vm_phys_enq_chunk(fl, m, order, tail);
  711. m += 1 << order;
  712. npages -= 1 << order;
  713. }
  714. return (m);
  715. }
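/*
 * Worked example of vm_phys_enq_range(): for npages = 13 ending at a
 * suitably aligned boundary, ffs() selects chunks of 1, 4, and 8 pages,
 * enqueued in increasing size order; the returned pointer is just past
 * the last page enqueued.
 */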
  716. /*
  717. * Set the pool for a contiguous, power of two-sized set of physical pages.
  718. *
  719. * If the pages currently belong to the lazy init pool, then the corresponding
  720. * page structures must be initialized. In this case it is assumed that the
  721. * first page in the run has already been initialized.
  722. */
  723. static void
  724. vm_phys_set_pool(int pool, vm_page_t m, int order)
  725. {
  726. #ifdef VM_FREEPOOL_LAZYINIT
  727. if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
  728. vm_paddr_t pa;
  729. int segind;
  730. m->pool = pool;
  731. TSENTER();
  732. pa = m->phys_addr + PAGE_SIZE;
  733. segind = m->segind;
  734. for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
  735. m_tmp++, pa += PAGE_SIZE)
  736. vm_page_init_page(m_tmp, pa, segind, pool);
  737. TSEXIT();
  738. } else
  739. #endif
  740. for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
  741. m_tmp->pool = pool;
  742. }
  743. /*
  744. * Tries to allocate the specified number of pages from the specified pool
  745. * within the specified domain. Returns the actual number of allocated pages
  746. * and a pointer to each page through the array ma[].
  747. *
  748. * The returned pages may not be physically contiguous. However, in contrast
  749. * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
  750. * calling this function once to allocate the desired number of pages will
  751. * avoid wasted time in vm_phys_split_pages().
  752. *
  753. * The free page queues for the specified domain must be locked.
  754. */
  755. int
  756. vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
  757. {
  758. struct vm_freelist *alt, *fl;
  759. vm_page_t m;
  760. int avail, end, flind, freelist, i, oind, pind;
  761. KASSERT(domain >= 0 && domain < vm_ndomains,
  762. ("vm_phys_alloc_npages: domain %d is out of range", domain));
  763. KASSERT(vm_phys_pool_valid(pool),
  764. ("vm_phys_alloc_npages: pool %d is out of range", pool));
  765. KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
  766. ("vm_phys_alloc_npages: npages %d is out of range", npages));
  767. vm_domain_free_assert_locked(VM_DOMAIN(domain));
  768. i = 0;
  769. for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
  770. flind = vm_freelist_to_flind[freelist];
  771. if (flind < 0)
  772. continue;
  773. fl = vm_phys_free_queues[domain][flind][pool];
  774. for (oind = 0; oind < VM_NFREEORDER; oind++) {
  775. while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
  776. vm_freelist_rem(fl, m, oind);
  777. avail = i + (1 << oind);
  778. end = imin(npages, avail);
  779. while (i < end)
  780. ma[i++] = m++;
  781. if (i == npages) {
  782. /*
  783. * Return excess pages to fl. Its order
  784. * [0, oind) queues are empty.
  785. */
  786. vm_phys_enq_range(m, avail - i, fl, 1);
  787. return (npages);
  788. }
  789. }
  790. }
  791. for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
  792. for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
  793. pind++) {
  794. alt = vm_phys_free_queues[domain][flind][pind];
  795. while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
  796. NULL) {
  797. vm_freelist_rem(alt, m, oind);
  798. vm_phys_set_pool(pool, m, oind);
  799. avail = i + (1 << oind);
  800. end = imin(npages, avail);
  801. while (i < end)
  802. ma[i++] = m++;
  803. if (i == npages) {
  804. /*
  805. * Return excess pages to fl.
  806. * Its order [0, oind) queues
  807. * are empty.
  808. */
  809. vm_phys_enq_range(m, avail - i,
  810. fl, 1);
  811. return (npages);
  812. }
  813. }
  814. }
  815. }
  816. }
  817. return (i);
  818. }
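/*
 * Illustrative sketch (not compiled) of a caller of vm_phys_alloc_npages();
 * "domain" and the request size are assumed to be supplied by the caller,
 * and the free queue lock must be held across the call.  The return value
 * may be short, so a shortfall has to be handled.
 */
#if 0
	vm_page_t ma[8];
	int got;

	vm_domain_free_lock(VM_DOMAIN(domain));
	got = vm_phys_alloc_npages(domain, VM_FREEPOOL_DEFAULT, 8, ma);
	vm_domain_free_unlock(VM_DOMAIN(domain));
	if (got < 8) {
		/* Partial allocation: free the pages or retry elsewhere. */
	}
#endif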
  819. /*
  820. * Allocate a contiguous, power of two-sized set of physical pages
  821. * from the free lists.
  822. *
  823. * The free page queues must be locked.
  824. */
  825. vm_page_t
  826. vm_phys_alloc_pages(int domain, int pool, int order)
  827. {
  828. vm_page_t m;
  829. int freelist;
  830. for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
  831. m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
  832. if (m != NULL)
  833. return (m);
  834. }
  835. return (NULL);
  836. }
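/*
 * Sketch (not compiled) of pairing an order-2 allocation with its release;
 * "domain" is assumed to be chosen by the caller, and the corresponding
 * free queue lock is assumed to be held around both calls.
 */
#if 0
	vm_page_t m;

	m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 2);
	if (m != NULL)
		vm_phys_free_pages(m, 2);
#endif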
  837. /*
  838. * Allocate a contiguous, power of two-sized set of physical pages from the
  839. * specified free list. The free list must be specified using one of the
  840. * manifest constants VM_FREELIST_*.
  841. *
  842. * The free page queues must be locked.
  843. */
  844. vm_page_t
  845. vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
  846. {
  847. struct vm_freelist *alt, *fl;
  848. vm_page_t m;
  849. int oind, pind, flind;
  850. KASSERT(domain >= 0 && domain < vm_ndomains,
  851. ("vm_phys_alloc_freelist_pages: domain %d is out of range",
  852. domain));
  853. KASSERT(freelist < VM_NFREELIST,
  854. ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
  855. freelist));
  856. KASSERT(vm_phys_pool_valid(pool),
  857. ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
  858. KASSERT(order < VM_NFREEORDER,
  859. ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
  860. flind = vm_freelist_to_flind[freelist];
  861. /* Check if freelist is present */
  862. if (flind < 0)
  863. return (NULL);
  864. vm_domain_free_assert_locked(VM_DOMAIN(domain));
  865. fl = &vm_phys_free_queues[domain][flind][pool][0];
  866. for (oind = order; oind < VM_NFREEORDER; oind++) {
  867. m = TAILQ_FIRST(&fl[oind].pl);
  868. if (m != NULL) {
  869. vm_freelist_rem(fl, m, oind);
  870. /* The order [order, oind) queues are empty. */
  871. vm_phys_split_pages(m, oind, fl, order, 1);
  872. return (m);
  873. }
  874. }
  875. /*
  876. * The given pool was empty. Find the largest
  877. * contiguous, power-of-two-sized set of pages in any
  878. * pool. Transfer these pages to the given pool, and
  879. * use them to satisfy the allocation.
  880. */
  881. for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
  882. for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
  883. alt = &vm_phys_free_queues[domain][flind][pind][0];
  884. m = TAILQ_FIRST(&alt[oind].pl);
  885. if (m != NULL) {
  886. vm_freelist_rem(alt, m, oind);
  887. vm_phys_set_pool(pool, m, oind);
  888. /* The order [order, oind) queues are empty. */
  889. vm_phys_split_pages(m, oind, fl, order, 1);
  890. return (m);
  891. }
  892. }
  893. }
  894. return (NULL);
  895. }
  896. /*
  897. * Find the vm_page corresponding to the given physical address, which must lie
  898. * within the given physical memory segment.
  899. */
  900. vm_page_t
  901. vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
  902. {
  903. KASSERT(pa >= seg->start && pa < seg->end,
  904. ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
  905. return (&seg->first_page[atop(pa - seg->start)]);
  906. }
  907. /*
  908. * Find the vm_page corresponding to the given physical address.
  909. */
  910. vm_page_t
  911. vm_phys_paddr_to_vm_page(vm_paddr_t pa)
  912. {
  913. struct vm_phys_seg *seg;
  914. if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
  915. return (vm_phys_seg_paddr_to_vm_page(seg, pa));
  916. return (NULL);
  917. }
  918. vm_page_t
  919. vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
  920. {
  921. struct vm_phys_fictitious_seg tmp, *seg;
  922. vm_page_t m;
  923. m = NULL;
  924. tmp.start = pa;
  925. tmp.end = 0;
  926. rw_rlock(&vm_phys_fictitious_reg_lock);
  927. seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
  928. rw_runlock(&vm_phys_fictitious_reg_lock);
  929. if (seg == NULL)
  930. return (NULL);
  931. m = &seg->first_page[atop(pa - seg->start)];
  932. KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
  933. return (m);
  934. }
  935. static inline void
  936. vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
  937. long page_count, vm_memattr_t memattr)
  938. {
  939. long i;
  940. bzero(range, page_count * sizeof(*range));
  941. for (i = 0; i < page_count; i++) {
  942. vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
  943. range[i].oflags &= ~VPO_UNMANAGED;
  944. range[i].busy_lock = VPB_UNBUSIED;
  945. }
  946. }
  947. int
  948. vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
  949. vm_memattr_t memattr)
  950. {
  951. struct vm_phys_fictitious_seg *seg;
  952. vm_page_t fp;
  953. long page_count;
  954. #ifdef VM_PHYSSEG_DENSE
  955. long pi, pe;
  956. long dpage_count;
  957. #endif
  958. KASSERT(start < end,
  959. ("Start of segment isn't less than end (start: %jx end: %jx)",
  960. (uintmax_t)start, (uintmax_t)end));
  961. page_count = (end - start) / PAGE_SIZE;
  962. #ifdef VM_PHYSSEG_DENSE
  963. pi = atop(start);
  964. pe = atop(end);
  965. if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
  966. fp = &vm_page_array[pi - first_page];
  967. if ((pe - first_page) > vm_page_array_size) {
  968. /*
  969. * We have a segment that starts inside
  970. * of vm_page_array, but ends outside of it.
  971. *
  972. * Use vm_page_array pages for those that are
  973. * inside of the vm_page_array range, and
  974. * allocate the remaining ones.
  975. */
  976. dpage_count = vm_page_array_size - (pi - first_page);
  977. vm_phys_fictitious_init_range(fp, start, dpage_count,
  978. memattr);
  979. page_count -= dpage_count;
  980. start += ptoa(dpage_count);
  981. goto alloc;
  982. }
  983. /*
  984. * We can allocate the full range from vm_page_array,
  985. * so there's no need to register the range in the tree.
  986. */
  987. vm_phys_fictitious_init_range(fp, start, page_count, memattr);
  988. return (0);
  989. } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
  990. /*
  991. * We have a segment that ends inside of vm_page_array,
  992. * but starts outside of it.
  993. */
  994. fp = &vm_page_array[0];
  995. dpage_count = pe - first_page;
  996. vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
  997. memattr);
  998. end -= ptoa(dpage_count);
  999. page_count -= dpage_count;
  1000. goto alloc;
  1001. } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
  1002. /*
  1003. * Trying to register a fictitious range that expands before
  1004. * and after vm_page_array.
  1005. */
  1006. return (EINVAL);
  1007. } else {
  1008. alloc:
  1009. #endif
  1010. fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
  1011. M_WAITOK);
  1012. #ifdef VM_PHYSSEG_DENSE
  1013. }
  1014. #endif
  1015. vm_phys_fictitious_init_range(fp, start, page_count, memattr);
  1016. seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
  1017. seg->start = start;
  1018. seg->end = end;
  1019. seg->first_page = fp;
  1020. rw_wlock(&vm_phys_fictitious_reg_lock);
  1021. RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
  1022. rw_wunlock(&vm_phys_fictitious_reg_lock);
  1023. return (0);
  1024. }
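/*
 * Sketch (not compiled) of registering and tearing down a fictitious
 * range, e.g. for a device aperture; "start" and "end" are assumed to be
 * page-aligned physical addresses owned by the caller.
 */
#if 0
	vm_page_t m;

	if (vm_phys_fictitious_reg_range(start, end, VM_MEMATTR_DEFAULT) == 0) {
		m = vm_phys_fictitious_to_vm_page(start);
		/* ... use the fictitious pages ... */
		vm_phys_fictitious_unreg_range(start, end);
	}
#endif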
  1025. void
  1026. vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
  1027. {
  1028. struct vm_phys_fictitious_seg *seg, tmp;
  1029. #ifdef VM_PHYSSEG_DENSE
  1030. long pi, pe;
  1031. #endif
  1032. KASSERT(start < end,
  1033. ("Start of segment isn't less than end (start: %jx end: %jx)",
  1034. (uintmax_t)start, (uintmax_t)end));
  1035. #ifdef VM_PHYSSEG_DENSE
  1036. pi = atop(start);
  1037. pe = atop(end);
  1038. if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
  1039. if ((pe - first_page) <= vm_page_array_size) {
  1040. /*
  1041. * This segment was allocated using vm_page_array
  1042. * only, there's nothing to do since those pages
  1043. * were never added to the tree.
  1044. */
  1045. return;
  1046. }
  1047. /*
  1048. * We have a segment that starts inside
  1049. * of vm_page_array, but ends outside of it.
  1050. *
  1051. * Calculate how many pages were added to the
  1052. * tree and free them.
  1053. */
  1054. start = ptoa(first_page + vm_page_array_size);
  1055. } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
  1056. /*
  1057. * We have a segment that ends inside of vm_page_array,
  1058. * but starts outside of it.
  1059. */
  1060. end = ptoa(first_page);
  1061. } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
  1062. /* Since it's not possible to register such a range, panic. */
  1063. panic(
  1064. "Unregistering not registered fictitious range [%#jx:%#jx]",
  1065. (uintmax_t)start, (uintmax_t)end);
  1066. }
  1067. #endif
  1068. tmp.start = start;
  1069. tmp.end = 0;
  1070. rw_wlock(&vm_phys_fictitious_reg_lock);
  1071. seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
  1072. if (seg->start != start || seg->end != end) {
  1073. rw_wunlock(&vm_phys_fictitious_reg_lock);
  1074. panic(
  1075. "Unregistering not registered fictitious range [%#jx:%#jx]",
  1076. (uintmax_t)start, (uintmax_t)end);
  1077. }
  1078. RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
  1079. rw_wunlock(&vm_phys_fictitious_reg_lock);
  1080. free(seg->first_page, M_FICT_PAGES);
  1081. free(seg, M_FICT_PAGES);
  1082. }
  1083. /*
  1084. * Free a contiguous, power of two-sized set of physical pages.
  1085. *
  1086. * The free page queues must be locked.
  1087. */
  1088. void
  1089. vm_phys_free_pages(vm_page_t m, int order)
  1090. {
  1091. struct vm_freelist *fl;
  1092. struct vm_phys_seg *seg;
  1093. vm_paddr_t pa;
  1094. vm_page_t m_buddy;
  1095. KASSERT(m->order == VM_NFREEORDER,
  1096. ("vm_phys_free_pages: page %p has unexpected order %d",
  1097. m, m->order));
  1098. KASSERT(vm_phys_pool_valid(m->pool),
  1099. ("vm_phys_free_pages: page %p has unexpected pool %d",
  1100. m, m->pool));
  1101. KASSERT(order < VM_NFREEORDER,
  1102. ("vm_phys_free_pages: order %d is out of range", order));
  1103. seg = &vm_phys_segs[m->segind];
  1104. vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
  1105. if (order < VM_NFREEORDER - 1) {
  1106. pa = VM_PAGE_TO_PHYS(m);
  1107. do {
  1108. pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
  1109. if (pa < seg->start || pa >= seg->end)
  1110. break;
  1111. m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
  1112. if (m_buddy->order != order)
  1113. break;
  1114. fl = (*seg->free_queues)[m_buddy->pool];
  1115. vm_freelist_rem(fl, m_buddy, order);
  1116. if (m_buddy->pool != m->pool)
  1117. vm_phys_set_pool(m->pool, m_buddy, order);
  1118. order++;
  1119. pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
  1120. m = vm_phys_seg_paddr_to_vm_page(seg, pa);
  1121. } while (order < VM_NFREEORDER - 1);
  1122. }
  1123. fl = (*seg->free_queues)[m->pool];
  1124. vm_freelist_add(fl, m, order, 1);
  1125. }
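/*
 * Worked example of the coalescing loop above: freeing page frame 5 at
 * order 0 first merges with a free order-0 buddy at frame 4, and the
 * resulting order-1 block at frame 4 then merges with a free order-1
 * buddy at frame 6, so an order-2 block at frame 4 is queued (assuming
 * the buddies are free at the matching order and lie within the segment;
 * a buddy in a different pool is converted by vm_phys_set_pool()).
 */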
  1126. #ifdef VM_FREEPOOL_LAZYINIT
  1127. /*
  1128. * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
  1129. * them to the default pool. This is a prerequisite for some rare operations
  1130. * which need to scan the page array and thus depend on all pages being
  1131. * initialized.
  1132. */
  1133. static void
  1134. vm_phys_lazy_init_domain(int domain, bool locked)
  1135. {
  1136. static bool initdone[MAXMEMDOM];
  1137. struct vm_domain *vmd;
  1138. struct vm_freelist *fl;
  1139. vm_page_t m;
  1140. int pind;
  1141. bool unlocked;
  1142. if (__predict_true(atomic_load_bool(&initdone[domain])))
  1143. return;
  1144. vmd = VM_DOMAIN(domain);
  1145. if (locked)
  1146. vm_domain_free_assert_locked(vmd);
  1147. else
  1148. vm_domain_free_lock(vmd);
  1149. if (atomic_load_bool(&initdone[domain]))
  1150. goto out;
  1151. pind = VM_FREEPOOL_LAZYINIT;
  1152. for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
  1153. int flind;
  1154. flind = vm_freelist_to_flind[freelist];
  1155. if (flind < 0)
  1156. continue;
  1157. fl = vm_phys_free_queues[domain][flind][pind];
  1158. for (int oind = 0; oind < VM_NFREEORDER; oind++) {
  1159. if (atomic_load_int(&fl[oind].lcnt) == 0)
  1160. continue;
  1161. while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
  1162. /*
  1163. * Avoid holding the lock across the
  1164. * initialization unless there's a free page
  1165. * shortage.
  1166. */
  1167. vm_freelist_rem(fl, m, oind);
  1168. unlocked = vm_domain_allocate(vmd,
  1169. VM_ALLOC_NORMAL, 1 << oind);
  1170. if (unlocked)
  1171. vm_domain_free_unlock(vmd);
  1172. vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
  1173. if (unlocked) {
  1174. vm_domain_freecnt_inc(vmd, 1 << oind);
  1175. vm_domain_free_lock(vmd);
  1176. }
  1177. vm_phys_free_pages(m, oind);
  1178. }
  1179. }
  1180. }
  1181. atomic_store_bool(&initdone[domain], true);
  1182. out:
  1183. if (!locked)
  1184. vm_domain_free_unlock(vmd);
  1185. }
  1186. static void
  1187. vm_phys_lazy_init(void)
  1188. {
  1189. for (int domain = 0; domain < vm_ndomains; domain++)
  1190. vm_phys_lazy_init_domain(domain, false);
  1191. atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
  1192. }
  1193. static void
  1194. vm_phys_lazy_init_kthr(void *arg __unused)
  1195. {
  1196. vm_phys_lazy_init();
  1197. kthread_exit();
  1198. }
  1199. static void
  1200. vm_phys_lazy_sysinit(void *arg __unused)
  1201. {
  1202. struct thread *td;
  1203. int error;
  1204. error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
  1205. RFSTOPPED, 0, "vmlazyinit");
  1206. if (error == 0) {
  1207. thread_lock(td);
  1208. sched_prio(td, PRI_MIN_IDLE);
  1209. sched_add(td, SRQ_BORING);
  1210. } else {
  1211. printf("%s: could not create lazy init thread: %d\n",
  1212. __func__, error);
  1213. vm_phys_lazy_init();
  1214. }
  1215. }
  1216. SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
  1217. NULL);
  1218. #endif /* VM_FREEPOOL_LAZYINIT */
  1219. /*
  1220. * Free a contiguous, arbitrarily sized set of physical pages, without
  1221. * merging across set boundaries.
  1222. *
  1223. * The free page queues must be locked.
  1224. */
  1225. void
  1226. vm_phys_enqueue_contig(vm_page_t m, u_long npages)
  1227. {
  1228. struct vm_freelist *fl;
  1229. struct vm_phys_seg *seg;
  1230. vm_page_t m_end;
  1231. vm_paddr_t diff, lo;
  1232. int order;
  1233. /*
  1234. * Avoid unnecessary coalescing by freeing the pages in the largest
  1235. * possible power-of-two-sized subsets.
  1236. */
  1237. vm_domain_free_assert_locked(vm_pagequeue_domain(m));
  1238. seg = &vm_phys_segs[m->segind];
  1239. fl = (*seg->free_queues)[m->pool];
  1240. m_end = m + npages;
  1241. /* Free blocks of increasing size. */
  1242. lo = atop(VM_PAGE_TO_PHYS(m));
  1243. if (m < m_end &&
  1244. (diff = lo ^ (lo + npages - 1)) != 0) {
  1245. order = min(ilog2(diff), VM_NFREEORDER - 1);
  1246. m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
  1247. }
  1248. /* Free blocks of maximum size. */
  1249. order = VM_NFREEORDER - 1;
  1250. while (m + (1 << order) <= m_end) {
  1251. KASSERT(seg == &vm_phys_segs[m->segind],
  1252. ("%s: page range [%p,%p) spans multiple segments",
  1253. __func__, m_end - npages, m));
  1254. vm_phys_enq_chunk(fl, m, order, 1);
  1255. m += 1 << order;
  1256. }
  1257. /* Free blocks of diminishing size. */
  1258. vm_phys_enq_beg(m, m_end - m, fl, 1);
  1259. }
  1260. /*
  1261. * Free a contiguous, arbitrarily sized set of physical pages.
  1262. *
  1263. * The free page queues must be locked.
  1264. */
  1265. void
  1266. vm_phys_free_contig(vm_page_t m, u_long npages)
  1267. {
  1268. vm_paddr_t lo;
  1269. vm_page_t m_start, m_end;
  1270. unsigned max_order, order_start, order_end;
  1271. vm_domain_free_assert_locked(vm_pagequeue_domain(m));
  1272. lo = atop(VM_PAGE_TO_PHYS(m));
  1273. max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
  1274. m_start = m;
  1275. order_start = ffsll(lo) - 1;
  1276. if (order_start < max_order)
  1277. m_start += 1 << order_start;
  1278. m_end = m + npages;
  1279. order_end = ffsll(lo + npages) - 1;
  1280. if (order_end < max_order)
  1281. m_end -= 1 << order_end;
  1282. /*
  1283. * Avoid unnecessary coalescing by freeing the pages at the start and
  1284. * end of the range last.
  1285. */
  1286. if (m_start < m_end)
  1287. vm_phys_enqueue_contig(m_start, m_end - m_start);
  1288. if (order_start < max_order)
  1289. vm_phys_free_pages(m, order_start);
  1290. if (order_end < max_order)
  1291. vm_phys_free_pages(m_end, order_end);
  1292. }
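/*
 * Worked example of vm_phys_free_contig(): freeing 13 pages starting at
 * page frame 4 gives max_order = ilog2(4 ^ 17) = 4, so the head (frames
 * 4-7, order 2) and the tail (frame 16, order 0) are carved off, the
 * middle frames [8, 16) pass through vm_phys_enqueue_contig(), and the
 * head and tail are freed last via vm_phys_free_pages() so that they may
 * coalesce outward.
 */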
  1293. /*
  1294. * Identify the first address range within segment segind or greater
  1295. * that matches the domain, lies within the low/high range, and has
  1296. * enough pages. Return -1 if there is none.
  1297. */
  1298. int
  1299. vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
  1300. u_long npages, vm_paddr_t low, vm_paddr_t high)
  1301. {
  1302. vm_paddr_t pa_end, pa_start;
  1303. struct vm_phys_seg *end_seg, *seg;
  1304. KASSERT(npages > 0, ("npages is zero"));
  1305. KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
  1306. end_seg = &vm_phys_segs[vm_phys_nsegs];
  1307. for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
  1308. if (seg->domain != domain)
  1309. continue;
  1310. if (seg->start >= high)
  1311. return (-1);
  1312. pa_start = MAX(low, seg->start);
  1313. pa_end = MIN(high, seg->end);
  1314. if (pa_end - pa_start < ptoa(npages))
  1315. continue;
  1316. #ifdef VM_FREEPOOL_LAZYINIT
  1317. /*
  1318. * The pages on the free lists must be initialized.
  1319. */
  1320. vm_phys_lazy_init_domain(domain, false);
  1321. #endif
  1322. bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
  1323. bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
  1324. return (seg - vm_phys_segs);
  1325. }
  1326. return (-1);
  1327. }
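/*
 * Sketch (not compiled) of scanning every matching segment in a domain
 * with vm_phys_find_range(); "domain", "npages", "low" and "high" are
 * assumed to be supplied by the caller.
 */
#if 0
	vm_page_t bounds[2];
	int segind;

	for (segind = 0; (segind = vm_phys_find_range(bounds, segind, domain,
	    npages, low, high)) != -1; segind++) {
		/* Examine the candidate run [bounds[0], bounds[1]). */
	}
#endif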
  1328. /*
  1329. * Search for the given physical page "m" in the free lists. If the search
  1330. * succeeds, remove "m" from the free lists and return true. Otherwise, return
  1331. * false, indicating that "m" is not in the free lists.
  1332. *
  1333. * The free page queues must be locked.
  1334. */
  1335. bool
  1336. vm_phys_unfree_page(vm_paddr_t pa)
  1337. {
  1338. struct vm_freelist *fl;
  1339. struct vm_phys_seg *seg;
  1340. vm_paddr_t pa_half;
  1341. vm_page_t m, m_set, m_tmp;
  1342. int order;
  1343. seg = vm_phys_paddr_to_seg(pa);
  1344. vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
  1345. /*
  1346. * The pages on the free lists must be initialized.
  1347. */
  1348. #ifdef VM_FREEPOOL_LAZYINIT
  1349. vm_phys_lazy_init_domain(seg->domain, true);
  1350. #endif
  1351. /*
  1352. * First, find the contiguous, power of two-sized set of free
  1353. * physical pages containing the given physical page "m" and
  1354. * assign it to "m_set".
  1355. */
  1356. m = vm_phys_paddr_to_vm_page(pa);
  1357. for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
  1358. order < VM_NFREEORDER - 1; ) {
  1359. order++;
  1360. pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
  1361. if (pa >= seg->start)
  1362. m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
  1363. else
  1364. return (false);
  1365. }
  1366. if (m_set->order < order)
  1367. return (false);
  1368. if (m_set->order == VM_NFREEORDER)
  1369. return (false);
  1370. KASSERT(m_set->order < VM_NFREEORDER,
  1371. ("vm_phys_unfree_page: page %p has unexpected order %d",
  1372. m_set, m_set->order));
  1373. /*
  1374. * Next, remove "m_set" from the free lists. Finally, extract
  1375. * "m" from "m_set" using an iterative algorithm: While "m_set"
  1376. * is larger than a page, shrink "m_set" by returning the half
  1377. * of "m_set" that does not contain "m" to the free lists.
  1378. */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		else {
			m_tmp = m_set;
			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
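	/*
	 * Concretely, if a max-order block holds N pages, a request spanning
	 * three such blocks is satisfied starting at m_ret only if m_ret,
	 * m_ret + N, and m_ret + 2 * N all have order == max_order, which is
	 * exactly what the trailing loop over m_iter below verifies.
	 */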
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
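	/*
	 * flsl(npages - 1) below is the ceiling of log2(npages): for
	 * example, npages = 5 gives order 3, the smallest order whose
	 * blocks (8 pages) can hold the request in a single free block.
	 */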
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);

	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be powers of two.
 */
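/*
 * Illustrative call (hypothetical values): with the domain free lock held,
 * a caller could request 512 contiguous pages below 4 GB, aligned to 2 MB,
 * with no boundary restriction:
 *
 *	m = vm_phys_alloc_contig(domain, 512, 0, (vm_paddr_t)1 << 32,
 *	    2 * 1024 * 1024, 0);
 */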
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);

	/* Return page verified to satisfy conditions of request. */
	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}
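
/*
 * The phys_avail[] helpers below operate on an array of page-aligned
 * { start, end } physical address pairs, terminated by an all-zero pair;
 * entry i describes the half-open range [phys_avail[i], phys_avail[i + 1]).
 */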

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);
	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] start %#jx > end %#jx", i,
		    (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}
	return (largest);
}
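
/*
 * Return the size in bytes of the phys_avail entry at index "i".
 */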
vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
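/*
 * For example (illustrative values), splitting the entry { 0x1000, 0x8000 }
 * at pa = 0x4000 shifts the later entries up by one pair and leaves
 * { 0x1000, 0x4000 } followed by { 0x4000, 0x8000 }.
 */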
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);
	return (0);
}

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);
	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}
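
/*
 * Record an early physical memory segment; the segments registered here are
 * handed to vm_phys_add_seg() by vm_phys_early_startup().
 */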
void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
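	/*
	 * The allocation is carved from the end of the chosen entry:
	 * "align" is how far that end extends past an alloc_size boundary,
	 * so trimming it leaves an aligned end, and subtracting alloc_size
	 * then yields a naturally aligned "pa".  (The mask arithmetic
	 * assumes a power-of-two alloc_size.)  For example (illustrative
	 * values), with alloc_size = 0x400000 and an entry ending at
	 * 0x1c500000, align = 0x100000, the end becomes 0x1c400000, and
	 * pa = 0x1c000000.
	 */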
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}
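
/*
 * Page-align the phys_avail entries, register the segments recorded by
 * vm_phys_early_add_seg(), and, on NUMA systems, split phys_avail entries
 * that straddle the domain boundaries described by mem_affinity.
 */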
void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
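/*
 * From the DDB prompt this is invoked as "show freepages"; it prints one
 * table per free list per domain, with a column for each free pool.
 */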
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif