vm_map.c
/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Virtual memory mapping module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/elf.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/file.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

/*
 * Virtual memory maps provide for the mapping, protection,
 * and sharing of virtual memory objects.  In addition,
 * this module provides for an efficient virtual copy of
 * memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple
 * entries; a self-adjusting binary search tree of these
 * entries is used to speed up lookups.
 *
 * Since portions of maps are specified by start/end addresses,
 * which may not align with existing map entries, all
 * routines merely "clip" entries to these start/end values.
 * [That is, an entry is split into two, bordering at a
 * start or end value.]  Note that these clippings may not
 * always be necessary (as the two resulting entries are then
 * not changed); however, the clipping is done for convenience.
 *
 * As mentioned above, virtual copy operations are performed
 * by copying VM object references from one map to
 * another, and then marking both regions as copy-on-write.
 */

static struct mtx map_sleep_mtx;
static uma_zone_t mapentzone;
static uma_zone_t kmapentzone;
static uma_zone_t vmspace_zone;
static int vmspace_zinit(void *mem, int size, int flags);
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
    vm_offset_t max);
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
    vm_map_entry_t gap_entry);
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
#ifdef INVARIANTS
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
    int cow);
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
    vm_offset_t failed_addr);

#define	CONTAINS_BITS(set, bits)	((~(set) & (bits)) == 0)

#define	ENTRY_CHARGED(e)	((e)->cred != NULL || \
	((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
	!((e)->eflags & MAP_ENTRY_NEEDS_COPY)))

/*
 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
 * stable.
 */
#define	PROC_VMSPACE_LOCK(p) do { } while (0)
#define	PROC_VMSPACE_UNLOCK(p) do { } while (0)

/*
 * VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 * Asserts that the starting and ending region
 * addresses fall within the valid range of the map.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end) \
	{ \
		if (start < vm_map_min(map)) \
			start = vm_map_min(map); \
		if (end > vm_map_max(map)) \
			end = vm_map_max(map); \
		if (start > end) \
			start = end; \
	}

#ifndef UMA_USE_DMAP

/*
 * Allocate a new slab for kernel map entries.  The kernel map may be locked or
 * unlocked, depending on whether the request is coming from the kernel map or a
 * submap.  This function allocates a virtual address range directly from the
 * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
 * lock and also to avoid triggering allocator recursion in the vmem boundary
 * tag allocator.
 */
static void *
kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
    int wait)
{
	vm_offset_t addr;
	int error, locked;

	*pflag = UMA_SLAB_PRIV;

	if (!(locked = vm_map_locked(kernel_map)))
		vm_map_lock(kernel_map);
	addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
	if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
		panic("%s: kernel map is exhausted", __func__);
	error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
	    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
	if (error != KERN_SUCCESS)
		panic("%s: vm_map_insert() failed: %d", __func__, error);
	if (!locked)
		vm_map_unlock(kernel_map);
	error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
	    M_USE_RESERVE | (wait & M_ZERO));
	if (error == KERN_SUCCESS) {
		return ((void *)addr);
	} else {
		if (!locked)
			vm_map_lock(kernel_map);
		vm_map_delete(kernel_map, addr, bytes);
		if (!locked)
			vm_map_unlock(kernel_map);
		return (NULL);
	}
}

static void
kmapent_free(void *item, vm_size_t size, uint8_t pflag)
{
	vm_offset_t addr;
	int error __diagused;

	if ((pflag & UMA_SLAB_PRIV) == 0)
		/* XXX leaked */
		return;

	addr = (vm_offset_t)item;
	kmem_unback(kernel_object, addr, size);
	error = vm_map_remove(kernel_map, addr, addr + size);
	KASSERT(error == KERN_SUCCESS,
	    ("%s: vm_map_remove failed: %d", __func__, error));
}

/*
 * The worst-case upper bound on the number of kernel map entries that may be
 * created before the zone must be replenished in _vm_map_unlock().
 */
#define	KMAPENT_RESERVE	1
#endif /* !UMA_USE_DMAP */
/*
 * vm_map_startup:
 *
 * Initialize the vm_map module.  Must be called before any other vm_map
 * routines.
 *
 * User map and entry structures are allocated from the general purpose
 * memory pool.  Kernel maps are statically defined.  Kernel map entries
 * require special handling to avoid recursion; see the comments above
 * kmapent_alloc() and in vm_map_entry_create().
 */
void
vm_map_startup(void)
{
	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);

	/*
	 * Disable the use of per-CPU buckets: map entry allocation is
	 * serialized by the kernel map lock.
	 */
	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
#ifndef UMA_USE_DMAP
	/* Reserve an extra map entry for use when replenishing the reserve. */
	uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
	uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
	uma_zone_set_allocf(kmapentzone, kmapent_alloc);
	uma_zone_set_freef(kmapentzone, kmapent_free);
#endif

	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
	    vmspace_zdtor,
#else
	    NULL,
#endif
	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}

static int
vmspace_zinit(void *mem, int size, int flags)
{
	struct vmspace *vm;
	vm_map_t map;

	vm = (struct vmspace *)mem;
	map = &vm->vm_map;

	memset(map, 0, sizeof(*map));
	mtx_init(&map->system_mtx, "vm map (system)", NULL,
	    MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "vm map (user)");
	PMAP_LOCK_INIT(vmspace_pmap(vm));
	return (0);
}

#ifdef INVARIANTS
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;
	KASSERT(vm->vm_map.nentries == 0,
	    ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
	KASSERT(vm->vm_map.size == 0,
	    ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
}
#endif	/* INVARIANTS */

/*
 * Allocate a vmspace structure, including a vm_map and pmap,
 * and initialize those structures.  The refcnt is set to 1.
 */
struct vmspace *
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
{
	struct vmspace *vm;

	vm = uma_zalloc(vmspace_zone, M_WAITOK);
	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
	if (!pinit(vmspace_pmap(vm))) {
		uma_zfree(vmspace_zone, vm);
		return (NULL);
	}
	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
	refcount_init(&vm->vm_refcnt, 1);
	vm->vm_shm = NULL;
	vm->vm_swrss = 0;
	vm->vm_tsize = 0;
	vm->vm_dsize = 0;
	vm->vm_ssize = 0;
	vm->vm_taddr = 0;
	vm->vm_daddr = 0;
	vm->vm_maxsaddr = 0;
	return (vm);
}

#ifdef RACCT
static void
vmspace_container_reset(struct proc *p)
{
	PROC_LOCK(p);
	racct_set(p, RACCT_DATA, 0);
	racct_set(p, RACCT_STACK, 0);
	racct_set(p, RACCT_RSS, 0);
	racct_set(p, RACCT_MEMLOCK, 0);
	racct_set(p, RACCT_VMEM, 0);
	PROC_UNLOCK(p);
}
#endif

static inline void
vmspace_dofree(struct vmspace *vm)
{
	CTR1(KTR_VM, "vmspace_free: %p", vm);

	/*
	 * Make sure any SysV shm is freed, it might not have been in
	 * exit1().
	 */
	shmexit(vm);

	/*
	 * Lock the map, to wait out all other references to it.
	 * Delete all of the mappings and pages they hold, then call
	 * the pmap module to reclaim anything left.
	 */
	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
	    vm_map_max(&vm->vm_map));

	pmap_release(vmspace_pmap(vm));
	vm->vm_map.pmap = NULL;
	uma_zfree(vmspace_zone, vm);
}

void
vmspace_free(struct vmspace *vm)
{
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "vmspace_free() called");

	if (refcount_release(&vm->vm_refcnt))
		vmspace_dofree(vm);
}

void
vmspace_exitfree(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	p->p_vmspace = NULL;
	PROC_VMSPACE_UNLOCK(p);
	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
	vmspace_free(vm);
}

void
vmspace_exit(struct thread *td)
{
	struct vmspace *vm;
	struct proc *p;
	bool released;

	p = td->td_proc;
	vm = p->p_vmspace;

	/*
	 * Prepare to release the vmspace reference.  The thread that releases
	 * the last reference is responsible for tearing down the vmspace.
	 * However, threads not releasing the final reference must switch to
	 * the kernel's vmspace0 before the decrement so that the subsequent
	 * pmap deactivation does not modify a freed vmspace.
	 */
	refcount_acquire(&vmspace0.vm_refcnt);
	if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
		if (p->p_vmspace != &vmspace0) {
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = &vmspace0;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		released = refcount_release(&vm->vm_refcnt);
	}
	if (released) {
		/*
		 * pmap_remove_pages() expects the pmap to be active, so switch
		 * back first if necessary.
		 */
		if (p->p_vmspace != vm) {
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = vm;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		pmap_remove_pages(vmspace_pmap(vm));
		PROC_VMSPACE_LOCK(p);
		p->p_vmspace = &vmspace0;
		PROC_VMSPACE_UNLOCK(p);
		pmap_activate(td);
		vmspace_dofree(vm);
	}
#ifdef RACCT
	if (racct_enable)
		vmspace_container_reset(p);
#endif
}

/* Acquire reference to vmspace owned by another process. */
struct vmspace *
vmspace_acquire_ref(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
		PROC_VMSPACE_UNLOCK(p);
		return (NULL);
	}
	if (vm != p->p_vmspace) {
		PROC_VMSPACE_UNLOCK(p);
		vmspace_free(vm);
		return (NULL);
	}
	PROC_VMSPACE_UNLOCK(p);
	return (vm);
}
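
/*
 * Illustrative sketch (not part of the upstream file): the usual pattern for
 * a consumer that wants to examine another process's address space via the
 * interfaces above.  The helper name example_resident_count() is hypothetical
 * and the block is compiled out with #if 0.
 */
#if 0
static long
example_resident_count(struct proc *p)
{
	struct vmspace *vm;
	long count;

	/* Take a reference; NULL means the process has no usable vmspace. */
	vm = vmspace_acquire_ref(p);
	if (vm == NULL)
		return (-1);
	count = vmspace_resident_count(vm);
	/* Drop the reference taken above; this may free the vmspace. */
	vmspace_free(vm);
	return (count);
}
#endif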
/*
 * Switch between vmspaces in an AIO kernel process.
 *
 * The new vmspace is either the vmspace of a user process obtained
 * from an active AIO request or the initial vmspace of the AIO kernel
 * process (when it is idling).  Because user processes will block to
 * drain any active AIO requests before proceeding in exit() or
 * execve(), the reference count for vmspaces from AIO requests can
 * never be 0.  Similarly, AIO kernel processes hold an extra
 * reference on their initial vmspace for the life of the process.  As
 * a result, the 'newvm' vmspace always has a non-zero reference
 * count.  This permits an additional reference on 'newvm' to be
 * acquired via a simple atomic increment rather than the loop in
 * vmspace_acquire_ref() above.
 */
void
vmspace_switch_aio(struct vmspace *newvm)
{
	struct vmspace *oldvm;

	/* XXX: Need some way to assert that this is an aio daemon. */

	KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
	    ("vmspace_switch_aio: newvm unreferenced"));

	oldvm = curproc->p_vmspace;
	if (oldvm == newvm)
		return;

	/*
	 * Point to the new address space and refer to it.
	 */
	curproc->p_vmspace = newvm;
	refcount_acquire(&newvm->vm_refcnt);

	/* Activate the new mapping. */
	pmap_activate(curthread);

	vmspace_free(oldvm);
}

void
_vm_map_lock(vm_map_t map, const char *file, int line)
{
	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xlock_(&map->lock, file, line);
	map->timestamp++;
}

void
vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
{
	vm_object_t object;
	struct vnode *vp;
	bool vp_held;

	if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
		return;
	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
	    ("Submap with execs"));
	object = entry->object.vm_object;
	KASSERT(object != NULL, ("No object for text, entry %p", entry));
	if ((object->flags & OBJ_ANON) != 0)
		object = object->handle;
	else
		KASSERT(object->backing_object == NULL,
		    ("non-anon object %p shadows", object));
	KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
	    entry, entry->object.vm_object));

	/*
	 * Mostly, we do not lock the backing object.  It is
	 * referenced by the entry we are processing, so it cannot go
	 * away.
	 */
	vm_pager_getvp(object, &vp, &vp_held);
	if (vp != NULL) {
		if (add) {
			VOP_SET_TEXT_CHECKED(vp);
		} else {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			VOP_UNSET_TEXT_CHECKED(vp);
			VOP_UNLOCK(vp);
		}
		if (vp_held)
			vdrop(vp);
	}
}
/*
 * Use a different name for this vm_map_entry field when its use
 * is not consistent with its use as part of an ordered search tree.
 */
#define defer_next right

static void
vm_map_process_deferred(void)
{
	struct thread *td;
	vm_map_entry_t entry, next;
	vm_object_t object;

	td = curthread;
	entry = td->td_map_def_user;
	td->td_map_def_user = NULL;
	while (entry != NULL) {
		next = entry->defer_next;
		MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC));
		if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
			/*
			 * Decrement the object's writemappings and
			 * possibly the vnode's v_writecount.
			 */
			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
			    ("Submap with writecount"));
			object = entry->object.vm_object;
			KASSERT(object != NULL, ("No object for writecount"));
			vm_pager_release_writecount(object, entry->start,
			    entry->end);
		}
		vm_map_entry_set_vnode_text(entry, false);
		vm_map_entry_deallocate(entry, FALSE);
		entry = next;
	}
}

#ifdef INVARIANTS
static void
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
{
	if (map->system_map)
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	else
		sx_assert_(&map->lock, SA_XLOCKED, file, line);
}

#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)

enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
#ifdef DIAGNOSTIC
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
#else
static int enable_vmmap_check = VMMAP_CHECK_NONE;
#endif
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
    &enable_vmmap_check, 0, "Enable vm map consistency checking");

static void _vm_map_assert_consistent(vm_map_t map, int check);

#define	VM_MAP_ASSERT_CONSISTENT(map) \
    _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
#ifdef DIAGNOSTIC
#define	VM_MAP_UNLOCK_CONSISTENT(map) do { \
	if (map->nupdates > map->nentries) { \
		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
		map->nupdates = 0; \
	} \
} while (0)
#else
#define	VM_MAP_UNLOCK_CONSISTENT(map)
#endif
#else
#define	VM_MAP_ASSERT_LOCKED(map)
#define	VM_MAP_ASSERT_CONSISTENT(map)
#define	VM_MAP_UNLOCK_CONSISTENT(map)
#endif /* INVARIANTS */

void
_vm_map_unlock(vm_map_t map, const char *file, int line)
{
	VM_MAP_UNLOCK_CONSISTENT(map);
	if (map->system_map) {
#ifndef UMA_USE_DMAP
		if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
			uma_prealloc(kmapentzone, 1);
			map->flags &= ~MAP_REPLENISH;
		}
#endif
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	} else {
		sx_xunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

void
_vm_map_lock_read(vm_map_t map, const char *file, int line)
{
	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_slock_(&map->lock, file, line);
}

void
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
{
	if (map->system_map) {
		KASSERT((map->flags & MAP_REPLENISH) == 0,
		    ("%s: MAP_REPLENISH leaked", __func__));
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	} else {
		sx_sunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

int
_vm_map_trylock(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_xlock_(&map->lock, file, line);
	if (error == 0)
		map->timestamp++;
	return (error == 0);
}

int
_vm_map_trylock_read(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_slock_(&map->lock, file, line);
	return (error == 0);
}

/*
 * _vm_map_lock_upgrade:	[ internal use only ]
 *
 * Tries to upgrade a read (shared) lock on the specified map to a write
 * (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
 * non-zero value if the upgrade fails.  If the upgrade fails, the map is
 * returned without a read or write lock held.
 *
 * Requires that the map be read locked.
 */
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
	unsigned int last_timestamp;

	if (map->system_map) {
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		if (!sx_try_upgrade_(&map->lock, file, line)) {
			last_timestamp = map->timestamp;
			sx_sunlock_(&map->lock, file, line);
			vm_map_process_deferred();
			/*
			 * If the map's timestamp does not change while the
			 * map is unlocked, then the upgrade succeeds.
			 */
			sx_xlock_(&map->lock, file, line);
			if (last_timestamp != map->timestamp) {
				sx_xunlock_(&map->lock, file, line);
				return (1);
			}
		}
	}
	map->timestamp++;
	return (0);
}

void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{
	if (map->system_map) {
		KASSERT((map->flags & MAP_REPLENISH) == 0,
		    ("%s: MAP_REPLENISH leaked", __func__));
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		VM_MAP_UNLOCK_CONSISTENT(map);
		sx_downgrade_(&map->lock, file, line);
	}
}

/*
 * vm_map_locked:
 *
 * Returns a non-zero value if the caller holds a write (exclusive) lock
 * on the specified map and the value "0" otherwise.
 */
int
vm_map_locked(vm_map_t map)
{
	if (map->system_map)
		return (mtx_owned(&map->system_mtx));
	else
		return (sx_xlocked(&map->lock));
}
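
/*
 * Illustrative sketch (not part of the upstream file): how a reader typically
 * brackets an entry lookup with the shared map lock provided by the
 * primitives above.  example_lookup_prot() is a hypothetical helper and the
 * block is compiled out with #if 0.
 */
#if 0
static bool
example_lookup_prot(vm_map_t map, vm_offset_t addr, vm_prot_t *prot)
{
	vm_map_entry_t entry;
	bool found;

	vm_map_lock_read(map);
	found = vm_map_lookup_entry(map, addr, &entry);
	if (found)
		*prot = entry->protection;
	/* Dropping the shared lock may run deferred entry deallocations. */
	vm_map_unlock_read(map);
	return (found);
}
#endif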
  707. /*
  708. * _vm_map_unlock_and_wait:
  709. *
  710. * Atomically releases the lock on the specified map and puts the calling
  711. * thread to sleep. The calling thread will remain asleep until either
  712. * vm_map_wakeup() is performed on the map or the specified timeout is
  713. * exceeded.
  714. *
  715. * WARNING! This function does not perform deferred deallocations of
  716. * objects and map entries. Therefore, the calling thread is expected to
  717. * reacquire the map lock after reawakening and later perform an ordinary
  718. * unlock operation, such as vm_map_unlock(), before completing its
  719. * operation on the map.
  720. */
  721. int
  722. _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
  723. {
  724. VM_MAP_UNLOCK_CONSISTENT(map);
  725. mtx_lock(&map_sleep_mtx);
  726. if (map->system_map) {
  727. KASSERT((map->flags & MAP_REPLENISH) == 0,
  728. ("%s: MAP_REPLENISH leaked", __func__));
  729. mtx_unlock_flags_(&map->system_mtx, 0, file, line);
  730. } else {
  731. sx_xunlock_(&map->lock, file, line);
  732. }
  733. return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
  734. timo));
  735. }
  736. /*
  737. * vm_map_wakeup:
  738. *
  739. * Awaken any threads that have slept on the map using
  740. * vm_map_unlock_and_wait().
  741. */
  742. void
  743. vm_map_wakeup(vm_map_t map)
  744. {
  745. /*
  746. * Acquire and release map_sleep_mtx to prevent a wakeup()
  747. * from being performed (and lost) between the map unlock
  748. * and the msleep() in _vm_map_unlock_and_wait().
  749. */
  750. mtx_lock(&map_sleep_mtx);
  751. mtx_unlock(&map_sleep_mtx);
  752. wakeup(&map->root);
  753. }
  754. void
  755. vm_map_busy(vm_map_t map)
  756. {
  757. VM_MAP_ASSERT_LOCKED(map);
  758. map->busy++;
  759. }
  760. void
  761. vm_map_unbusy(vm_map_t map)
  762. {
  763. VM_MAP_ASSERT_LOCKED(map);
  764. KASSERT(map->busy, ("vm_map_unbusy: not busy"));
  765. if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
  766. vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
  767. wakeup(&map->busy);
  768. }
  769. }
  770. void
  771. vm_map_wait_busy(vm_map_t map)
  772. {
  773. VM_MAP_ASSERT_LOCKED(map);
  774. while (map->busy) {
  775. vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
  776. if (map->system_map)
  777. msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
  778. else
  779. sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
  780. }
  781. map->timestamp++;
  782. }
  783. long
  784. vmspace_resident_count(struct vmspace *vmspace)
  785. {
  786. return pmap_resident_count(vmspace_pmap(vmspace));
  787. }
  788. /*
  789. * Initialize an existing vm_map structure
  790. * such as that in the vmspace structure.
  791. */
  792. static void
  793. _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
  794. {
  795. map->header.eflags = MAP_ENTRY_HEADER;
  796. map->needs_wakeup = FALSE;
  797. map->system_map = 0;
  798. map->pmap = pmap;
  799. map->header.end = min;
  800. map->header.start = max;
  801. map->flags = 0;
  802. map->header.left = map->header.right = &map->header;
  803. map->root = NULL;
  804. map->timestamp = 0;
  805. map->busy = 0;
  806. map->anon_loc = 0;
  807. #ifdef DIAGNOSTIC
  808. map->nupdates = 0;
  809. #endif
  810. }
  811. void
  812. vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
  813. {
  814. _vm_map_init(map, pmap, min, max);
  815. mtx_init(&map->system_mtx, "vm map (system)", NULL,
  816. MTX_DEF | MTX_DUPOK);
  817. sx_init(&map->lock, "vm map (user)");
  818. }
  819. /*
  820. * vm_map_entry_dispose: [ internal use only ]
  821. *
  822. * Inverse of vm_map_entry_create.
  823. */
  824. static void
  825. vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
  826. {
  827. uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
  828. }
  829. /*
  830. * vm_map_entry_create: [ internal use only ]
  831. *
  832. * Allocates a VM map entry for insertion.
  833. * No entry fields are filled in.
  834. */
  835. static vm_map_entry_t
  836. vm_map_entry_create(vm_map_t map)
  837. {
  838. vm_map_entry_t new_entry;
  839. #ifndef UMA_USE_DMAP
  840. if (map == kernel_map) {
  841. VM_MAP_ASSERT_LOCKED(map);
  842. /*
  843. * A new slab of kernel map entries cannot be allocated at this
  844. * point because the kernel map has not yet been updated to
  845. * reflect the caller's request. Therefore, we allocate a new
  846. * map entry, dipping into the reserve if necessary, and set a
  847. * flag indicating that the reserve must be replenished before
  848. * the map is unlocked.
  849. */
  850. new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
  851. if (new_entry == NULL) {
  852. new_entry = uma_zalloc(kmapentzone,
  853. M_NOWAIT | M_NOVM | M_USE_RESERVE);
  854. kernel_map->flags |= MAP_REPLENISH;
  855. }
  856. } else
  857. #endif
  858. if (map->system_map) {
  859. new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
  860. } else {
  861. new_entry = uma_zalloc(mapentzone, M_WAITOK);
  862. }
  863. KASSERT(new_entry != NULL,
  864. ("vm_map_entry_create: kernel resources exhausted"));
  865. return (new_entry);
  866. }
  867. /*
  868. * vm_map_entry_set_behavior:
  869. *
  870. * Set the expected access behavior, either normal, random, or
  871. * sequential.
  872. */
  873. static inline void
  874. vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
  875. {
  876. entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
  877. (behavior & MAP_ENTRY_BEHAV_MASK);
  878. }
  879. /*
  880. * vm_map_entry_max_free_{left,right}:
  881. *
  882. * Compute the size of the largest free gap between two entries,
  883. * one the root of a tree and the other the ancestor of that root
  884. * that is the least or greatest ancestor found on the search path.
  885. */
  886. static inline vm_size_t
  887. vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
  888. {
  889. return (root->left != left_ancestor ?
  890. root->left->max_free : root->start - left_ancestor->end);
  891. }
  892. static inline vm_size_t
  893. vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
  894. {
  895. return (root->right != right_ancestor ?
  896. root->right->max_free : right_ancestor->start - root->end);
  897. }
  898. /*
  899. * vm_map_entry_{pred,succ}:
  900. *
  901. * Find the {predecessor, successor} of the entry by taking one step
  902. * in the appropriate direction and backtracking as much as necessary.
  903. * vm_map_entry_succ is defined in vm_map.h.
  904. */
  905. static inline vm_map_entry_t
  906. vm_map_entry_pred(vm_map_entry_t entry)
  907. {
  908. vm_map_entry_t prior;
  909. prior = entry->left;
  910. if (prior->right->start < entry->start) {
  911. do
  912. prior = prior->right;
  913. while (prior->right != entry);
  914. }
  915. return (prior);
  916. }
  917. static inline vm_size_t
  918. vm_size_max(vm_size_t a, vm_size_t b)
  919. {
  920. return (a > b ? a : b);
  921. }
  922. #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
  923. vm_map_entry_t z; \
  924. vm_size_t max_free; \
  925. \
  926. /* \
  927. * Infer root->right->max_free == root->max_free when \
  928. * y->max_free < root->max_free || root->max_free == 0. \
  929. * Otherwise, look right to find it. \
  930. */ \
  931. y = root->left; \
  932. max_free = root->max_free; \
  933. KASSERT(max_free == vm_size_max( \
  934. vm_map_entry_max_free_left(root, llist), \
  935. vm_map_entry_max_free_right(root, rlist)), \
  936. ("%s: max_free invariant fails", __func__)); \
  937. if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
  938. max_free = vm_map_entry_max_free_right(root, rlist); \
  939. if (y != llist && (test)) { \
  940. /* Rotate right and make y root. */ \
  941. z = y->right; \
  942. if (z != root) { \
  943. root->left = z; \
  944. y->right = root; \
  945. if (max_free < y->max_free) \
  946. root->max_free = max_free = \
  947. vm_size_max(max_free, z->max_free); \
  948. } else if (max_free < y->max_free) \
  949. root->max_free = max_free = \
  950. vm_size_max(max_free, root->start - y->end);\
  951. root = y; \
  952. y = root->left; \
  953. } \
  954. /* Copy right->max_free. Put root on rlist. */ \
  955. root->max_free = max_free; \
  956. KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
  957. ("%s: max_free not copied from right", __func__)); \
  958. root->left = rlist; \
  959. rlist = root; \
  960. root = y != llist ? y : NULL; \
  961. } while (0)
  962. #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
  963. vm_map_entry_t z; \
  964. vm_size_t max_free; \
  965. \
  966. /* \
  967. * Infer root->left->max_free == root->max_free when \
  968. * y->max_free < root->max_free || root->max_free == 0. \
  969. * Otherwise, look left to find it. \
  970. */ \
  971. y = root->right; \
  972. max_free = root->max_free; \
  973. KASSERT(max_free == vm_size_max( \
  974. vm_map_entry_max_free_left(root, llist), \
  975. vm_map_entry_max_free_right(root, rlist)), \
  976. ("%s: max_free invariant fails", __func__)); \
  977. if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
  978. max_free = vm_map_entry_max_free_left(root, llist); \
  979. if (y != rlist && (test)) { \
  980. /* Rotate left and make y root. */ \
  981. z = y->left; \
  982. if (z != root) { \
  983. root->right = z; \
  984. y->left = root; \
  985. if (max_free < y->max_free) \
  986. root->max_free = max_free = \
  987. vm_size_max(max_free, z->max_free); \
  988. } else if (max_free < y->max_free) \
  989. root->max_free = max_free = \
  990. vm_size_max(max_free, y->start - root->end);\
  991. root = y; \
  992. y = root->right; \
  993. } \
  994. /* Copy left->max_free. Put root on llist. */ \
  995. root->max_free = max_free; \
  996. KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
  997. ("%s: max_free not copied from left", __func__)); \
  998. root->right = llist; \
  999. llist = root; \
  1000. root = y != rlist ? y : NULL; \
  1001. } while (0)
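/*
 * Added commentary (not from the original source): in both macros the
 * test "max_free - 1 < ..." relies on unsigned wraparound and is
 * equivalent to "max_free == 0 || spine-side value >= max_free".  When
 * the test is false, the maximum free gap recorded in root->max_free
 * must come from the side that root keeps (the right side in
 * SPLAY_LEFT_STEP, the left side in SPLAY_RIGHT_STEP), so max_free
 * already holds the value to copy and no lookup is needed; otherwise
 * that side's value is fetched explicitly before root is pushed onto
 * rlist (or llist).
 */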
  1002. /*
  1003. * Walk down the tree until we find addr or a gap where addr would go, breaking
  1004. * off left and right subtrees of nodes less than, or greater than addr. Treat
  1005. * subtrees with root->max_free < length as empty trees. llist and rlist are
  1006. * the two sides in reverse order (bottom-up), with llist linked by the right
  1007. * pointer and rlist linked by the left pointer in the vm_map_entry, and both
  1008. * lists terminated by &map->header. This function, and the subsequent call to
  1009. * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
  1010. * values in &map->header.
  1011. */
  1012. static __always_inline vm_map_entry_t
  1013. vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
  1014. vm_map_entry_t *llist, vm_map_entry_t *rlist)
  1015. {
  1016. vm_map_entry_t left, right, root, y;
  1017. left = right = &map->header;
  1018. root = map->root;
  1019. while (root != NULL && root->max_free >= length) {
  1020. KASSERT(left->end <= root->start &&
  1021. root->end <= right->start,
  1022. ("%s: root not within tree bounds", __func__));
  1023. if (addr < root->start) {
  1024. SPLAY_LEFT_STEP(root, y, left, right,
  1025. y->max_free >= length && addr < y->start);
  1026. } else if (addr >= root->end) {
  1027. SPLAY_RIGHT_STEP(root, y, left, right,
  1028. y->max_free >= length && addr >= y->end);
  1029. } else
  1030. break;
  1031. }
  1032. *llist = left;
  1033. *rlist = right;
  1034. return (root);
  1035. }
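/*
 * Illustrative sketch (added commentary, not from the original source):
 * after
 *
 *	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
 *
 * llist is the chain of detached entries that lie below addr, most
 * recently visited first and reachable by following ->right, while
 * rlist is the corresponding chain of entries above addr reachable by
 * following ->left; both chains terminate at &map->header.  A NULL
 * root means no entry contains addr, or every candidate subtree was
 * pruned because its max_free was less than the requested length.
 */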
  1036. static __always_inline void
  1037. vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
  1038. {
  1039. vm_map_entry_t hi, right, y;
  1040. right = *rlist;
  1041. hi = root->right == right ? NULL : root->right;
  1042. if (hi == NULL)
  1043. return;
  1044. do
  1045. SPLAY_LEFT_STEP(hi, y, root, right, true);
  1046. while (hi != NULL);
  1047. *rlist = right;
  1048. }
  1049. static __always_inline void
  1050. vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
  1051. {
  1052. vm_map_entry_t left, lo, y;
  1053. left = *llist;
  1054. lo = root->left == left ? NULL : root->left;
  1055. if (lo == NULL)
  1056. return;
  1057. do
  1058. SPLAY_RIGHT_STEP(lo, y, left, root, true);
  1059. while (lo != NULL);
  1060. *llist = left;
  1061. }
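/*
 * Added commentary (not from the original source): findnext and
 * findprev completely dismantle root's right (resp. left) subtree, if
 * any, pushing every node onto rlist (resp. llist).  When they return,
 * *rlist is root's in-order successor and *llist its predecessor
 * (possibly &map->header), which lets callers such as
 * vm_map_entry_link() and vm_map_entry_unlink() adjust root's
 * neighbors before the spines are merged back together.
 */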
  1062. static inline void
  1063. vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
  1064. {
  1065. vm_map_entry_t tmp;
  1066. tmp = *b;
  1067. *b = *a;
  1068. *a = tmp;
  1069. }
  1070. /*
  1071. * Walk back up the two spines, flip the pointers and set max_free. The
  1072. * subtrees of the root go at the bottom of llist and rlist.
  1073. */
  1074. static vm_size_t
  1075. vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
  1076. vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
  1077. {
  1078. do {
  1079. /*
  1080. * The max_free values of the children of llist are in
  1081. * llist->max_free and max_free. Update with the
  1082. * max value.
  1083. */
  1084. llist->max_free = max_free =
  1085. vm_size_max(llist->max_free, max_free);
  1086. vm_map_entry_swap(&llist->right, &tail);
  1087. vm_map_entry_swap(&tail, &llist);
  1088. } while (llist != header);
  1089. root->left = tail;
  1090. return (max_free);
  1091. }
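/*
 * Added commentary (not from the original source): the two swaps in
 * the loop above are equivalent to
 *
 *	next = llist->right;	(the next node up the left spine)
 *	llist->right = tail;	(reattach the subtree rebuilt so far)
 *	tail = llist;
 *	llist = next;
 *
 * so the walk restores the right pointers that vm_map_splay_split()
 * reused as spine links, while folding max_free values bottom-up.
 */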
  1092. /*
  1093. * When llist is known to be the predecessor of root.
  1094. */
  1095. static inline vm_size_t
  1096. vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
  1097. vm_map_entry_t llist)
  1098. {
  1099. vm_size_t max_free;
  1100. max_free = root->start - llist->end;
  1101. if (llist != header) {
  1102. max_free = vm_map_splay_merge_left_walk(header, root,
  1103. root, max_free, llist);
  1104. } else {
  1105. root->left = header;
  1106. header->right = root;
  1107. }
  1108. return (max_free);
  1109. }
  1110. /*
  1111. * When llist may or may not be the predecessor of root.
  1112. */
  1113. static inline vm_size_t
  1114. vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
  1115. vm_map_entry_t llist)
  1116. {
  1117. vm_size_t max_free;
  1118. max_free = vm_map_entry_max_free_left(root, llist);
  1119. if (llist != header) {
  1120. max_free = vm_map_splay_merge_left_walk(header, root,
  1121. root->left == llist ? root : root->left,
  1122. max_free, llist);
  1123. }
  1124. return (max_free);
  1125. }
  1126. static vm_size_t
  1127. vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
  1128. vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
  1129. {
  1130. do {
  1131. /*
  1132. * The max_free values of the children of rlist are in
  1133. * rlist->max_free and max_free. Update with the
  1134. * max value.
  1135. */
  1136. rlist->max_free = max_free =
  1137. vm_size_max(rlist->max_free, max_free);
  1138. vm_map_entry_swap(&rlist->left, &tail);
  1139. vm_map_entry_swap(&tail, &rlist);
  1140. } while (rlist != header);
  1141. root->right = tail;
  1142. return (max_free);
  1143. }
  1144. /*
  1145. * When rlist is known to be the successor of root.
  1146. */
  1147. static inline vm_size_t
  1148. vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
  1149. vm_map_entry_t rlist)
  1150. {
  1151. vm_size_t max_free;
  1152. max_free = rlist->start - root->end;
  1153. if (rlist != header) {
  1154. max_free = vm_map_splay_merge_right_walk(header, root,
  1155. root, max_free, rlist);
  1156. } else {
  1157. root->right = header;
  1158. header->left = root;
  1159. }
  1160. return (max_free);
  1161. }
  1162. /*
  1163. * When rlist may or may not be the successor of root.
  1164. */
  1165. static inline vm_size_t
  1166. vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
  1167. vm_map_entry_t rlist)
  1168. {
  1169. vm_size_t max_free;
  1170. max_free = vm_map_entry_max_free_right(root, rlist);
  1171. if (rlist != header) {
  1172. max_free = vm_map_splay_merge_right_walk(header, root,
  1173. root->right == rlist ? root : root->right,
  1174. max_free, rlist);
  1175. }
  1176. return (max_free);
  1177. }
  1178. /*
  1179. * vm_map_splay:
  1180. *
  1181. * The Sleator and Tarjan top-down splay algorithm with the
  1182. * following variation. Max_free must be computed bottom-up, so
  1183. * on the downward pass, maintain the left and right spines in
  1184. * reverse order. Then, make a second pass up each side to fix
  1185. * the pointers and compute max_free. The time bound is O(log n)
  1186. * amortized.
  1187. *
  1188. * The tree is threaded, which means that there are no null pointers.
  1189. * When a node has no left child, its left pointer points to its
  1190. * predecessor, which is the last ancestor on the search path from the root
  1191. * where the search branched right. Likewise, when a node has no right
  1192. * child, its right pointer points to its successor. The map header node
  1193. * is the predecessor of the first map entry, and the successor of the
  1194. * last.
  1195. *
  1196. * The new root is the vm_map_entry containing "addr", or else an
  1197. * adjacent entry (lower if possible) if addr is not in the tree.
  1198. *
  1199. * The map must be locked, and leaves it so.
  1200. *
  1201. * Returns: the new root.
  1202. */
  1203. static vm_map_entry_t
  1204. vm_map_splay(vm_map_t map, vm_offset_t addr)
  1205. {
  1206. vm_map_entry_t header, llist, rlist, root;
  1207. vm_size_t max_free_left, max_free_right;
  1208. header = &map->header;
  1209. root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
  1210. if (root != NULL) {
  1211. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1212. max_free_right = vm_map_splay_merge_right(header, root, rlist);
  1213. } else if (llist != header) {
  1214. /*
  1215. * Recover the greatest node in the left
  1216. * subtree and make it the root.
  1217. */
  1218. root = llist;
  1219. llist = root->right;
  1220. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1221. max_free_right = vm_map_splay_merge_succ(header, root, rlist);
  1222. } else if (rlist != header) {
  1223. /*
  1224. * Recover the least node in the right
  1225. * subtree and make it the root.
  1226. */
  1227. root = rlist;
  1228. rlist = root->left;
  1229. max_free_left = vm_map_splay_merge_pred(header, root, llist);
  1230. max_free_right = vm_map_splay_merge_right(header, root, rlist);
  1231. } else {
  1232. /* There is no root. */
  1233. return (NULL);
  1234. }
  1235. root->max_free = vm_size_max(max_free_left, max_free_right);
  1236. map->root = root;
  1237. VM_MAP_ASSERT_CONSISTENT(map);
  1238. return (root);
  1239. }
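/*
 * Illustrative caller pattern (a sketch, not from the original source):
 *
 *	vm_map_lock(map);
 *	entry = vm_map_splay(map, addr);
 *	if (entry != NULL && entry->start <= addr && addr < entry->end)
 *		... addr is mapped by entry ...
 *	vm_map_unlock(map);
 *
 * vm_map_lookup_entry() below follows this pattern when it holds, or
 * can temporarily upgrade to, the write lock.
 */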
  1240. /*
  1241. * vm_map_entry_{un,}link:
  1242. *
  1243. * Insert/remove entries from maps. On linking, if new entry clips
  1244. * existing entry, trim existing entry to avoid overlap, and manage
  1245. * offsets. On unlinking, merge disappearing entry with neighbor, if
  1246. * called for, and manage offsets. Callers should not modify fields in
  1247. * entries already mapped.
  1248. */
  1249. static void
  1250. vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
  1251. {
  1252. vm_map_entry_t header, llist, rlist, root;
  1253. vm_size_t max_free_left, max_free_right;
  1254. CTR3(KTR_VM,
  1255. "vm_map_entry_link: map %p, nentries %d, entry %p", map,
  1256. map->nentries, entry);
  1257. VM_MAP_ASSERT_LOCKED(map);
  1258. map->nentries++;
  1259. header = &map->header;
  1260. root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
  1261. if (root == NULL) {
  1262. /*
  1263. * The new entry does not overlap any existing entry in the
  1264. * map, so it becomes the new root of the map tree.
  1265. */
  1266. max_free_left = vm_map_splay_merge_pred(header, entry, llist);
  1267. max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
  1268. } else if (entry->start == root->start) {
  1269. /*
  1270. * The new entry is a clone of root, with only the end field
  1271. * changed. The root entry will be shrunk to abut the new
  1272. * entry, and will be the right child of the new root entry in
  1273. * the modified map.
  1274. */
  1275. KASSERT(entry->end < root->end,
  1276. ("%s: clip_start not within entry", __func__));
  1277. vm_map_splay_findprev(root, &llist);
  1278. if ((root->eflags & (MAP_ENTRY_STACK_GAP_DN |
  1279. MAP_ENTRY_STACK_GAP_UP)) == 0)
  1280. root->offset += entry->end - root->start;
  1281. root->start = entry->end;
  1282. max_free_left = vm_map_splay_merge_pred(header, entry, llist);
  1283. max_free_right = root->max_free = vm_size_max(
  1284. vm_map_splay_merge_pred(entry, root, entry),
  1285. vm_map_splay_merge_right(header, root, rlist));
  1286. } else {
  1287. /*
  1288. * The new entry is a clone of root, with only the start field
  1289. * changed. The root entry will be shrunk to abut the new
  1290. * entry, and will be the left child of the new root entry in
  1291. * the modified map.
  1292. */
  1293. KASSERT(entry->end == root->end,
  1294. ("%s: clip_start not within entry", __func__));
  1295. vm_map_splay_findnext(root, &rlist);
  1296. if ((entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
  1297. MAP_ENTRY_STACK_GAP_UP)) == 0)
  1298. entry->offset += entry->start - root->start;
  1299. root->end = entry->start;
  1300. max_free_left = root->max_free = vm_size_max(
  1301. vm_map_splay_merge_left(header, root, llist),
  1302. vm_map_splay_merge_succ(entry, root, entry));
  1303. max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
  1304. }
  1305. entry->max_free = vm_size_max(max_free_left, max_free_right);
  1306. map->root = entry;
  1307. VM_MAP_ASSERT_CONSISTENT(map);
  1308. }
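/*
 * Added commentary (not from the original source): in both clipping
 * cases above, whichever entry ends up starting higher after the clip
 * has its offset advanced by the clipped size, except for stack-gap
 * entries.  Their offset field does not name a position in a backing
 * object; it is reused to store the gap's protection bits (see
 * vm_map_protect_guard() below).
 */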
  1309. enum unlink_merge_type {
  1310. UNLINK_MERGE_NONE,
  1311. UNLINK_MERGE_NEXT
  1312. };
  1313. static void
  1314. vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
  1315. enum unlink_merge_type op)
  1316. {
  1317. vm_map_entry_t header, llist, rlist, root;
  1318. vm_size_t max_free_left, max_free_right;
  1319. VM_MAP_ASSERT_LOCKED(map);
  1320. header = &map->header;
  1321. root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
  1322. KASSERT(root != NULL,
  1323. ("vm_map_entry_unlink: unlink object not mapped"));
  1324. vm_map_splay_findprev(root, &llist);
  1325. vm_map_splay_findnext(root, &rlist);
  1326. if (op == UNLINK_MERGE_NEXT) {
  1327. rlist->start = root->start;
  1328. MPASS((rlist->eflags & (MAP_ENTRY_STACK_GAP_DN |
  1329. MAP_ENTRY_STACK_GAP_UP)) == 0);
  1330. rlist->offset = root->offset;
  1331. }
  1332. if (llist != header) {
  1333. root = llist;
  1334. llist = root->right;
  1335. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1336. max_free_right = vm_map_splay_merge_succ(header, root, rlist);
  1337. } else if (rlist != header) {
  1338. root = rlist;
  1339. rlist = root->left;
  1340. max_free_left = vm_map_splay_merge_pred(header, root, llist);
  1341. max_free_right = vm_map_splay_merge_right(header, root, rlist);
  1342. } else {
  1343. header->left = header->right = header;
  1344. root = NULL;
  1345. }
  1346. if (root != NULL)
  1347. root->max_free = vm_size_max(max_free_left, max_free_right);
  1348. map->root = root;
  1349. VM_MAP_ASSERT_CONSISTENT(map);
  1350. map->nentries--;
  1351. CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
  1352. map->nentries, entry);
  1353. }
  1354. /*
  1355. * vm_map_entry_resize:
  1356. *
  1357. * Resize a vm_map_entry, recompute the amount of free space that
  1358. * follows it and propagate that value up the tree.
  1359. *
  1360. * The map must be locked, and leaves it so.
  1361. */
  1362. static void
  1363. vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
  1364. {
  1365. vm_map_entry_t header, llist, rlist, root;
  1366. VM_MAP_ASSERT_LOCKED(map);
  1367. header = &map->header;
  1368. root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
  1369. KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
  1370. vm_map_splay_findnext(root, &rlist);
  1371. entry->end += grow_amount;
  1372. root->max_free = vm_size_max(
  1373. vm_map_splay_merge_left(header, root, llist),
  1374. vm_map_splay_merge_succ(header, root, rlist));
  1375. map->root = root;
  1376. VM_MAP_ASSERT_CONSISTENT(map);
  1377. CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
  1378. __func__, map, map->nentries, entry);
  1379. }
  1380. /*
  1381. * vm_map_lookup_entry: [ internal use only ]
  1382. *
  1383. * Finds the map entry containing (or
  1384. * immediately preceding) the specified address
  1385. * in the given map; the entry is returned
  1386. * in the "entry" parameter. The boolean
  1387. * result indicates whether the address is
  1388. * actually contained in the map.
  1389. */
  1390. boolean_t
  1391. vm_map_lookup_entry(
  1392. vm_map_t map,
  1393. vm_offset_t address,
  1394. vm_map_entry_t *entry) /* OUT */
  1395. {
  1396. vm_map_entry_t cur, header, lbound, ubound;
  1397. boolean_t locked;
  1398. /*
  1399. * If the map is empty, then the map entry immediately preceding
  1400. * "address" is the map's header.
  1401. */
  1402. header = &map->header;
  1403. cur = map->root;
  1404. if (cur == NULL) {
  1405. *entry = header;
  1406. return (FALSE);
  1407. }
  1408. if (address >= cur->start && cur->end > address) {
  1409. *entry = cur;
  1410. return (TRUE);
  1411. }
  1412. if ((locked = vm_map_locked(map)) ||
  1413. sx_try_upgrade(&map->lock)) {
  1414. /*
  1415. * Splay requires a write lock on the map. However, it only
  1416. * restructures the binary search tree; it does not otherwise
  1417. * change the map. Thus, the map's timestamp need not change
  1418. * on a temporary upgrade.
  1419. */
  1420. cur = vm_map_splay(map, address);
  1421. if (!locked) {
  1422. VM_MAP_UNLOCK_CONSISTENT(map);
  1423. sx_downgrade(&map->lock);
  1424. }
  1425. /*
  1426. * If "address" is contained within a map entry, the new root
  1427. * is that map entry. Otherwise, the new root is a map entry
  1428. * immediately before or after "address".
  1429. */
  1430. if (address < cur->start) {
  1431. *entry = header;
  1432. return (FALSE);
  1433. }
  1434. *entry = cur;
  1435. return (address < cur->end);
  1436. }
  1437. /*
  1438. * Since the map is only locked for read access, perform a
  1439. * standard binary search tree lookup for "address".
  1440. */
  1441. lbound = ubound = header;
  1442. for (;;) {
  1443. if (address < cur->start) {
  1444. ubound = cur;
  1445. cur = cur->left;
  1446. if (cur == lbound)
  1447. break;
  1448. } else if (cur->end <= address) {
  1449. lbound = cur;
  1450. cur = cur->right;
  1451. if (cur == ubound)
  1452. break;
  1453. } else {
  1454. *entry = cur;
  1455. return (TRUE);
  1456. }
  1457. }
  1458. *entry = lbound;
  1459. return (FALSE);
  1460. }
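/*
 * Added commentary (not from the original source): the read-locked
 * lookup above terminates through the threads rather than through NULL
 * pointers.  A missing child pointer refers back to the nearest
 * ancestor on the other side of the search, and that ancestor has
 * already been recorded as lbound or ubound; reaching it again means
 * the search fell off the tree, and lbound is then the entry
 * immediately preceding "address".
 */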
  1461. /*
  1462. * vm_map_insert1() is identical to vm_map_insert() except that it
  1463. * returns the newly inserted map entry in '*res'. In case the new
  1464. * entry is coalesced with a neighbor or an existing entry was
  1465. * resized, that entry is returned. In any case, the returned entry
  1466. * covers the specified address range.
  1467. */
  1468. static int
  1469. vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  1470. vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow,
  1471. vm_map_entry_t *res)
  1472. {
  1473. vm_map_entry_t new_entry, next_entry, prev_entry;
  1474. struct ucred *cred;
  1475. vm_eflags_t protoeflags;
  1476. vm_inherit_t inheritance;
  1477. u_long bdry;
  1478. u_int bidx;
  1479. VM_MAP_ASSERT_LOCKED(map);
  1480. KASSERT(object != kernel_object ||
  1481. (cow & MAP_COPY_ON_WRITE) == 0,
  1482. ("vm_map_insert: kernel object and COW"));
  1483. KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
  1484. (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
  1485. ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
  1486. object, cow));
  1487. KASSERT((prot & ~max) == 0,
  1488. ("prot %#x is not subset of max_prot %#x", prot, max));
  1489. /*
  1490. * Check that the start and end points are not bogus.
  1491. */
  1492. if (start == end || !vm_map_range_valid(map, start, end))
  1493. return (KERN_INVALID_ADDRESS);
  1494. if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
  1495. VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
  1496. return (KERN_PROTECTION_FAILURE);
  1497. /*
  1498. * Find the entry prior to the proposed starting address; if it's part
  1499. * of an existing entry, this range is bogus.
  1500. */
  1501. if (vm_map_lookup_entry(map, start, &prev_entry))
  1502. return (KERN_NO_SPACE);
  1503. /*
  1504. * Assert that the next entry doesn't overlap the end point.
  1505. */
  1506. next_entry = vm_map_entry_succ(prev_entry);
  1507. if (next_entry->start < end)
  1508. return (KERN_NO_SPACE);
  1509. if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
  1510. max != VM_PROT_NONE))
  1511. return (KERN_INVALID_ARGUMENT);
  1512. protoeflags = 0;
  1513. if (cow & MAP_COPY_ON_WRITE)
  1514. protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
  1515. if (cow & MAP_NOFAULT)
  1516. protoeflags |= MAP_ENTRY_NOFAULT;
  1517. if (cow & MAP_DISABLE_SYNCER)
  1518. protoeflags |= MAP_ENTRY_NOSYNC;
  1519. if (cow & MAP_DISABLE_COREDUMP)
  1520. protoeflags |= MAP_ENTRY_NOCOREDUMP;
  1521. if (cow & MAP_STACK_GROWS_DOWN)
  1522. protoeflags |= MAP_ENTRY_GROWS_DOWN;
  1523. if (cow & MAP_STACK_GROWS_UP)
  1524. protoeflags |= MAP_ENTRY_GROWS_UP;
  1525. if (cow & MAP_WRITECOUNT)
  1526. protoeflags |= MAP_ENTRY_WRITECNT;
  1527. if (cow & MAP_VN_EXEC)
  1528. protoeflags |= MAP_ENTRY_VN_EXEC;
  1529. if ((cow & MAP_CREATE_GUARD) != 0)
  1530. protoeflags |= MAP_ENTRY_GUARD;
  1531. if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
  1532. protoeflags |= MAP_ENTRY_STACK_GAP_DN;
  1533. if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
  1534. protoeflags |= MAP_ENTRY_STACK_GAP_UP;
  1535. if (cow & MAP_INHERIT_SHARE)
  1536. inheritance = VM_INHERIT_SHARE;
  1537. else
  1538. inheritance = VM_INHERIT_DEFAULT;
  1539. if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
  1540. /* This magically ignores index 0, for usual page size. */
  1541. bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
  1542. MAP_SPLIT_BOUNDARY_SHIFT;
  1543. if (bidx >= MAXPAGESIZES)
  1544. return (KERN_INVALID_ARGUMENT);
  1545. bdry = pagesizes[bidx] - 1;
  1546. if ((start & bdry) != 0 || (end & bdry) != 0)
  1547. return (KERN_INVALID_ARGUMENT);
  1548. protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
  1549. }
  1550. cred = NULL;
  1551. if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
  1552. goto charged;
  1553. if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
  1554. ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
  1555. if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
  1556. return (KERN_RESOURCE_SHORTAGE);
  1557. KASSERT(object == NULL ||
  1558. (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
  1559. object->cred == NULL,
  1560. ("overcommit: vm_map_insert o %p", object));
  1561. cred = curthread->td_ucred;
  1562. }
  1563. charged:
  1564. /* Expand the kernel pmap, if necessary. */
  1565. if (map == kernel_map && end > kernel_vm_end)
  1566. pmap_growkernel(end);
  1567. if (object != NULL) {
  1568. /*
  1569. * OBJ_ONEMAPPING must be cleared unless this mapping
  1570. * is trivially proven to be the only mapping for any
  1571. * of the object's pages. (Object granularity
  1572. * reference counting is insufficient to recognize
  1573. * aliases with precision.)
  1574. */
  1575. if ((object->flags & OBJ_ANON) != 0) {
  1576. VM_OBJECT_WLOCK(object);
  1577. if (object->ref_count > 1 || object->shadow_count != 0)
  1578. vm_object_clear_flag(object, OBJ_ONEMAPPING);
  1579. VM_OBJECT_WUNLOCK(object);
  1580. }
  1581. } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
  1582. protoeflags &&
  1583. (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
  1584. MAP_VN_EXEC)) == 0 &&
  1585. prev_entry->end == start && (prev_entry->cred == cred ||
  1586. (prev_entry->object.vm_object != NULL &&
  1587. prev_entry->object.vm_object->cred == cred)) &&
  1588. vm_object_coalesce(prev_entry->object.vm_object,
  1589. prev_entry->offset,
  1590. (vm_size_t)(prev_entry->end - prev_entry->start),
  1591. (vm_size_t)(end - prev_entry->end), cred != NULL &&
  1592. (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
  1593. /*
  1594. * We were able to extend the object. Determine if we
  1595. * can extend the previous map entry to include the
  1596. * new range as well.
  1597. */
  1598. if (prev_entry->inheritance == inheritance &&
  1599. prev_entry->protection == prot &&
  1600. prev_entry->max_protection == max &&
  1601. prev_entry->wired_count == 0) {
  1602. KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
  1603. 0, ("prev_entry %p has incoherent wiring",
  1604. prev_entry));
  1605. if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
  1606. map->size += end - prev_entry->end;
  1607. vm_map_entry_resize(map, prev_entry,
  1608. end - prev_entry->end);
  1609. *res = vm_map_try_merge_entries(map, prev_entry,
  1610. next_entry);
  1611. return (KERN_SUCCESS);
  1612. }
  1613. /*
  1614. * If we can extend the object but cannot extend the
  1615. * map entry, we have to create a new map entry. We
  1616. * must bump the ref count on the extended object to
  1617. * account for it. object may be NULL.
  1618. */
  1619. object = prev_entry->object.vm_object;
  1620. offset = prev_entry->offset +
  1621. (prev_entry->end - prev_entry->start);
  1622. vm_object_reference(object);
  1623. if (cred != NULL && object != NULL && object->cred != NULL &&
  1624. !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
  1625. /* Object already accounts for this uid. */
  1626. cred = NULL;
  1627. }
  1628. }
  1629. if (cred != NULL)
  1630. crhold(cred);
  1631. /*
  1632. * Create a new entry
  1633. */
  1634. new_entry = vm_map_entry_create(map);
  1635. new_entry->start = start;
  1636. new_entry->end = end;
  1637. new_entry->cred = NULL;
  1638. new_entry->eflags = protoeflags;
  1639. new_entry->object.vm_object = object;
  1640. new_entry->offset = offset;
  1641. new_entry->inheritance = inheritance;
  1642. new_entry->protection = prot;
  1643. new_entry->max_protection = max;
  1644. new_entry->wired_count = 0;
  1645. new_entry->wiring_thread = NULL;
  1646. new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
  1647. new_entry->next_read = start;
  1648. KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
  1649. ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
  1650. new_entry->cred = cred;
  1651. /*
  1652. * Insert the new entry into the list
  1653. */
  1654. vm_map_entry_link(map, new_entry);
  1655. if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
  1656. map->size += new_entry->end - new_entry->start;
  1657. /*
  1658. * Try to coalesce the new entry with both the previous and next
  1659. * entries in the list. Previously, we only attempted to coalesce
  1660. * with the previous entry when object is NULL. Here, we handle the
  1661. * other cases, which are less common.
  1662. */
  1663. vm_map_try_merge_entries(map, prev_entry, new_entry);
  1664. *res = vm_map_try_merge_entries(map, new_entry, next_entry);
  1665. if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
  1666. vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
  1667. end - start, cow & MAP_PREFAULT_PARTIAL);
  1668. }
  1669. return (KERN_SUCCESS);
  1670. }
  1671. /*
  1672. * vm_map_insert:
  1673. *
  1674. * Inserts the given VM object into the target map at the
  1675. * specified address range.
  1676. *
  1677. * Requires that the map be locked, and leaves it so.
  1678. *
  1679. * If object is non-NULL, ref count must be bumped by caller
  1680. * prior to making call to account for the new entry.
  1681. */
  1682. int
  1683. vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  1684. vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
  1685. {
  1686. vm_map_entry_t res;
  1687. return (vm_map_insert1(map, object, offset, start, end, prot, max,
  1688. cow, &res));
  1689. }
  1690. /*
  1691. * vm_map_findspace:
  1692. *
  1693. * Find the first fit (lowest VM address) for "length" free bytes
  1694. * beginning at address >= start in the given map.
  1695. *
  1696. * In a vm_map_entry, "max_free" is the maximum amount of
  1697. * contiguous free space between an entry in its subtree and a
  1698. * neighbor of that entry. This allows finding a free region in
  1699. * one path down the tree, so O(log n) amortized with splay
  1700. * trees.
  1701. *
  1702. * The map must be locked, and leaves it so.
  1703. *
  1704. * Returns: starting address if sufficient space,
  1705. * vm_map_max(map)-length+1 if insufficient space.
  1706. */
  1707. vm_offset_t
  1708. vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
  1709. {
  1710. vm_map_entry_t header, llist, rlist, root, y;
  1711. vm_size_t left_length, max_free_left, max_free_right;
  1712. vm_offset_t gap_end;
  1713. VM_MAP_ASSERT_LOCKED(map);
  1714. /*
  1715. * Request must fit within min/max VM address and must avoid
  1716. * address wrap.
  1717. */
  1718. start = MAX(start, vm_map_min(map));
  1719. if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
  1720. return (vm_map_max(map) - length + 1);
  1721. /* Empty tree means wide open address space. */
  1722. if (map->root == NULL)
  1723. return (start);
  1724. /*
  1725. * After splay_split, if start is within an entry, push it to the start
  1726. * of the following gap. If rlist is at the end of the gap containing
  1727. * start, save the end of that gap in gap_end to see if the gap is big
  1728. * enough; otherwise set gap_end to start to skip gap-checking and move
  1729. * directly to a search of the right subtree.
  1730. */
  1731. header = &map->header;
  1732. root = vm_map_splay_split(map, start, length, &llist, &rlist);
  1733. gap_end = rlist->start;
  1734. if (root != NULL) {
  1735. start = root->end;
  1736. if (root->right != rlist)
  1737. gap_end = start;
  1738. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1739. max_free_right = vm_map_splay_merge_right(header, root, rlist);
  1740. } else if (rlist != header) {
  1741. root = rlist;
  1742. rlist = root->left;
  1743. max_free_left = vm_map_splay_merge_pred(header, root, llist);
  1744. max_free_right = vm_map_splay_merge_right(header, root, rlist);
  1745. } else {
  1746. root = llist;
  1747. llist = root->right;
  1748. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1749. max_free_right = vm_map_splay_merge_succ(header, root, rlist);
  1750. }
  1751. root->max_free = vm_size_max(max_free_left, max_free_right);
  1752. map->root = root;
  1753. VM_MAP_ASSERT_CONSISTENT(map);
  1754. if (length <= gap_end - start)
  1755. return (start);
  1756. /* With max_free, can immediately tell if no solution. */
  1757. if (root->right == header || length > root->right->max_free)
  1758. return (vm_map_max(map) - length + 1);
  1759. /*
  1760. * Splay for the least large-enough gap in the right subtree.
  1761. */
  1762. llist = rlist = header;
  1763. for (left_length = 0;;
  1764. left_length = vm_map_entry_max_free_left(root, llist)) {
  1765. if (length <= left_length)
  1766. SPLAY_LEFT_STEP(root, y, llist, rlist,
  1767. length <= vm_map_entry_max_free_left(y, llist));
  1768. else
  1769. SPLAY_RIGHT_STEP(root, y, llist, rlist,
  1770. length > vm_map_entry_max_free_left(y, root));
  1771. if (root == NULL)
  1772. break;
  1773. }
  1774. root = llist;
  1775. llist = root->right;
  1776. max_free_left = vm_map_splay_merge_left(header, root, llist);
  1777. if (rlist == header) {
  1778. root->max_free = vm_size_max(max_free_left,
  1779. vm_map_splay_merge_succ(header, root, rlist));
  1780. } else {
  1781. y = rlist;
  1782. rlist = y->left;
  1783. y->max_free = vm_size_max(
  1784. vm_map_splay_merge_pred(root, y, root),
  1785. vm_map_splay_merge_right(header, y, rlist));
  1786. root->max_free = vm_size_max(max_free_left, y->max_free);
  1787. }
  1788. map->root = root;
  1789. VM_MAP_ASSERT_CONSISTENT(map);
  1790. return (root->end);
  1791. }
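/*
 * Illustrative sketch (added commentary, not from the original source):
 * a caller that only needs to know whether "length" bytes fit anywhere
 * at or above "start" can test the sentinel return value:
 *
 *	addr = vm_map_findspace(map, start, length);
 *	if (addr + length > vm_map_max(map))
 *		... no fit at or above start ...
 *
 * which matches how vm_map_find() and vm_map_find_aligned() use it.
 */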
  1792. int
  1793. vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  1794. vm_offset_t start, vm_size_t length, vm_prot_t prot,
  1795. vm_prot_t max, int cow)
  1796. {
  1797. vm_offset_t end;
  1798. int result;
  1799. end = start + length;
  1800. KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
  1801. object == NULL,
  1802. ("vm_map_fixed: non-NULL backing object for stack"));
  1803. vm_map_lock(map);
  1804. VM_MAP_RANGE_CHECK(map, start, end);
  1805. if ((cow & MAP_CHECK_EXCL) == 0) {
  1806. result = vm_map_delete(map, start, end);
  1807. if (result != KERN_SUCCESS)
  1808. goto out;
  1809. }
  1810. if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
  1811. result = vm_map_stack_locked(map, start, length, sgrowsiz,
  1812. prot, max, cow);
  1813. } else {
  1814. result = vm_map_insert(map, object, offset, start, end,
  1815. prot, max, cow);
  1816. }
  1817. out:
  1818. vm_map_unlock(map);
  1819. return (result);
  1820. }
  1821. #if VM_NRESERVLEVEL <= 1
  1822. static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
  1823. static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
  1824. #elif VM_NRESERVLEVEL == 2
  1825. static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10};
  1826. static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4};
  1827. #else
  1828. #error "Unsupported VM_NRESERVLEVEL"
  1829. #endif
  1830. static int cluster_anon = 1;
  1831. SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
  1832. &cluster_anon, 0,
  1833. "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
  1834. static bool
  1835. clustering_anon_allowed(vm_offset_t addr, int cow)
  1836. {
  1837. switch (cluster_anon) {
  1838. case 0:
  1839. return (false);
  1840. case 1:
  1841. return (addr == 0 || (cow & MAP_NO_HINT) != 0);
  1842. case 2:
  1843. default:
  1844. return (true);
  1845. }
  1846. }
  1847. static long aslr_restarts;
  1848. SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
  1849. &aslr_restarts, 0,
  1850. "Number of aslr failures");
  1851. /*
  1852. * Searches for the specified amount of free space in the given map with the
  1853. * specified alignment. Performs an address-ordered, first-fit search from
  1854. * the given address "*addr", with an optional upper bound "max_addr". If the
  1855. * parameter "alignment" is zero, then the alignment is computed from the
  1856. * given (object, offset) pair so as to enable the greatest possible use of
  1857. * superpage mappings. Returns KERN_SUCCESS and the address of the free space
  1858. * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
  1859. *
  1860. * The map must be locked. Initially, there must be at least "length" bytes
  1861. * of free space at the given address.
  1862. */
  1863. static int
  1864. vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  1865. vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
  1866. vm_offset_t alignment)
  1867. {
  1868. vm_offset_t aligned_addr, free_addr;
  1869. VM_MAP_ASSERT_LOCKED(map);
  1870. free_addr = *addr;
  1871. KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
  1872. ("caller failed to provide space %#jx at address %p",
  1873. (uintmax_t)length, (void *)free_addr));
  1874. for (;;) {
  1875. /*
  1876. * At the start of every iteration, the free space at address
  1877. * "*addr" is at least "length" bytes.
  1878. */
  1879. if (alignment == 0)
  1880. pmap_align_superpage(object, offset, addr, length);
  1881. else
  1882. *addr = roundup2(*addr, alignment);
  1883. aligned_addr = *addr;
  1884. if (aligned_addr == free_addr) {
  1885. /*
  1886. * Alignment did not change "*addr", so "*addr" must
  1887. * still provide sufficient free space.
  1888. */
  1889. return (KERN_SUCCESS);
  1890. }
  1891. /*
  1892. * Test for address wrap on "*addr". A wrapped "*addr" could
  1893. * be a valid address, in which case vm_map_findspace() cannot
  1894. * be relied upon to fail.
  1895. */
  1896. if (aligned_addr < free_addr)
  1897. return (KERN_NO_SPACE);
  1898. *addr = vm_map_findspace(map, aligned_addr, length);
  1899. if (*addr + length > vm_map_max(map) ||
  1900. (max_addr != 0 && *addr + length > max_addr))
  1901. return (KERN_NO_SPACE);
  1902. free_addr = *addr;
  1903. if (free_addr == aligned_addr) {
  1904. /*
  1905. * If a successful call to vm_map_findspace() did not
  1906. * change "*addr", then "*addr" must still be aligned
  1907. * and provide sufficient free space.
  1908. */
  1909. return (KERN_SUCCESS);
  1910. }
  1911. }
  1912. }
  1913. int
  1914. vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
  1915. vm_offset_t max_addr, vm_offset_t alignment)
  1916. {
  1917. /* XXXKIB ASLR eh ? */
  1918. *addr = vm_map_findspace(map, *addr, length);
  1919. if (*addr + length > vm_map_max(map) ||
  1920. (max_addr != 0 && *addr + length > max_addr))
  1921. return (KERN_NO_SPACE);
  1922. return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
  1923. alignment));
  1924. }
  1925. /*
  1926. * vm_map_find finds an unallocated region in the target address
  1927. * map with the given length. The search is defined to be
  1928. * first-fit from the specified address; the region found is
  1929. * returned in the same parameter.
  1930. *
  1931. * If object is non-NULL, ref count must be bumped by caller
  1932. * prior to making call to account for the new entry.
  1933. */
  1934. int
  1935. vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  1936. vm_offset_t *addr, /* IN/OUT */
  1937. vm_size_t length, vm_offset_t max_addr, int find_space,
  1938. vm_prot_t prot, vm_prot_t max, int cow)
  1939. {
  1940. vm_offset_t alignment, curr_min_addr, min_addr;
  1941. int gap, pidx, rv, try;
  1942. bool cluster, en_aslr, update_anon;
  1943. KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
  1944. object == NULL,
  1945. ("vm_map_find: non-NULL backing object for stack"));
  1946. MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
  1947. (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
  1948. if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
  1949. (object->flags & OBJ_COLORED) == 0))
  1950. find_space = VMFS_ANY_SPACE;
  1951. if (find_space >> 8 != 0) {
  1952. KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
  1953. alignment = (vm_offset_t)1 << (find_space >> 8);
  1954. } else
  1955. alignment = 0;
  1956. en_aslr = (map->flags & MAP_ASLR) != 0;
  1957. update_anon = cluster = clustering_anon_allowed(*addr, cow) &&
  1958. (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
  1959. find_space != VMFS_NO_SPACE && object == NULL &&
  1960. (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
  1961. MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
  1962. curr_min_addr = min_addr = *addr;
  1963. if (en_aslr && min_addr == 0 && !cluster &&
  1964. find_space != VMFS_NO_SPACE &&
  1965. (map->flags & MAP_ASLR_IGNSTART) != 0)
  1966. curr_min_addr = min_addr = vm_map_min(map);
  1967. try = 0;
  1968. vm_map_lock(map);
  1969. if (cluster) {
  1970. curr_min_addr = map->anon_loc;
  1971. if (curr_min_addr == 0)
  1972. cluster = false;
  1973. }
  1974. if (find_space != VMFS_NO_SPACE) {
  1975. KASSERT(find_space == VMFS_ANY_SPACE ||
  1976. find_space == VMFS_OPTIMAL_SPACE ||
  1977. find_space == VMFS_SUPER_SPACE ||
  1978. alignment != 0, ("unexpected VMFS flag"));
  1979. again:
  1980. /*
  1981. * When creating an anonymous mapping, try clustering
  1982. * with an existing anonymous mapping first.
  1983. *
  1984. * We make up to two attempts to find address space
  1985. * for a given find_space value. The first attempt may
  1986. * apply randomization or may cluster with an existing
  1987. * anonymous mapping. If this first attempt fails,
  1988. * perform a first-fit search of the available address
  1989. * space.
  1990. *
  1991. * If all tries failed, and find_space is
  1992. * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE.
  1993. * Again enable clustering and randomization.
  1994. */
  1995. try++;
  1996. MPASS(try <= 2);
  1997. if (try == 2) {
  1998. /*
  1999. * Second try: we failed either to find a
  2000. * suitable region for randomizing the
  2001. * allocation, or to cluster with an existing
  2002. * mapping. Retry with free run.
  2003. */
  2004. curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
  2005. vm_map_min(map) : min_addr;
  2006. atomic_add_long(&aslr_restarts, 1);
  2007. }
  2008. if (try == 1 && en_aslr && !cluster) {
  2009. /*
  2010. * Find space for allocation, including
  2011. * gap needed for later randomization.
  2012. */
  2013. pidx = 0;
  2014. #if VM_NRESERVLEVEL > 0
  2015. if ((find_space == VMFS_SUPER_SPACE ||
  2016. find_space == VMFS_OPTIMAL_SPACE) &&
  2017. pagesizes[VM_NRESERVLEVEL] != 0) {
  2018. /*
  2019. * Do not pointlessly increase the space that
  2020. * is requested from vm_map_findspace().
  2021. * pmap_align_superpage() will only change a
  2022. * mapping's alignment if that mapping is at
  2023. * least a superpage in size.
  2024. */
  2025. pidx = VM_NRESERVLEVEL;
  2026. while (pidx > 0 && length < pagesizes[pidx])
  2027. pidx--;
  2028. }
  2029. #endif
  2030. gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
  2031. (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
  2032. aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
  2033. *addr = vm_map_findspace(map, curr_min_addr,
  2034. length + gap * pagesizes[pidx]);
  2035. if (*addr + length + gap * pagesizes[pidx] >
  2036. vm_map_max(map))
  2037. goto again;
  2038. /* And randomize the start address. */
  2039. *addr += (arc4random() % gap) * pagesizes[pidx];
  2040. if (max_addr != 0 && *addr + length > max_addr)
  2041. goto again;
  2042. } else {
  2043. *addr = vm_map_findspace(map, curr_min_addr, length);
  2044. if (*addr + length > vm_map_max(map) ||
  2045. (max_addr != 0 && *addr + length > max_addr)) {
  2046. if (cluster) {
  2047. cluster = false;
  2048. MPASS(try == 1);
  2049. goto again;
  2050. }
  2051. rv = KERN_NO_SPACE;
  2052. goto done;
  2053. }
  2054. }
  2055. if (find_space != VMFS_ANY_SPACE &&
  2056. (rv = vm_map_alignspace(map, object, offset, addr, length,
  2057. max_addr, alignment)) != KERN_SUCCESS) {
  2058. if (find_space == VMFS_OPTIMAL_SPACE) {
  2059. find_space = VMFS_ANY_SPACE;
  2060. curr_min_addr = min_addr;
  2061. cluster = update_anon;
  2062. try = 0;
  2063. goto again;
  2064. }
  2065. goto done;
  2066. }
  2067. } else if ((cow & MAP_REMAP) != 0) {
  2068. if (!vm_map_range_valid(map, *addr, *addr + length)) {
  2069. rv = KERN_INVALID_ADDRESS;
  2070. goto done;
  2071. }
  2072. rv = vm_map_delete(map, *addr, *addr + length);
  2073. if (rv != KERN_SUCCESS)
  2074. goto done;
  2075. }
  2076. if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
  2077. rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
  2078. max, cow);
  2079. } else {
  2080. rv = vm_map_insert(map, object, offset, *addr, *addr + length,
  2081. prot, max, cow);
  2082. }
  2083. /*
  2084. * Update the starting address for clustered anonymous memory mappings
  2085. * if a starting address was not previously defined or an ASLR restart
  2086. * placed an anonymous memory mapping at a lower address.
  2087. */
  2088. if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 ||
  2089. *addr < map->anon_loc))
  2090. map->anon_loc = *addr;
  2091. done:
  2092. vm_map_unlock(map);
  2093. return (rv);
  2094. }
  2095. /*
  2096. * vm_map_find_min() is a variant of vm_map_find() that takes an
  2097. * additional parameter ("default_addr") and treats the given address
  2098. * ("*addr") differently. Specifically, it treats "*addr" as a hint
  2099. * and not as the minimum address where the mapping is created.
  2100. *
  2101. * This function works in two phases. First, it tries to
  2102. * allocate above the hint. If that fails and the hint is
  2103. * greater than "default_addr", it performs a second pass, replacing
  2104. * the hint with "default_addr" as the minimum address for the
  2105. * allocation.
  2106. */
  2107. int
  2108. vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
  2109. vm_offset_t *addr, vm_size_t length, vm_offset_t default_addr,
  2110. vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
  2111. int cow)
  2112. {
  2113. vm_offset_t hint;
  2114. int rv;
  2115. hint = *addr;
  2116. if (hint == 0) {
  2117. cow |= MAP_NO_HINT;
  2118. *addr = hint = default_addr;
  2119. }
  2120. for (;;) {
  2121. rv = vm_map_find(map, object, offset, addr, length, max_addr,
  2122. find_space, prot, max, cow);
  2123. if (rv == KERN_SUCCESS || default_addr >= hint)
  2124. return (rv);
  2125. *addr = hint = default_addr;
  2126. }
  2127. }
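/*
 * Illustrative sketch (a hypothetical caller, not from the original
 * source) of mapping with a preferred but non-mandatory address:
 *
 *	addr = preferred_va;			(hint only)
 *	rv = vm_map_find_min(map, obj, 0, &addr, size, default_va,
 *	    0, VMFS_OPTIMAL_SPACE, prot, maxprot, 0);
 *
 * If nothing fits at or above the hint, the search is retried from
 * "default_va" (when it lies below the hint) before giving up; a zero
 * hint is treated as no hint at all by setting MAP_NO_HINT.
 */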
  2128. /*
  2129. * A map entry with any of the following flags set must not be merged with
  2130. * another entry.
  2131. */
  2132. #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
  2133. MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC | \
  2134. MAP_ENTRY_STACK_GAP_UP | MAP_ENTRY_STACK_GAP_DN)
  2135. static bool
  2136. vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
  2137. {
  2138. KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
  2139. (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
  2140. ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
  2141. prev, entry));
  2142. return (prev->end == entry->start &&
  2143. prev->object.vm_object == entry->object.vm_object &&
  2144. (prev->object.vm_object == NULL ||
  2145. prev->offset + (prev->end - prev->start) == entry->offset) &&
  2146. prev->eflags == entry->eflags &&
  2147. prev->protection == entry->protection &&
  2148. prev->max_protection == entry->max_protection &&
  2149. prev->inheritance == entry->inheritance &&
  2150. prev->wired_count == entry->wired_count &&
  2151. prev->cred == entry->cred);
  2152. }
  2153. static void
  2154. vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
  2155. {
  2156. /*
  2157. * If the backing object is a vnode object, vm_object_deallocate()
  2158. * calls vrele(). However, vrele() does not lock the vnode because
  2159. * the vnode has additional references. Thus, the map lock can be
  2160. * kept without causing a lock-order reversal with the vnode lock.
  2161. *
  2162. * Since we count the number of virtual page mappings in
  2163. * object->un_pager.vnp.writemappings, the writemappings value
  2164. * should not be adjusted when the entry is disposed of.
  2165. */
  2166. if (entry->object.vm_object != NULL)
  2167. vm_object_deallocate(entry->object.vm_object);
  2168. if (entry->cred != NULL)
  2169. crfree(entry->cred);
  2170. vm_map_entry_dispose(map, entry);
  2171. }
  2172. /*
  2173. * vm_map_try_merge_entries:
  2174. *
  2175. * Compare two map entries that represent consecutive ranges. If
  2176. * the entries can be merged, expand the range of the second to
  2177. * cover the range of the first and delete the first. Then return
  2178. * the map entry that includes the first range.
  2179. *
  2180. * The map must be locked.
  2181. */
  2182. vm_map_entry_t
  2183. vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
  2184. vm_map_entry_t entry)
  2185. {
  2186. VM_MAP_ASSERT_LOCKED(map);
  2187. if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
  2188. vm_map_mergeable_neighbors(prev_entry, entry)) {
  2189. vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
  2190. vm_map_merged_neighbor_dispose(map, prev_entry);
  2191. return (entry);
  2192. }
  2193. return (prev_entry);
  2194. }
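/*
 * Added commentary (not from the original source): when a merge
 * happens it is prev_entry that is unlinked; UNLINK_MERGE_NEXT makes
 * "entry" absorb prev_entry's range by moving its start (and offset)
 * down.  Callers must therefore continue with the returned entry and
 * must not dereference prev_entry afterwards.
 */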
  2195. /*
  2196. * vm_map_entry_back:
  2197. *
  2198. * Allocate an object to back a map entry.
  2199. */
  2200. static inline void
  2201. vm_map_entry_back(vm_map_entry_t entry)
  2202. {
  2203. vm_object_t object;
  2204. KASSERT(entry->object.vm_object == NULL,
  2205. ("map entry %p has backing object", entry));
  2206. KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
  2207. ("map entry %p is a submap", entry));
  2208. object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
  2209. entry->cred, entry->end - entry->start);
  2210. entry->object.vm_object = object;
  2211. entry->offset = 0;
  2212. entry->cred = NULL;
  2213. }
  2214. /*
  2215. * vm_map_entry_charge_object
  2216. *
  2217. * If there is no object backing this entry, create one. Otherwise, if
  2218. * the entry has cred, give it to the backing object.
  2219. */
  2220. static inline void
  2221. vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
  2222. {
  2223. VM_MAP_ASSERT_LOCKED(map);
  2224. KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
  2225. ("map entry %p is a submap", entry));
  2226. if (entry->object.vm_object == NULL && !map->system_map &&
  2227. (entry->eflags & MAP_ENTRY_GUARD) == 0)
  2228. vm_map_entry_back(entry);
  2229. else if (entry->object.vm_object != NULL &&
  2230. ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
  2231. entry->cred != NULL) {
  2232. VM_OBJECT_WLOCK(entry->object.vm_object);
  2233. KASSERT(entry->object.vm_object->cred == NULL,
  2234. ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
  2235. entry->object.vm_object->cred = entry->cred;
  2236. entry->object.vm_object->charge = entry->end - entry->start;
  2237. VM_OBJECT_WUNLOCK(entry->object.vm_object);
  2238. entry->cred = NULL;
  2239. }
  2240. }
  2241. /*
  2242. * vm_map_entry_clone
  2243. *
  2244. * Create a duplicate map entry for clipping.
  2245. */
  2246. static vm_map_entry_t
  2247. vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
  2248. {
  2249. vm_map_entry_t new_entry;
  2250. VM_MAP_ASSERT_LOCKED(map);
  2251. /*
  2252. * Create a backing object now, if none exists, so that more individual
  2253. * objects won't be created after the map entry is split.
  2254. */
  2255. vm_map_entry_charge_object(map, entry);
  2256. /* Clone the entry. */
  2257. new_entry = vm_map_entry_create(map);
  2258. *new_entry = *entry;
  2259. if (new_entry->cred != NULL)
  2260. crhold(entry->cred);
  2261. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
  2262. vm_object_reference(new_entry->object.vm_object);
  2263. vm_map_entry_set_vnode_text(new_entry, true);
  2264. /*
  2265. * The object->un_pager.vnp.writemappings for the object of
  2266. * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
  2267. * virtual pages are re-distributed among the clipped entries,
  2268. * so the sum is left the same.
  2269. */
  2270. }
  2271. return (new_entry);
  2272. }
  2273. /*
  2274. * vm_map_clip_start: [ internal use only ]
  2275. *
  2276. * Asserts that the given entry begins at or after
  2277. * the specified address; if necessary,
  2278. * it splits the entry into two.
  2279. */
  2280. static int
  2281. vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
  2282. {
  2283. vm_map_entry_t new_entry;
  2284. int bdry_idx;
  2285. if (!map->system_map)
  2286. WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
  2287. "%s: map %p entry %p start 0x%jx", __func__, map, entry,
  2288. (uintmax_t)startaddr);
  2289. if (startaddr <= entry->start)
  2290. return (KERN_SUCCESS);
  2291. VM_MAP_ASSERT_LOCKED(map);
  2292. KASSERT(entry->end > startaddr && entry->start < startaddr,
  2293. ("%s: invalid clip of entry %p", __func__, entry));
  2294. bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
  2295. if (bdry_idx != 0) {
  2296. if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
  2297. return (KERN_INVALID_ARGUMENT);
  2298. }
  2299. new_entry = vm_map_entry_clone(map, entry);
  2300. /*
  2301. * Split off the front portion. Insert the new entry BEFORE this one,
  2302. * so that this entry has the specified starting address.
  2303. */
  2304. new_entry->end = startaddr;
  2305. vm_map_entry_link(map, new_entry);
  2306. return (KERN_SUCCESS);
  2307. }
  2308. /*
  2309. * vm_map_lookup_clip_start:
  2310. *
  2311. * Find the entry at or just after 'start', and clip it if 'start' is in
  2312. * the interior of the entry. Return entry after 'start', and in
  2313. * prev_entry set the entry before 'start'.
  2314. */
  2315. static int
  2316. vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
  2317. vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
  2318. {
  2319. vm_map_entry_t entry;
  2320. int rv;
  2321. if (!map->system_map)
  2322. WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
  2323. "%s: map %p start 0x%jx prev %p", __func__, map,
  2324. (uintmax_t)start, prev_entry);
  2325. if (vm_map_lookup_entry(map, start, prev_entry)) {
  2326. entry = *prev_entry;
  2327. rv = vm_map_clip_start(map, entry, start);
  2328. if (rv != KERN_SUCCESS)
  2329. return (rv);
  2330. *prev_entry = vm_map_entry_pred(entry);
  2331. } else
  2332. entry = vm_map_entry_succ(*prev_entry);
  2333. *res_entry = entry;
  2334. return (KERN_SUCCESS);
  2335. }
  2336. /*
  2337. * vm_map_clip_end: [ internal use only ]
  2338. *
  2339. * Asserts that the given entry ends at or before
  2340. * the specified address; if necessary,
  2341. * it splits the entry into two.
  2342. */
  2343. static int
  2344. vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
  2345. {
  2346. vm_map_entry_t new_entry;
  2347. int bdry_idx;
  2348. if (!map->system_map)
  2349. WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
  2350. "%s: map %p entry %p end 0x%jx", __func__, map, entry,
  2351. (uintmax_t)endaddr);
  2352. if (endaddr >= entry->end)
  2353. return (KERN_SUCCESS);
  2354. VM_MAP_ASSERT_LOCKED(map);
  2355. KASSERT(entry->start < endaddr && entry->end > endaddr,
  2356. ("%s: invalid clip of entry %p", __func__, entry));
  2357. bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
  2358. if (bdry_idx != 0) {
  2359. if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
  2360. return (KERN_INVALID_ARGUMENT);
  2361. }
  2362. new_entry = vm_map_entry_clone(map, entry);
  2363. /*
  2364. * Split off the back portion. Insert the new entry AFTER this one,
  2365. * so that this entry has the specified ending address.
  2366. */
  2367. new_entry->start = endaddr;
  2368. vm_map_entry_link(map, new_entry);
  2369. return (KERN_SUCCESS);
  2370. }
  2371. /*
  2372. * vm_map_submap: [ kernel use only ]
  2373. *
  2374. * Mark the given range as handled by a subordinate map.
  2375. *
  2376. * This range must have been created with vm_map_find,
  2377. * and no other operations may have been performed on this
  2378. * range prior to calling vm_map_submap.
  2379. *
  2380. * Only a limited number of operations can be performed
  2381. * within this range after calling vm_map_submap:
  2382. * vm_fault
  2383. * [Don't try vm_map_copy!]
  2384. *
  2385. * To remove a submapping, one must first remove the
  2386. * range from the superior map, and then destroy the
  2387. * submap (if desired). [Better yet, don't try it.]
  2388. */
  2389. int
  2390. vm_map_submap(
  2391. vm_map_t map,
  2392. vm_offset_t start,
  2393. vm_offset_t end,
  2394. vm_map_t submap)
  2395. {
  2396. vm_map_entry_t entry;
  2397. int result;
  2398. result = KERN_INVALID_ARGUMENT;
  2399. vm_map_lock(submap);
  2400. submap->flags |= MAP_IS_SUB_MAP;
  2401. vm_map_unlock(submap);
  2402. vm_map_lock(map);
  2403. VM_MAP_RANGE_CHECK(map, start, end);
  2404. if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
  2405. (entry->eflags & MAP_ENTRY_COW) == 0 &&
  2406. entry->object.vm_object == NULL) {
  2407. result = vm_map_clip_start(map, entry, start);
  2408. if (result != KERN_SUCCESS)
  2409. goto unlock;
  2410. result = vm_map_clip_end(map, entry, end);
  2411. if (result != KERN_SUCCESS)
  2412. goto unlock;
  2413. entry->object.sub_map = submap;
  2414. entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
  2415. result = KERN_SUCCESS;
  2416. }
  2417. unlock:
  2418. vm_map_unlock(map);
  2419. if (result != KERN_SUCCESS) {
  2420. vm_map_lock(submap);
  2421. submap->flags &= ~MAP_IS_SUB_MAP;
  2422. vm_map_unlock(submap);
  2423. }
  2424. return (result);
  2425. }
  2426. /*
  2427. * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
  2428. */
  2429. #define MAX_INIT_PT 96
  2430. /*
  2431. * vm_map_pmap_enter:
  2432. *
  2433. * Preload the specified map's pmap with mappings to the specified
  2434. * object's memory-resident pages. No further physical pages are
  2435. * allocated, and no further virtual pages are retrieved from secondary
  2436. * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
  2437. * limited number of page mappings are created at the low-end of the
  2438. * specified address range. (For this purpose, a superpage mapping
  2439. * counts as one page mapping.) Otherwise, all resident pages within
  2440. * the specified address range are mapped.
  2441. */
  2442. static void
  2443. vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
  2444. vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
  2445. {
  2446. vm_offset_t start;
  2447. vm_page_t p, p_start;
  2448. vm_pindex_t mask, psize, threshold, tmpidx;
  2449. int psind;
  2450. if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
  2451. return;
  2452. if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
  2453. VM_OBJECT_WLOCK(object);
  2454. if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
  2455. pmap_object_init_pt(map->pmap, addr, object, pindex,
  2456. size);
  2457. VM_OBJECT_WUNLOCK(object);
  2458. return;
  2459. }
  2460. VM_OBJECT_LOCK_DOWNGRADE(object);
  2461. } else
  2462. VM_OBJECT_RLOCK(object);
  2463. psize = atop(size);
  2464. if (psize + pindex > object->size) {
  2465. if (pindex >= object->size) {
  2466. VM_OBJECT_RUNLOCK(object);
  2467. return;
  2468. }
  2469. psize = object->size - pindex;
  2470. }
  2471. start = 0;
  2472. p_start = NULL;
  2473. threshold = MAX_INIT_PT;
  2474. p = vm_page_find_least(object, pindex);
  2475. /*
  2476. * Assert: the variable p is either (1) the page with the
  2477. * least pindex greater than or equal to the parameter pindex
  2478. * or (2) NULL.
  2479. */
  2480. for (;
  2481. p != NULL && (tmpidx = p->pindex - pindex) < psize;
  2482. p = TAILQ_NEXT(p, listq)) {
  2483. /*
2484. * Don't allow madvise to blow away our really free
2485. * pages by allocating pv entries.
  2486. */
  2487. if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
  2488. vm_page_count_severe()) ||
  2489. ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
  2490. tmpidx >= threshold)) {
  2491. psize = tmpidx;
  2492. break;
  2493. }
  2494. if (vm_page_all_valid(p)) {
  2495. if (p_start == NULL) {
  2496. start = addr + ptoa(tmpidx);
  2497. p_start = p;
  2498. }
  2499. /* Jump ahead if a superpage mapping is possible. */
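/*
 * For example, with 4 KB base pages and 2 MB superpages (e.g., amd64's
 * psind 1), the virtual address must be 2 MB aligned and backed by 512
 * fully valid pages that fit within the requested range; "mask" is then
 * 511 and the loop below skips past those pages as a single mapping.
 */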
  2500. for (psind = p->psind; psind > 0; psind--) {
  2501. if (((addr + ptoa(tmpidx)) &
  2502. (pagesizes[psind] - 1)) == 0) {
  2503. mask = atop(pagesizes[psind]) - 1;
  2504. if (tmpidx + mask < psize &&
  2505. vm_page_ps_test(p, psind,
  2506. PS_ALL_VALID, NULL)) {
  2507. p += mask;
  2508. threshold += mask;
  2509. break;
  2510. }
  2511. }
  2512. }
  2513. } else if (p_start != NULL) {
  2514. pmap_enter_object(map->pmap, start, addr +
  2515. ptoa(tmpidx), p_start, prot);
  2516. p_start = NULL;
  2517. }
  2518. }
  2519. if (p_start != NULL)
  2520. pmap_enter_object(map->pmap, start, addr + ptoa(psize),
  2521. p_start, prot);
  2522. VM_OBJECT_RUNLOCK(object);
  2523. }
  2524. static void
  2525. vm_map_protect_guard(vm_map_entry_t entry, vm_prot_t new_prot,
  2526. vm_prot_t new_maxprot, int flags)
  2527. {
  2528. vm_prot_t old_prot;
  2529. MPASS((entry->eflags & MAP_ENTRY_GUARD) != 0);
  2530. if ((entry->eflags & (MAP_ENTRY_STACK_GAP_UP |
  2531. MAP_ENTRY_STACK_GAP_DN)) == 0)
  2532. return;
  2533. old_prot = PROT_EXTRACT(entry->offset);
  2534. if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
  2535. entry->offset = PROT_MAX(new_maxprot) |
  2536. (new_maxprot & old_prot);
  2537. }
  2538. if ((flags & VM_MAP_PROTECT_SET_PROT) != 0) {
  2539. entry->offset = new_prot | PROT_MAX(
  2540. PROT_MAX_EXTRACT(entry->offset));
  2541. }
  2542. }
  2543. /*
  2544. * vm_map_protect:
  2545. *
  2546. * Sets the protection and/or the maximum protection of the
  2547. * specified address region in the target map.
  2548. */
  2549. int
  2550. vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
  2551. vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
  2552. {
  2553. vm_map_entry_t entry, first_entry, in_tran, prev_entry;
  2554. vm_object_t obj;
  2555. struct ucred *cred;
  2556. vm_offset_t orig_start;
  2557. vm_prot_t check_prot, max_prot, old_prot;
  2558. int rv;
  2559. if (start == end)
  2560. return (KERN_SUCCESS);
  2561. if (CONTAINS_BITS(flags, VM_MAP_PROTECT_SET_PROT |
  2562. VM_MAP_PROTECT_SET_MAXPROT) &&
  2563. !CONTAINS_BITS(new_maxprot, new_prot))
  2564. return (KERN_OUT_OF_BOUNDS);
  2565. orig_start = start;
  2566. again:
  2567. in_tran = NULL;
  2568. start = orig_start;
  2569. vm_map_lock(map);
  2570. if ((map->flags & MAP_WXORX) != 0 &&
  2571. (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
  2572. CONTAINS_BITS(new_prot, VM_PROT_WRITE | VM_PROT_EXECUTE)) {
  2573. vm_map_unlock(map);
  2574. return (KERN_PROTECTION_FAILURE);
  2575. }
  2576. /*
  2577. * Ensure that we are not concurrently wiring pages. vm_map_wire() may
  2578. * need to fault pages into the map and will drop the map lock while
  2579. * doing so, and the VM object may end up in an inconsistent state if we
  2580. * update the protection on the map entry in between faults.
  2581. */
  2582. vm_map_wait_busy(map);
  2583. VM_MAP_RANGE_CHECK(map, start, end);
  2584. if (!vm_map_lookup_entry(map, start, &first_entry))
  2585. first_entry = vm_map_entry_succ(first_entry);
  2586. if ((flags & VM_MAP_PROTECT_GROWSDOWN) != 0 &&
  2587. (first_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
  2588. /*
  2589. * Handle Linux's PROT_GROWSDOWN flag.
  2590. * It means that protection is applied down to the
  2591. * whole stack, including the specified range of the
  2592. * mapped region, and the grow down region (AKA
  2593. * guard).
  2594. */
  2595. while (!CONTAINS_BITS(first_entry->eflags,
  2596. MAP_ENTRY_GUARD | MAP_ENTRY_STACK_GAP_DN) &&
  2597. first_entry != vm_map_entry_first(map))
  2598. first_entry = vm_map_entry_pred(first_entry);
  2599. start = first_entry->start;
  2600. }
  2601. /*
  2602. * Make a first pass to check for protection violations.
  2603. */
  2604. check_prot = 0;
  2605. if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
  2606. check_prot |= new_prot;
  2607. if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0)
  2608. check_prot |= new_maxprot;
  2609. for (entry = first_entry; entry->start < end;
  2610. entry = vm_map_entry_succ(entry)) {
  2611. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
  2612. vm_map_unlock(map);
  2613. return (KERN_INVALID_ARGUMENT);
  2614. }
  2615. if ((entry->eflags & (MAP_ENTRY_GUARD |
  2616. MAP_ENTRY_STACK_GAP_DN | MAP_ENTRY_STACK_GAP_UP)) ==
  2617. MAP_ENTRY_GUARD)
  2618. continue;
  2619. max_prot = (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
  2620. MAP_ENTRY_STACK_GAP_UP)) != 0 ?
  2621. PROT_MAX_EXTRACT(entry->offset) : entry->max_protection;
  2622. if (!CONTAINS_BITS(max_prot, check_prot)) {
  2623. vm_map_unlock(map);
  2624. return (KERN_PROTECTION_FAILURE);
  2625. }
  2626. if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
  2627. in_tran = entry;
  2628. }
  2629. /*
  2630. * Postpone the operation until all in-transition map entries have
  2631. * stabilized. An in-transition entry might already have its pages
  2632. * wired and wired_count incremented, but not yet have its
2633. MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
  2634. * vm_fault_copy_entry() in the final loop below.
  2635. */
  2636. if (in_tran != NULL) {
  2637. in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
  2638. vm_map_unlock_and_wait(map, 0);
  2639. goto again;
  2640. }
  2641. /*
  2642. * Before changing the protections, try to reserve swap space for any
  2643. * private (i.e., copy-on-write) mappings that are transitioning from
  2644. * read-only to read/write access. If a reservation fails, break out
  2645. * of this loop early and let the next loop simplify the entries, since
  2646. * some may now be mergeable.
  2647. */
  2648. rv = vm_map_clip_start(map, first_entry, start);
  2649. if (rv != KERN_SUCCESS) {
  2650. vm_map_unlock(map);
  2651. return (rv);
  2652. }
  2653. for (entry = first_entry; entry->start < end;
  2654. entry = vm_map_entry_succ(entry)) {
  2655. rv = vm_map_clip_end(map, entry, end);
  2656. if (rv != KERN_SUCCESS) {
  2657. vm_map_unlock(map);
  2658. return (rv);
  2659. }
  2660. if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
  2661. ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
  2662. ENTRY_CHARGED(entry) ||
  2663. (entry->eflags & MAP_ENTRY_GUARD) != 0)
  2664. continue;
  2665. cred = curthread->td_ucred;
  2666. obj = entry->object.vm_object;
  2667. if (obj == NULL ||
  2668. (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
  2669. if (!swap_reserve(entry->end - entry->start)) {
  2670. rv = KERN_RESOURCE_SHORTAGE;
  2671. end = entry->end;
  2672. break;
  2673. }
  2674. crhold(cred);
  2675. entry->cred = cred;
  2676. continue;
  2677. }
  2678. VM_OBJECT_WLOCK(obj);
  2679. if ((obj->flags & OBJ_SWAP) == 0) {
  2680. VM_OBJECT_WUNLOCK(obj);
  2681. continue;
  2682. }
  2683. /*
  2684. * Charge for the whole object allocation now, since
  2685. * we cannot distinguish between non-charged and
  2686. * charged clipped mapping of the same object later.
  2687. */
  2688. KASSERT(obj->charge == 0,
  2689. ("vm_map_protect: object %p overcharged (entry %p)",
  2690. obj, entry));
  2691. if (!swap_reserve(ptoa(obj->size))) {
  2692. VM_OBJECT_WUNLOCK(obj);
  2693. rv = KERN_RESOURCE_SHORTAGE;
  2694. end = entry->end;
  2695. break;
  2696. }
  2697. crhold(cred);
  2698. obj->cred = cred;
  2699. obj->charge = ptoa(obj->size);
  2700. VM_OBJECT_WUNLOCK(obj);
  2701. }
  2702. /*
  2703. * If enough swap space was available, go back and fix up protections.
  2704. * Otherwise, just simplify entries, since some may have been modified.
  2705. * [Note that clipping is not necessary the second time.]
  2706. */
  2707. for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
  2708. entry->start < end;
  2709. vm_map_try_merge_entries(map, prev_entry, entry),
  2710. prev_entry = entry, entry = vm_map_entry_succ(entry)) {
  2711. if (rv != KERN_SUCCESS)
  2712. continue;
  2713. if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
  2714. vm_map_protect_guard(entry, new_prot, new_maxprot,
  2715. flags);
  2716. continue;
  2717. }
  2718. old_prot = entry->protection;
  2719. if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
  2720. entry->max_protection = new_maxprot;
  2721. entry->protection = new_maxprot & old_prot;
  2722. }
  2723. if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
  2724. entry->protection = new_prot;
  2725. /*
  2726. * For user wired map entries, the normal lazy evaluation of
  2727. * write access upgrades through soft page faults is
  2728. * undesirable. Instead, immediately copy any pages that are
  2729. * copy-on-write and enable write access in the physical map.
  2730. */
  2731. if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
  2732. (entry->protection & VM_PROT_WRITE) != 0 &&
  2733. (old_prot & VM_PROT_WRITE) == 0)
  2734. vm_fault_copy_entry(map, map, entry, entry, NULL);
  2735. /*
  2736. * When restricting access, update the physical map. Worry
  2737. * about copy-on-write here.
  2738. */
  2739. if ((old_prot & ~entry->protection) != 0) {
  2740. #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
  2741. VM_PROT_ALL)
  2742. pmap_protect(map->pmap, entry->start,
  2743. entry->end,
  2744. entry->protection & MASK(entry));
  2745. #undef MASK
  2746. }
  2747. }
  2748. vm_map_try_merge_entries(map, prev_entry, entry);
  2749. vm_map_unlock(map);
  2750. return (rv);
  2751. }
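/*
 * Usage sketch (illustrative): downgrading a page-aligned, already mapped
 * range to read-only without touching its maximum protection; "map",
 * "start", and "end" are assumed to be supplied by the caller.
 *
 *	rv = vm_map_protect(map, start, end, VM_PROT_READ, VM_PROT_NONE,
 *	    VM_MAP_PROTECT_SET_PROT);
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));
 */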
  2752. /*
  2753. * vm_map_madvise:
  2754. *
2755. * This routine traverses a process's map handling the madvise
2756. * system call. Advisories are classified as either those affecting
2757. * the vm_map_entry structure or those affecting the underlying
2758. * objects.
  2759. */
  2760. int
  2761. vm_map_madvise(
  2762. vm_map_t map,
  2763. vm_offset_t start,
  2764. vm_offset_t end,
  2765. int behav)
  2766. {
  2767. vm_map_entry_t entry, prev_entry;
  2768. int rv;
  2769. bool modify_map;
  2770. /*
  2771. * Some madvise calls directly modify the vm_map_entry, in which case
  2772. * we need to use an exclusive lock on the map and we need to perform
  2773. * various clipping operations. Otherwise we only need a read-lock
  2774. * on the map.
  2775. */
  2776. switch(behav) {
  2777. case MADV_NORMAL:
  2778. case MADV_SEQUENTIAL:
  2779. case MADV_RANDOM:
  2780. case MADV_NOSYNC:
  2781. case MADV_AUTOSYNC:
  2782. case MADV_NOCORE:
  2783. case MADV_CORE:
  2784. if (start == end)
  2785. return (0);
  2786. modify_map = true;
  2787. vm_map_lock(map);
  2788. break;
  2789. case MADV_WILLNEED:
  2790. case MADV_DONTNEED:
  2791. case MADV_FREE:
  2792. if (start == end)
  2793. return (0);
  2794. modify_map = false;
  2795. vm_map_lock_read(map);
  2796. break;
  2797. default:
  2798. return (EINVAL);
  2799. }
  2800. /*
  2801. * Locate starting entry and clip if necessary.
  2802. */
  2803. VM_MAP_RANGE_CHECK(map, start, end);
  2804. if (modify_map) {
  2805. /*
  2806. * madvise behaviors that are implemented in the vm_map_entry.
  2807. *
  2808. * We clip the vm_map_entry so that behavioral changes are
  2809. * limited to the specified address range.
  2810. */
  2811. rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
  2812. if (rv != KERN_SUCCESS) {
  2813. vm_map_unlock(map);
  2814. return (vm_mmap_to_errno(rv));
  2815. }
  2816. for (; entry->start < end; prev_entry = entry,
  2817. entry = vm_map_entry_succ(entry)) {
  2818. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
  2819. continue;
  2820. rv = vm_map_clip_end(map, entry, end);
  2821. if (rv != KERN_SUCCESS) {
  2822. vm_map_unlock(map);
  2823. return (vm_mmap_to_errno(rv));
  2824. }
  2825. switch (behav) {
  2826. case MADV_NORMAL:
  2827. vm_map_entry_set_behavior(entry,
  2828. MAP_ENTRY_BEHAV_NORMAL);
  2829. break;
  2830. case MADV_SEQUENTIAL:
  2831. vm_map_entry_set_behavior(entry,
  2832. MAP_ENTRY_BEHAV_SEQUENTIAL);
  2833. break;
  2834. case MADV_RANDOM:
  2835. vm_map_entry_set_behavior(entry,
  2836. MAP_ENTRY_BEHAV_RANDOM);
  2837. break;
  2838. case MADV_NOSYNC:
  2839. entry->eflags |= MAP_ENTRY_NOSYNC;
  2840. break;
  2841. case MADV_AUTOSYNC:
  2842. entry->eflags &= ~MAP_ENTRY_NOSYNC;
  2843. break;
  2844. case MADV_NOCORE:
  2845. entry->eflags |= MAP_ENTRY_NOCOREDUMP;
  2846. break;
  2847. case MADV_CORE:
  2848. entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
  2849. break;
  2850. default:
  2851. break;
  2852. }
  2853. vm_map_try_merge_entries(map, prev_entry, entry);
  2854. }
  2855. vm_map_try_merge_entries(map, prev_entry, entry);
  2856. vm_map_unlock(map);
  2857. } else {
  2858. vm_pindex_t pstart, pend;
  2859. /*
  2860. * madvise behaviors that are implemented in the underlying
  2861. * vm_object.
  2862. *
  2863. * Since we don't clip the vm_map_entry, we have to clip
  2864. * the vm_object pindex and count.
  2865. */
  2866. if (!vm_map_lookup_entry(map, start, &entry))
  2867. entry = vm_map_entry_succ(entry);
  2868. for (; entry->start < end;
  2869. entry = vm_map_entry_succ(entry)) {
  2870. vm_offset_t useEnd, useStart;
  2871. if ((entry->eflags & (MAP_ENTRY_IS_SUB_MAP |
  2872. MAP_ENTRY_GUARD)) != 0)
  2873. continue;
  2874. /*
  2875. * MADV_FREE would otherwise rewind time to
  2876. * the creation of the shadow object. Because
  2877. * we hold the VM map read-locked, neither the
  2878. * entry's object nor the presence of a
  2879. * backing object can change.
  2880. */
  2881. if (behav == MADV_FREE &&
  2882. entry->object.vm_object != NULL &&
  2883. entry->object.vm_object->backing_object != NULL)
  2884. continue;
  2885. pstart = OFF_TO_IDX(entry->offset);
  2886. pend = pstart + atop(entry->end - entry->start);
  2887. useStart = entry->start;
  2888. useEnd = entry->end;
  2889. if (entry->start < start) {
  2890. pstart += atop(start - entry->start);
  2891. useStart = start;
  2892. }
  2893. if (entry->end > end) {
  2894. pend -= atop(entry->end - end);
  2895. useEnd = end;
  2896. }
  2897. if (pstart >= pend)
  2898. continue;
  2899. /*
  2900. * Perform the pmap_advise() before clearing
  2901. * PGA_REFERENCED in vm_page_advise(). Otherwise, a
  2902. * concurrent pmap operation, such as pmap_remove(),
  2903. * could clear a reference in the pmap and set
  2904. * PGA_REFERENCED on the page before the pmap_advise()
  2905. * had completed. Consequently, the page would appear
  2906. * referenced based upon an old reference that
  2907. * occurred before this pmap_advise() ran.
  2908. */
  2909. if (behav == MADV_DONTNEED || behav == MADV_FREE)
  2910. pmap_advise(map->pmap, useStart, useEnd,
  2911. behav);
  2912. vm_object_madvise(entry->object.vm_object, pstart,
  2913. pend, behav);
  2914. /*
  2915. * Pre-populate paging structures in the
  2916. * WILLNEED case. For wired entries, the
  2917. * paging structures are already populated.
  2918. */
  2919. if (behav == MADV_WILLNEED &&
  2920. entry->wired_count == 0) {
  2921. vm_map_pmap_enter(map,
  2922. useStart,
  2923. entry->protection,
  2924. entry->object.vm_object,
  2925. pstart,
  2926. ptoa(pend - pstart),
  2927. MAP_PREFAULT_MADVISE
  2928. );
  2929. }
  2930. }
  2931. vm_map_unlock_read(map);
  2932. }
  2933. return (0);
  2934. }
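/*
 * Usage sketch (illustrative): requesting read-ahead for a range and later
 * releasing it once it is no longer needed; "map", "start", and "end" are
 * assumed to be page-aligned values supplied by the caller.
 *
 *	(void)vm_map_madvise(map, start, end, MADV_WILLNEED);
 *	...
 *	(void)vm_map_madvise(map, start, end, MADV_FREE);
 *
 * Note that vm_map_madvise() returns 0 or an errno value (e.g., EINVAL),
 * unlike most vm_map functions, which return KERN_* codes.
 */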
  2935. /*
  2936. * vm_map_inherit:
  2937. *
  2938. * Sets the inheritance of the specified address
  2939. * range in the target map. Inheritance
  2940. * affects how the map will be shared with
  2941. * child maps at the time of vmspace_fork.
  2942. */
  2943. int
  2944. vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
  2945. vm_inherit_t new_inheritance)
  2946. {
  2947. vm_map_entry_t entry, lentry, prev_entry, start_entry;
  2948. int rv;
  2949. switch (new_inheritance) {
  2950. case VM_INHERIT_NONE:
  2951. case VM_INHERIT_COPY:
  2952. case VM_INHERIT_SHARE:
  2953. case VM_INHERIT_ZERO:
  2954. break;
  2955. default:
  2956. return (KERN_INVALID_ARGUMENT);
  2957. }
  2958. if (start == end)
  2959. return (KERN_SUCCESS);
  2960. vm_map_lock(map);
  2961. VM_MAP_RANGE_CHECK(map, start, end);
  2962. rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
  2963. if (rv != KERN_SUCCESS)
  2964. goto unlock;
  2965. if (vm_map_lookup_entry(map, end - 1, &lentry)) {
  2966. rv = vm_map_clip_end(map, lentry, end);
  2967. if (rv != KERN_SUCCESS)
  2968. goto unlock;
  2969. }
  2970. if (new_inheritance == VM_INHERIT_COPY) {
  2971. for (entry = start_entry; entry->start < end;
  2972. prev_entry = entry, entry = vm_map_entry_succ(entry)) {
  2973. if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
  2974. != 0) {
  2975. rv = KERN_INVALID_ARGUMENT;
  2976. goto unlock;
  2977. }
  2978. }
  2979. }
  2980. for (entry = start_entry; entry->start < end; prev_entry = entry,
  2981. entry = vm_map_entry_succ(entry)) {
  2982. KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
  2983. entry, (uintmax_t)entry->end, (uintmax_t)end));
  2984. if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
  2985. new_inheritance != VM_INHERIT_ZERO)
  2986. entry->inheritance = new_inheritance;
  2987. vm_map_try_merge_entries(map, prev_entry, entry);
  2988. }
  2989. vm_map_try_merge_entries(map, prev_entry, entry);
  2990. unlock:
  2991. vm_map_unlock(map);
  2992. return (rv);
  2993. }
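/*
 * Usage sketch (illustrative): marking a range so that a child created by
 * fork() sees zero-filled memory there instead of a copy; the range is
 * assumed to be already mapped.
 *
 *	rv = vm_map_inherit(map, start, end, VM_INHERIT_ZERO);
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));
 */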
  2994. /*
  2995. * vm_map_entry_in_transition:
  2996. *
  2997. * Release the map lock, and sleep until the entry is no longer in
2998. * transition, then reacquire the map lock. If the map changed while
2999. * another thread held the lock, look up a possibly-changed entry at or after
  3000. * 'start' position of the old entry.
  3001. */
  3002. static vm_map_entry_t
  3003. vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
  3004. vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
  3005. {
  3006. vm_map_entry_t entry;
  3007. vm_offset_t start;
  3008. u_int last_timestamp;
  3009. VM_MAP_ASSERT_LOCKED(map);
  3010. KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
  3011. ("not in-tranition map entry %p", in_entry));
  3012. /*
  3013. * We have not yet clipped the entry.
  3014. */
  3015. start = MAX(in_start, in_entry->start);
  3016. in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
  3017. last_timestamp = map->timestamp;
  3018. if (vm_map_unlock_and_wait(map, 0)) {
  3019. /*
  3020. * Allow interruption of user wiring/unwiring?
  3021. */
  3022. }
  3023. vm_map_lock(map);
  3024. if (last_timestamp + 1 == map->timestamp)
  3025. return (in_entry);
  3026. /*
  3027. * Look again for the entry because the map was modified while it was
  3028. * unlocked. Specifically, the entry may have been clipped, merged, or
  3029. * deleted.
  3030. */
  3031. if (!vm_map_lookup_entry(map, start, &entry)) {
  3032. if (!holes_ok) {
  3033. *io_end = start;
  3034. return (NULL);
  3035. }
  3036. entry = vm_map_entry_succ(entry);
  3037. }
  3038. return (entry);
  3039. }
  3040. /*
  3041. * vm_map_unwire:
  3042. *
  3043. * Implements both kernel and user unwiring.
  3044. */
  3045. int
  3046. vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
  3047. int flags)
  3048. {
  3049. vm_map_entry_t entry, first_entry, next_entry, prev_entry;
  3050. int rv;
  3051. bool holes_ok, need_wakeup, user_unwire;
  3052. if (start == end)
  3053. return (KERN_SUCCESS);
  3054. holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
  3055. user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
  3056. vm_map_lock(map);
  3057. VM_MAP_RANGE_CHECK(map, start, end);
  3058. if (!vm_map_lookup_entry(map, start, &first_entry)) {
  3059. if (holes_ok)
  3060. first_entry = vm_map_entry_succ(first_entry);
  3061. else {
  3062. vm_map_unlock(map);
  3063. return (KERN_INVALID_ADDRESS);
  3064. }
  3065. }
  3066. rv = KERN_SUCCESS;
  3067. for (entry = first_entry; entry->start < end; entry = next_entry) {
  3068. if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
  3069. /*
  3070. * We have not yet clipped the entry.
  3071. */
  3072. next_entry = vm_map_entry_in_transition(map, start,
  3073. &end, holes_ok, entry);
  3074. if (next_entry == NULL) {
  3075. if (entry == first_entry) {
  3076. vm_map_unlock(map);
  3077. return (KERN_INVALID_ADDRESS);
  3078. }
  3079. rv = KERN_INVALID_ADDRESS;
  3080. break;
  3081. }
  3082. first_entry = (entry == first_entry) ?
  3083. next_entry : NULL;
  3084. continue;
  3085. }
  3086. rv = vm_map_clip_start(map, entry, start);
  3087. if (rv != KERN_SUCCESS)
  3088. break;
  3089. rv = vm_map_clip_end(map, entry, end);
  3090. if (rv != KERN_SUCCESS)
  3091. break;
  3092. /*
  3093. * Mark the entry in case the map lock is released. (See
  3094. * above.)
  3095. */
  3096. KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
  3097. entry->wiring_thread == NULL,
  3098. ("owned map entry %p", entry));
  3099. entry->eflags |= MAP_ENTRY_IN_TRANSITION;
  3100. entry->wiring_thread = curthread;
  3101. next_entry = vm_map_entry_succ(entry);
  3102. /*
  3103. * Check the map for holes in the specified region.
  3104. * If holes_ok, skip this check.
  3105. */
  3106. if (!holes_ok &&
  3107. entry->end < end && next_entry->start > entry->end) {
  3108. end = entry->end;
  3109. rv = KERN_INVALID_ADDRESS;
  3110. break;
  3111. }
  3112. /*
  3113. * If system unwiring, require that the entry is system wired.
  3114. */
  3115. if (!user_unwire &&
  3116. vm_map_entry_system_wired_count(entry) == 0) {
  3117. end = entry->end;
  3118. rv = KERN_INVALID_ARGUMENT;
  3119. break;
  3120. }
  3121. }
  3122. need_wakeup = false;
  3123. if (first_entry == NULL &&
  3124. !vm_map_lookup_entry(map, start, &first_entry)) {
  3125. KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
  3126. prev_entry = first_entry;
  3127. entry = vm_map_entry_succ(first_entry);
  3128. } else {
  3129. prev_entry = vm_map_entry_pred(first_entry);
  3130. entry = first_entry;
  3131. }
  3132. for (; entry->start < end;
  3133. prev_entry = entry, entry = vm_map_entry_succ(entry)) {
  3134. /*
  3135. * If holes_ok was specified, an empty
  3136. * space in the unwired region could have been mapped
  3137. * while the map lock was dropped for draining
  3138. * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
  3139. * could be simultaneously wiring this new mapping
  3140. * entry. Detect these cases and skip any entries
3141. not marked as in transition by us.
  3142. */
  3143. if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
  3144. entry->wiring_thread != curthread) {
  3145. KASSERT(holes_ok,
  3146. ("vm_map_unwire: !HOLESOK and new/changed entry"));
  3147. continue;
  3148. }
  3149. if (rv == KERN_SUCCESS && (!user_unwire ||
  3150. (entry->eflags & MAP_ENTRY_USER_WIRED))) {
  3151. if (entry->wired_count == 1)
  3152. vm_map_entry_unwire(map, entry);
  3153. else
  3154. entry->wired_count--;
  3155. if (user_unwire)
  3156. entry->eflags &= ~MAP_ENTRY_USER_WIRED;
  3157. }
  3158. KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
  3159. ("vm_map_unwire: in-transition flag missing %p", entry));
  3160. KASSERT(entry->wiring_thread == curthread,
  3161. ("vm_map_unwire: alien wire %p", entry));
  3162. entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
  3163. entry->wiring_thread = NULL;
  3164. if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
  3165. entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
  3166. need_wakeup = true;
  3167. }
  3168. vm_map_try_merge_entries(map, prev_entry, entry);
  3169. }
  3170. vm_map_try_merge_entries(map, prev_entry, entry);
  3171. vm_map_unlock(map);
  3172. if (need_wakeup)
  3173. vm_map_wakeup(map);
  3174. return (rv);
  3175. }
  3176. static void
  3177. vm_map_wire_user_count_sub(u_long npages)
  3178. {
  3179. atomic_subtract_long(&vm_user_wire_count, npages);
  3180. }
  3181. static bool
  3182. vm_map_wire_user_count_add(u_long npages)
  3183. {
  3184. u_long wired;
  3185. wired = vm_user_wire_count;
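/*
 * Lock-free reservation: retry the fcmpset until either the global
 * count is advanced by npages or adding npages would exceed the
 * vm_page_max_user_wired limit.
 */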
  3186. do {
  3187. if (npages + wired > vm_page_max_user_wired)
  3188. return (false);
  3189. } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
  3190. npages + wired));
  3191. return (true);
  3192. }
  3193. /*
  3194. * vm_map_wire_entry_failure:
  3195. *
  3196. * Handle a wiring failure on the given entry.
  3197. *
  3198. * The map should be locked.
  3199. */
  3200. static void
  3201. vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
  3202. vm_offset_t failed_addr)
  3203. {
  3204. VM_MAP_ASSERT_LOCKED(map);
  3205. KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
  3206. entry->wired_count == 1,
  3207. ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
  3208. KASSERT(failed_addr < entry->end,
  3209. ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
  3210. /*
  3211. * If any pages at the start of this entry were successfully wired,
  3212. * then unwire them.
  3213. */
  3214. if (failed_addr > entry->start) {
  3215. pmap_unwire(map->pmap, entry->start, failed_addr);
  3216. vm_object_unwire(entry->object.vm_object, entry->offset,
  3217. failed_addr - entry->start, PQ_ACTIVE);
  3218. }
  3219. /*
  3220. * Assign an out-of-range value to represent the failure to wire this
  3221. * entry.
  3222. */
  3223. entry->wired_count = -1;
  3224. }
  3225. int
  3226. vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
  3227. {
  3228. int rv;
  3229. vm_map_lock(map);
  3230. rv = vm_map_wire_locked(map, start, end, flags);
  3231. vm_map_unlock(map);
  3232. return (rv);
  3233. }
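/*
 * Usage sketch (illustrative): user-wiring a buffer for the duration of an
 * operation and unwiring it afterwards, tolerating holes in the range;
 * "map", "start", and "end" are assumed to come from the caller.
 *
 *	rv = vm_map_wire(map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 *	if (rv == KERN_SUCCESS) {
 *		... access the wired pages ...
 *		(void)vm_map_unwire(map, start, end,
 *		    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 *	}
 */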
  3234. /*
  3235. * vm_map_wire_locked:
  3236. *
3237. * Implements both kernel and user wiring. Returns with the map locked;
3238. * the map lock may be dropped and reacquired during the call.
  3239. */
  3240. int
  3241. vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
  3242. {
  3243. vm_map_entry_t entry, first_entry, next_entry, prev_entry;
  3244. vm_offset_t faddr, saved_end, saved_start;
  3245. u_long incr, npages;
  3246. u_int bidx, last_timestamp;
  3247. int rv;
  3248. bool holes_ok, need_wakeup, user_wire;
  3249. vm_prot_t prot;
  3250. VM_MAP_ASSERT_LOCKED(map);
  3251. if (start == end)
  3252. return (KERN_SUCCESS);
  3253. prot = 0;
  3254. if (flags & VM_MAP_WIRE_WRITE)
  3255. prot |= VM_PROT_WRITE;
  3256. holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
  3257. user_wire = (flags & VM_MAP_WIRE_USER) != 0;
  3258. VM_MAP_RANGE_CHECK(map, start, end);
  3259. if (!vm_map_lookup_entry(map, start, &first_entry)) {
  3260. if (holes_ok)
  3261. first_entry = vm_map_entry_succ(first_entry);
  3262. else
  3263. return (KERN_INVALID_ADDRESS);
  3264. }
  3265. for (entry = first_entry; entry->start < end; entry = next_entry) {
  3266. if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
  3267. /*
  3268. * We have not yet clipped the entry.
  3269. */
  3270. next_entry = vm_map_entry_in_transition(map, start,
  3271. &end, holes_ok, entry);
  3272. if (next_entry == NULL) {
  3273. if (entry == first_entry)
  3274. return (KERN_INVALID_ADDRESS);
  3275. rv = KERN_INVALID_ADDRESS;
  3276. goto done;
  3277. }
  3278. first_entry = (entry == first_entry) ?
  3279. next_entry : NULL;
  3280. continue;
  3281. }
  3282. rv = vm_map_clip_start(map, entry, start);
  3283. if (rv != KERN_SUCCESS)
  3284. goto done;
  3285. rv = vm_map_clip_end(map, entry, end);
  3286. if (rv != KERN_SUCCESS)
  3287. goto done;
  3288. /*
  3289. * Mark the entry in case the map lock is released. (See
  3290. * above.)
  3291. */
  3292. KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
  3293. entry->wiring_thread == NULL,
  3294. ("owned map entry %p", entry));
  3295. entry->eflags |= MAP_ENTRY_IN_TRANSITION;
  3296. entry->wiring_thread = curthread;
  3297. if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
  3298. || (entry->protection & prot) != prot) {
  3299. entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
  3300. if (!holes_ok) {
  3301. end = entry->end;
  3302. rv = KERN_INVALID_ADDRESS;
  3303. goto done;
  3304. }
  3305. } else if (entry->wired_count == 0) {
  3306. entry->wired_count++;
  3307. npages = atop(entry->end - entry->start);
  3308. if (user_wire && !vm_map_wire_user_count_add(npages)) {
  3309. vm_map_wire_entry_failure(map, entry,
  3310. entry->start);
  3311. end = entry->end;
  3312. rv = KERN_RESOURCE_SHORTAGE;
  3313. goto done;
  3314. }
  3315. /*
  3316. * Release the map lock, relying on the in-transition
  3317. * mark. Mark the map busy for fork.
  3318. */
  3319. saved_start = entry->start;
  3320. saved_end = entry->end;
  3321. last_timestamp = map->timestamp;
  3322. bidx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
  3323. incr = pagesizes[bidx];
  3324. vm_map_busy(map);
  3325. vm_map_unlock(map);
  3326. for (faddr = saved_start; faddr < saved_end;
  3327. faddr += incr) {
  3328. /*
  3329. * Simulate a fault to get the page and enter
  3330. * it into the physical map.
  3331. */
  3332. rv = vm_fault(map, faddr, VM_PROT_NONE,
  3333. VM_FAULT_WIRE, NULL);
  3334. if (rv != KERN_SUCCESS)
  3335. break;
  3336. }
  3337. vm_map_lock(map);
  3338. vm_map_unbusy(map);
  3339. if (last_timestamp + 1 != map->timestamp) {
  3340. /*
  3341. * Look again for the entry because the map was
  3342. * modified while it was unlocked. The entry
  3343. * may have been clipped, but NOT merged or
  3344. * deleted.
  3345. */
  3346. if (!vm_map_lookup_entry(map, saved_start,
  3347. &next_entry))
  3348. KASSERT(false,
  3349. ("vm_map_wire: lookup failed"));
  3350. first_entry = (entry == first_entry) ?
  3351. next_entry : NULL;
  3352. for (entry = next_entry; entry->end < saved_end;
  3353. entry = vm_map_entry_succ(entry)) {
  3354. /*
  3355. * In case of failure, handle entries
  3356. * that were not fully wired here;
  3357. * fully wired entries are handled
  3358. * later.
  3359. */
  3360. if (rv != KERN_SUCCESS &&
  3361. faddr < entry->end)
  3362. vm_map_wire_entry_failure(map,
  3363. entry, faddr);
  3364. }
  3365. }
  3366. if (rv != KERN_SUCCESS) {
  3367. vm_map_wire_entry_failure(map, entry, faddr);
  3368. if (user_wire)
  3369. vm_map_wire_user_count_sub(npages);
  3370. end = entry->end;
  3371. goto done;
  3372. }
  3373. } else if (!user_wire ||
  3374. (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
  3375. entry->wired_count++;
  3376. }
  3377. /*
  3378. * Check the map for holes in the specified region.
  3379. * If holes_ok was specified, skip this check.
  3380. */
  3381. next_entry = vm_map_entry_succ(entry);
  3382. if (!holes_ok &&
  3383. entry->end < end && next_entry->start > entry->end) {
  3384. end = entry->end;
  3385. rv = KERN_INVALID_ADDRESS;
  3386. goto done;
  3387. }
  3388. }
  3389. rv = KERN_SUCCESS;
  3390. done:
  3391. need_wakeup = false;
  3392. if (first_entry == NULL &&
  3393. !vm_map_lookup_entry(map, start, &first_entry)) {
  3394. KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
  3395. prev_entry = first_entry;
  3396. entry = vm_map_entry_succ(first_entry);
  3397. } else {
  3398. prev_entry = vm_map_entry_pred(first_entry);
  3399. entry = first_entry;
  3400. }
  3401. for (; entry->start < end;
  3402. prev_entry = entry, entry = vm_map_entry_succ(entry)) {
  3403. /*
  3404. * If holes_ok was specified, an empty
  3405. * space in the unwired region could have been mapped
  3406. * while the map lock was dropped for faulting in the
  3407. * pages or draining MAP_ENTRY_IN_TRANSITION.
  3408. * Moreover, another thread could be simultaneously
  3409. * wiring this new mapping entry. Detect these cases
3410. and skip any entries not marked as in transition by us.
  3411. *
  3412. * Another way to get an entry not marked with
  3413. * MAP_ENTRY_IN_TRANSITION is after failed clipping,
  3414. * which set rv to KERN_INVALID_ARGUMENT.
  3415. */
  3416. if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
  3417. entry->wiring_thread != curthread) {
  3418. KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
  3419. ("vm_map_wire: !HOLESOK and new/changed entry"));
  3420. continue;
  3421. }
  3422. if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
  3423. /* do nothing */
  3424. } else if (rv == KERN_SUCCESS) {
  3425. if (user_wire)
  3426. entry->eflags |= MAP_ENTRY_USER_WIRED;
  3427. } else if (entry->wired_count == -1) {
  3428. /*
  3429. * Wiring failed on this entry. Thus, unwiring is
  3430. * unnecessary.
  3431. */
  3432. entry->wired_count = 0;
  3433. } else if (!user_wire ||
  3434. (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
  3435. /*
  3436. * Undo the wiring. Wiring succeeded on this entry
  3437. * but failed on a later entry.
  3438. */
  3439. if (entry->wired_count == 1) {
  3440. vm_map_entry_unwire(map, entry);
  3441. if (user_wire)
  3442. vm_map_wire_user_count_sub(
  3443. atop(entry->end - entry->start));
  3444. } else
  3445. entry->wired_count--;
  3446. }
  3447. KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
  3448. ("vm_map_wire: in-transition flag missing %p", entry));
  3449. KASSERT(entry->wiring_thread == curthread,
  3450. ("vm_map_wire: alien wire %p", entry));
  3451. entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
  3452. MAP_ENTRY_WIRE_SKIPPED);
  3453. entry->wiring_thread = NULL;
  3454. if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
  3455. entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
  3456. need_wakeup = true;
  3457. }
  3458. vm_map_try_merge_entries(map, prev_entry, entry);
  3459. }
  3460. vm_map_try_merge_entries(map, prev_entry, entry);
  3461. if (need_wakeup)
  3462. vm_map_wakeup(map);
  3463. return (rv);
  3464. }
  3465. /*
  3466. * vm_map_sync
  3467. *
  3468. * Push any dirty cached pages in the address range to their pager.
  3469. * If syncio is TRUE, dirty pages are written synchronously.
  3470. * If invalidate is TRUE, any cached pages are freed as well.
  3471. *
  3472. * If the size of the region from start to end is zero, we are
  3473. * supposed to flush all modified pages within the region containing
  3474. * start. Unfortunately, a region can be split or coalesced with
  3475. * neighboring regions, making it difficult to determine what the
  3476. * original region was. Therefore, we approximate this requirement by
  3477. * flushing the current region containing start.
  3478. *
  3479. * Returns an error if any part of the specified range is not mapped.
  3480. */
  3481. int
  3482. vm_map_sync(
  3483. vm_map_t map,
  3484. vm_offset_t start,
  3485. vm_offset_t end,
  3486. boolean_t syncio,
  3487. boolean_t invalidate)
  3488. {
  3489. vm_map_entry_t entry, first_entry, next_entry;
  3490. vm_size_t size;
  3491. vm_object_t object;
  3492. vm_ooffset_t offset;
  3493. unsigned int last_timestamp;
  3494. int bdry_idx;
  3495. boolean_t failed;
  3496. vm_map_lock_read(map);
  3497. VM_MAP_RANGE_CHECK(map, start, end);
  3498. if (!vm_map_lookup_entry(map, start, &first_entry)) {
  3499. vm_map_unlock_read(map);
  3500. return (KERN_INVALID_ADDRESS);
  3501. } else if (start == end) {
  3502. start = first_entry->start;
  3503. end = first_entry->end;
  3504. }
  3505. /*
  3506. * Make a first pass to check for user-wired memory, holes,
  3507. * and partial invalidation of largepage mappings.
  3508. */
  3509. for (entry = first_entry; entry->start < end; entry = next_entry) {
  3510. if (invalidate) {
  3511. if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
  3512. vm_map_unlock_read(map);
  3513. return (KERN_INVALID_ARGUMENT);
  3514. }
  3515. bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
  3516. if (bdry_idx != 0 &&
  3517. ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
  3518. (end & (pagesizes[bdry_idx] - 1)) != 0)) {
  3519. vm_map_unlock_read(map);
  3520. return (KERN_INVALID_ARGUMENT);
  3521. }
  3522. }
  3523. next_entry = vm_map_entry_succ(entry);
  3524. if (end > entry->end &&
  3525. entry->end != next_entry->start) {
  3526. vm_map_unlock_read(map);
  3527. return (KERN_INVALID_ADDRESS);
  3528. }
  3529. }
  3530. if (invalidate)
  3531. pmap_remove(map->pmap, start, end);
  3532. failed = FALSE;
  3533. /*
  3534. * Make a second pass, cleaning/uncaching pages from the indicated
  3535. * objects as we go.
  3536. */
  3537. for (entry = first_entry; entry->start < end;) {
  3538. offset = entry->offset + (start - entry->start);
  3539. size = (end <= entry->end ? end : entry->end) - start;
  3540. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
  3541. vm_map_t smap;
  3542. vm_map_entry_t tentry;
  3543. vm_size_t tsize;
  3544. smap = entry->object.sub_map;
  3545. vm_map_lock_read(smap);
  3546. (void) vm_map_lookup_entry(smap, offset, &tentry);
  3547. tsize = tentry->end - offset;
  3548. if (tsize < size)
  3549. size = tsize;
  3550. object = tentry->object.vm_object;
  3551. offset = tentry->offset + (offset - tentry->start);
  3552. vm_map_unlock_read(smap);
  3553. } else {
  3554. object = entry->object.vm_object;
  3555. }
  3556. vm_object_reference(object);
  3557. last_timestamp = map->timestamp;
  3558. vm_map_unlock_read(map);
  3559. if (!vm_object_sync(object, offset, size, syncio, invalidate))
  3560. failed = TRUE;
  3561. start += size;
  3562. vm_object_deallocate(object);
  3563. vm_map_lock_read(map);
  3564. if (last_timestamp == map->timestamp ||
  3565. !vm_map_lookup_entry(map, start, &entry))
  3566. entry = vm_map_entry_succ(entry);
  3567. }
  3568. vm_map_unlock_read(map);
  3569. return (failed ? KERN_FAILURE : KERN_SUCCESS);
  3570. }
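/*
 * Usage sketch (illustrative): an msync(2)-style synchronous flush of a
 * range followed by invalidation of the cached pages; the arguments are
 * assumed to be validated by the caller.
 *
 *	rv = vm_map_sync(map, start, end, TRUE, TRUE);
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));
 */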
  3571. /*
  3572. * vm_map_entry_unwire: [ internal use only ]
  3573. *
  3574. * Make the region specified by this entry pageable.
  3575. *
  3576. * The map in question should be locked.
  3577. * [This is the reason for this routine's existence.]
  3578. */
  3579. static void
  3580. vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
  3581. {
  3582. vm_size_t size;
  3583. VM_MAP_ASSERT_LOCKED(map);
  3584. KASSERT(entry->wired_count > 0,
  3585. ("vm_map_entry_unwire: entry %p isn't wired", entry));
  3586. size = entry->end - entry->start;
  3587. if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
  3588. vm_map_wire_user_count_sub(atop(size));
  3589. pmap_unwire(map->pmap, entry->start, entry->end);
  3590. vm_object_unwire(entry->object.vm_object, entry->offset, size,
  3591. PQ_ACTIVE);
  3592. entry->wired_count = 0;
  3593. }
  3594. static void
  3595. vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
  3596. {
  3597. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
  3598. vm_object_deallocate(entry->object.vm_object);
  3599. uma_zfree(system_map ? kmapentzone : mapentzone, entry);
  3600. }
  3601. /*
  3602. * vm_map_entry_delete: [ internal use only ]
  3603. *
  3604. * Deallocate the given entry from the target map.
  3605. */
  3606. static void
  3607. vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
  3608. {
  3609. vm_object_t object;
  3610. vm_pindex_t offidxstart, offidxend, size1;
  3611. vm_size_t size;
  3612. vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
  3613. object = entry->object.vm_object;
  3614. if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
  3615. MPASS(entry->cred == NULL);
  3616. MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
  3617. MPASS(object == NULL);
  3618. vm_map_entry_deallocate(entry, map->system_map);
  3619. return;
  3620. }
  3621. size = entry->end - entry->start;
  3622. map->size -= size;
  3623. if (entry->cred != NULL) {
  3624. swap_release_by_cred(size, entry->cred);
  3625. crfree(entry->cred);
  3626. }
  3627. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
  3628. entry->object.vm_object = NULL;
  3629. } else if ((object->flags & OBJ_ANON) != 0 ||
  3630. object == kernel_object) {
  3631. KASSERT(entry->cred == NULL || object->cred == NULL ||
  3632. (entry->eflags & MAP_ENTRY_NEEDS_COPY),
  3633. ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
  3634. offidxstart = OFF_TO_IDX(entry->offset);
  3635. offidxend = offidxstart + atop(size);
  3636. VM_OBJECT_WLOCK(object);
  3637. if (object->ref_count != 1 &&
  3638. ((object->flags & OBJ_ONEMAPPING) != 0 ||
  3639. object == kernel_object)) {
  3640. vm_object_collapse(object);
  3641. /*
  3642. * The option OBJPR_NOTMAPPED can be passed here
  3643. * because vm_map_delete() already performed
  3644. * pmap_remove() on the only mapping to this range
  3645. * of pages.
  3646. */
  3647. vm_object_page_remove(object, offidxstart, offidxend,
  3648. OBJPR_NOTMAPPED);
  3649. if (offidxend >= object->size &&
  3650. offidxstart < object->size) {
  3651. size1 = object->size;
  3652. object->size = offidxstart;
  3653. if (object->cred != NULL) {
  3654. size1 -= object->size;
  3655. KASSERT(object->charge >= ptoa(size1),
  3656. ("object %p charge < 0", object));
  3657. swap_release_by_cred(ptoa(size1),
  3658. object->cred);
  3659. object->charge -= ptoa(size1);
  3660. }
  3661. }
  3662. }
  3663. VM_OBJECT_WUNLOCK(object);
  3664. }
  3665. if (map->system_map)
  3666. vm_map_entry_deallocate(entry, TRUE);
  3667. else {
  3668. entry->defer_next = curthread->td_map_def_user;
  3669. curthread->td_map_def_user = entry;
  3670. }
  3671. }
  3672. /*
  3673. * vm_map_delete: [ internal use only ]
  3674. *
  3675. * Deallocates the given address range from the target
  3676. * map.
  3677. */
  3678. int
  3679. vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
  3680. {
  3681. vm_map_entry_t entry, next_entry, scratch_entry;
  3682. int rv;
  3683. VM_MAP_ASSERT_LOCKED(map);
  3684. if (start == end)
  3685. return (KERN_SUCCESS);
  3686. /*
  3687. * Find the start of the region, and clip it.
  3688. * Step through all entries in this region.
  3689. */
  3690. rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
  3691. if (rv != KERN_SUCCESS)
  3692. return (rv);
  3693. for (; entry->start < end; entry = next_entry) {
  3694. /*
  3695. * Wait for wiring or unwiring of an entry to complete.
  3696. * Also wait for any system wirings to disappear on
  3697. * user maps.
  3698. */
  3699. if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
  3700. (vm_map_pmap(map) != kernel_pmap &&
  3701. vm_map_entry_system_wired_count(entry) != 0)) {
  3702. unsigned int last_timestamp;
  3703. vm_offset_t saved_start;
  3704. saved_start = entry->start;
  3705. entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
  3706. last_timestamp = map->timestamp;
  3707. (void) vm_map_unlock_and_wait(map, 0);
  3708. vm_map_lock(map);
  3709. if (last_timestamp + 1 != map->timestamp) {
  3710. /*
  3711. * Look again for the entry because the map was
  3712. * modified while it was unlocked.
  3713. * Specifically, the entry may have been
  3714. * clipped, merged, or deleted.
  3715. */
  3716. rv = vm_map_lookup_clip_start(map, saved_start,
  3717. &next_entry, &scratch_entry);
  3718. if (rv != KERN_SUCCESS)
  3719. break;
  3720. } else
  3721. next_entry = entry;
  3722. continue;
  3723. }
  3724. /* XXXKIB or delete to the upper superpage boundary ? */
  3725. rv = vm_map_clip_end(map, entry, end);
  3726. if (rv != KERN_SUCCESS)
  3727. break;
  3728. next_entry = vm_map_entry_succ(entry);
  3729. /*
  3730. * Unwire before removing addresses from the pmap; otherwise,
  3731. * unwiring will put the entries back in the pmap.
  3732. */
  3733. if (entry->wired_count != 0)
  3734. vm_map_entry_unwire(map, entry);
  3735. /*
  3736. * Remove mappings for the pages, but only if the
  3737. * mappings could exist. For instance, it does not
  3738. * make sense to call pmap_remove() for guard entries.
  3739. */
  3740. if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
  3741. entry->object.vm_object != NULL)
  3742. pmap_map_delete(map->pmap, entry->start, entry->end);
  3743. /*
  3744. * Delete the entry only after removing all pmap
  3745. * entries pointing to its pages. (Otherwise, its
  3746. * page frames may be reallocated, and any modify bits
  3747. * will be set in the wrong object!)
  3748. */
  3749. vm_map_entry_delete(map, entry);
  3750. }
  3751. return (rv);
  3752. }
  3753. /*
  3754. * vm_map_remove:
  3755. *
  3756. * Remove the given address range from the target map.
  3757. * This is the exported form of vm_map_delete.
  3758. */
  3759. int
  3760. vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
  3761. {
  3762. int result;
  3763. vm_map_lock(map);
  3764. VM_MAP_RANGE_CHECK(map, start, end);
  3765. result = vm_map_delete(map, start, end);
  3766. vm_map_unlock(map);
  3767. return (result);
  3768. }
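/*
 * Usage sketch (illustrative): a munmap(2)-style removal of a page-aligned
 * range; vm_map_remove() takes the map lock itself, so the caller must not
 * already hold it.
 *
 *	rv = vm_map_remove(map, trunc_page(addr), round_page(addr + len));
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));
 */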
  3769. /*
  3770. * vm_map_check_protection:
  3771. *
  3772. * Assert that the target map allows the specified privilege on the
  3773. * entire address region given. The entire region must be allocated.
  3774. *
  3775. * WARNING! This code does not and should not check whether the
3776. * contents of the region are accessible. For example, a smaller file
  3777. * might be mapped into a larger address space.
  3778. *
  3779. * NOTE! This code is also called by munmap().
  3780. *
  3781. * The map must be locked. A read lock is sufficient.
  3782. */
  3783. boolean_t
  3784. vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
  3785. vm_prot_t protection)
  3786. {
  3787. vm_map_entry_t entry;
  3788. vm_map_entry_t tmp_entry;
  3789. if (!vm_map_lookup_entry(map, start, &tmp_entry))
  3790. return (FALSE);
  3791. entry = tmp_entry;
  3792. while (start < end) {
  3793. /*
  3794. * No holes allowed!
  3795. */
  3796. if (start < entry->start)
  3797. return (FALSE);
  3798. /*
  3799. * Check protection associated with entry.
  3800. */
  3801. if ((entry->protection & protection) != protection)
  3802. return (FALSE);
  3803. /* go to next entry */
  3804. start = entry->end;
  3805. entry = vm_map_entry_succ(entry);
  3806. }
  3807. return (TRUE);
  3808. }
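/*
 * Usage sketch (illustrative): verifying, under a read lock, that an entire
 * range is mapped readable before proceeding.
 *
 *	vm_map_lock_read(map);
 *	ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
 *	vm_map_unlock_read(map);
 *	if (!ok)
 *		return (EFAULT);
 */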
  3809. /*
  3810. *
  3811. * vm_map_copy_swap_object:
  3812. *
  3813. * Copies a swap-backed object from an existing map entry to a
  3814. * new one. Carries forward the swap charge. May change the
  3815. * src object on return.
  3816. */
  3817. static void
  3818. vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
  3819. vm_offset_t size, vm_ooffset_t *fork_charge)
  3820. {
  3821. vm_object_t src_object;
  3822. struct ucred *cred;
  3823. int charged;
  3824. src_object = src_entry->object.vm_object;
  3825. charged = ENTRY_CHARGED(src_entry);
  3826. if ((src_object->flags & OBJ_ANON) != 0) {
  3827. VM_OBJECT_WLOCK(src_object);
  3828. vm_object_collapse(src_object);
  3829. if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
  3830. vm_object_split(src_entry);
  3831. src_object = src_entry->object.vm_object;
  3832. }
  3833. vm_object_reference_locked(src_object);
  3834. vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
  3835. VM_OBJECT_WUNLOCK(src_object);
  3836. } else
  3837. vm_object_reference(src_object);
  3838. if (src_entry->cred != NULL &&
  3839. !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
  3840. KASSERT(src_object->cred == NULL,
  3841. ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
  3842. src_object));
  3843. src_object->cred = src_entry->cred;
  3844. src_object->charge = size;
  3845. }
  3846. dst_entry->object.vm_object = src_object;
  3847. if (charged) {
  3848. cred = curthread->td_ucred;
  3849. crhold(cred);
  3850. dst_entry->cred = cred;
  3851. *fork_charge += size;
  3852. if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
  3853. crhold(cred);
  3854. src_entry->cred = cred;
  3855. *fork_charge += size;
  3856. }
  3857. }
  3858. }
  3859. /*
  3860. * vm_map_copy_entry:
  3861. *
  3862. * Copies the contents of the source entry to the destination
  3863. * entry. The entries *must* be aligned properly.
  3864. */
  3865. static void
  3866. vm_map_copy_entry(
  3867. vm_map_t src_map,
  3868. vm_map_t dst_map,
  3869. vm_map_entry_t src_entry,
  3870. vm_map_entry_t dst_entry,
  3871. vm_ooffset_t *fork_charge)
  3872. {
  3873. vm_object_t src_object;
  3874. vm_map_entry_t fake_entry;
  3875. vm_offset_t size;
  3876. VM_MAP_ASSERT_LOCKED(dst_map);
  3877. if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
  3878. return;
  3879. if (src_entry->wired_count == 0 ||
  3880. (src_entry->protection & VM_PROT_WRITE) == 0) {
  3881. /*
  3882. * If the source entry is marked needs_copy, it is already
  3883. * write-protected.
  3884. */
  3885. if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
  3886. (src_entry->protection & VM_PROT_WRITE) != 0) {
  3887. pmap_protect(src_map->pmap,
  3888. src_entry->start,
  3889. src_entry->end,
  3890. src_entry->protection & ~VM_PROT_WRITE);
  3891. }
  3892. /*
  3893. * Make a copy of the object.
  3894. */
  3895. size = src_entry->end - src_entry->start;
  3896. if ((src_object = src_entry->object.vm_object) != NULL) {
  3897. if ((src_object->flags & OBJ_SWAP) != 0) {
  3898. vm_map_copy_swap_object(src_entry, dst_entry,
  3899. size, fork_charge);
  3900. /* May have split/collapsed, reload obj. */
  3901. src_object = src_entry->object.vm_object;
  3902. } else {
  3903. vm_object_reference(src_object);
  3904. dst_entry->object.vm_object = src_object;
  3905. }
  3906. src_entry->eflags |= MAP_ENTRY_COW |
  3907. MAP_ENTRY_NEEDS_COPY;
  3908. dst_entry->eflags |= MAP_ENTRY_COW |
  3909. MAP_ENTRY_NEEDS_COPY;
  3910. dst_entry->offset = src_entry->offset;
  3911. if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
  3912. /*
  3913. * MAP_ENTRY_WRITECNT cannot
  3914. * indicate write reference from
  3915. * src_entry, since the entry is
  3916. * marked as needs copy. Allocate a
  3917. * fake entry that is used to
  3918. * decrement object->un_pager writecount
  3919. * at the appropriate time. Attach
  3920. * fake_entry to the deferred list.
  3921. */
  3922. fake_entry = vm_map_entry_create(dst_map);
  3923. fake_entry->eflags = MAP_ENTRY_WRITECNT;
  3924. src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
  3925. vm_object_reference(src_object);
  3926. fake_entry->object.vm_object = src_object;
  3927. fake_entry->start = src_entry->start;
  3928. fake_entry->end = src_entry->end;
  3929. fake_entry->defer_next =
  3930. curthread->td_map_def_user;
  3931. curthread->td_map_def_user = fake_entry;
  3932. }
  3933. pmap_copy(dst_map->pmap, src_map->pmap,
  3934. dst_entry->start, dst_entry->end - dst_entry->start,
  3935. src_entry->start);
  3936. } else {
  3937. dst_entry->object.vm_object = NULL;
  3938. if ((dst_entry->eflags & MAP_ENTRY_GUARD) == 0)
  3939. dst_entry->offset = 0;
  3940. if (src_entry->cred != NULL) {
  3941. dst_entry->cred = curthread->td_ucred;
  3942. crhold(dst_entry->cred);
  3943. *fork_charge += size;
  3944. }
  3945. }
  3946. } else {
  3947. /*
  3948. * We don't want to make writeable wired pages copy-on-write.
  3949. * Immediately copy these pages into the new map by simulating
  3950. * page faults. The new pages are pageable.
  3951. */
  3952. vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
  3953. fork_charge);
  3954. }
  3955. }
  3956. /*
  3957. * vmspace_map_entry_forked:
  3958. * Update the newly-forked vmspace each time a map entry is inherited
  3959. * or copied. The values for vm_dsize and vm_tsize are approximate
  3960. * (and mostly-obsolete ideas in the face of mmap(2) et al.)
  3961. */
  3962. static void
  3963. vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
  3964. vm_map_entry_t entry)
  3965. {
  3966. vm_size_t entrysize;
  3967. vm_offset_t newend;
  3968. if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
  3969. return;
  3970. entrysize = entry->end - entry->start;
  3971. vm2->vm_map.size += entrysize;
  3972. if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
  3973. vm2->vm_ssize += btoc(entrysize);
  3974. } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
  3975. entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
  3976. newend = MIN(entry->end,
  3977. (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
  3978. vm2->vm_dsize += btoc(newend - entry->start);
  3979. } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
  3980. entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
  3981. newend = MIN(entry->end,
  3982. (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
  3983. vm2->vm_tsize += btoc(newend - entry->start);
  3984. }
  3985. }
  3986. /*
  3987. * vmspace_fork:
 * Create a new process vmspace structure and vm_map
 * based on those of an existing process.  The new map
 * is based on the old map, according to the inheritance
 * values on the regions in that map.
 *
 * XXX It might be worth coalescing the entries added to the new vmspace.
 *
 * The source map must not be locked.
 */
struct vmspace *
vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
{
	struct vmspace *vm2;
	vm_map_t new_map, old_map;
	vm_map_entry_t new_entry, old_entry;
	vm_object_t object;
	int error, locked __diagused;
	vm_inherit_t inh;

	old_map = &vm1->vm_map;

	/* Copy immutable fields of vm1 to vm2. */
	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
	    pmap_pinit);
	if (vm2 == NULL)
		return (NULL);

	vm2->vm_taddr = vm1->vm_taddr;
	vm2->vm_daddr = vm1->vm_daddr;
	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
	vm2->vm_stacktop = vm1->vm_stacktop;
	vm2->vm_shp_base = vm1->vm_shp_base;
	vm_map_lock(old_map);
	if (old_map->busy)
		vm_map_wait_busy(old_map);
	new_map = &vm2->vm_map;
	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
	KASSERT(locked, ("vmspace_fork: lock failed"));

	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
	if (error != 0) {
		sx_xunlock(&old_map->lock);
		sx_xunlock(&new_map->lock);
		vm_map_process_deferred();
		vmspace_free(vm2);
		return (NULL);
	}

	new_map->anon_loc = old_map->anon_loc;
	new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
	    MAP_ASLR_STACK | MAP_WXORX);

	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			panic("vm_map_fork: encountered a submap");

		inh = old_entry->inheritance;
		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    inh != VM_INHERIT_NONE)
			inh = VM_INHERIT_COPY;

		switch (inh) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			/*
			 * Clone the entry, creating the shared object if
			 * necessary.
			 */
			object = old_entry->object.vm_object;
			if (object == NULL) {
				vm_map_entry_back(old_entry);
				object = old_entry->object.vm_object;
			}

			/*
			 * Add the reference before calling vm_object_shadow
			 * to ensure that a shadow object is created.
			 */
			vm_object_reference(object);
			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
				vm_object_shadow(&old_entry->object.vm_object,
				    &old_entry->offset,
				    old_entry->end - old_entry->start,
				    old_entry->cred,
				    /* Transfer the second reference too. */
				    true);
				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
				old_entry->cred = NULL;

				/*
				 * As in vm_map_merged_neighbor_dispose(),
				 * the vnode lock will not be acquired in
				 * this call to vm_object_deallocate().
				 */
				vm_object_deallocate(object);
				object = old_entry->object.vm_object;
			} else {
				VM_OBJECT_WLOCK(object);
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
				if (old_entry->cred != NULL) {
					KASSERT(object->cred == NULL,
					    ("vmspace_fork both cred"));
					object->cred = old_entry->cred;
					object->charge = old_entry->end -
					    old_entry->start;
					old_entry->cred = NULL;
				}

				/*
				 * Assert the correct state of the vnode
				 * v_writecount while the object is locked, to
				 * not relock it later for the assertion
				 * correctness.
				 */
				if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
				    object->type == OBJT_VNODE) {
					KASSERT(((struct vnode *)object->
					    handle)->v_writecount > 0,
					    ("vmspace_fork: v_writecount %p",
					    object));
					KASSERT(object->un_pager.vnp.
					    writemappings > 0,
					    ("vmspace_fork: vnp.writecount %p",
					    object));
				}
				VM_OBJECT_WUNLOCK(object);
			}

			/*
			 * Clone the entry, referencing the shared object.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
				vm_pager_update_writecount(object,
				    new_entry->start, new_entry->end);
			}
			vm_map_entry_set_vnode_text(new_entry, true);

			/*
			 * Insert the entry into the new map -- we know we're
			 * inserting at the end of the new map.
			 */
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			/*
			 * Update the physical map
			 */
			pmap_copy(new_map->pmap, old_map->pmap,
			    new_entry->start,
			    (old_entry->end - old_entry->start),
			    old_entry->start);
			break;

		case VM_INHERIT_COPY:
			/*
			 * Clone the entry and link into the map.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			/*
			 * Copied entry is COW over the old object.
			 */
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			new_entry->object.vm_object = NULL;
			new_entry->cred = NULL;
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);
			vm_map_copy_entry(old_map, new_map, old_entry,
			    new_entry, fork_charge);
			vm_map_entry_set_vnode_text(new_entry, true);
			break;

		case VM_INHERIT_ZERO:
			/*
			 * Create a new anonymous mapping entry modelled from
			 * the old one.
			 */
			new_entry = vm_map_entry_create(new_map);
			memset(new_entry, 0, sizeof(*new_entry));

			new_entry->start = old_entry->start;
			new_entry->end = old_entry->end;
			new_entry->eflags = old_entry->eflags &
			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
			    MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
			    MAP_ENTRY_SPLIT_BOUNDARY_MASK);
			new_entry->protection = old_entry->protection;
			new_entry->max_protection = old_entry->max_protection;
			new_entry->inheritance = VM_INHERIT_ZERO;

			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			new_entry->cred = curthread->td_ucred;
			crhold(new_entry->cred);
			*fork_charge += (new_entry->end - new_entry->start);

			break;
		}
	}
	/*
	 * Use inlined vm_map_unlock() to postpone handling the deferred
	 * map entries, which cannot be done until both old_map and
	 * new_map locks are released.
	 */
	sx_xunlock(&old_map->lock);
	sx_xunlock(&new_map->lock);
	vm_map_process_deferred();

	return (vm2);
}
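
/*
 * Usage note: the per-entry inheritance values consumed by the switch in
 * vmspace_fork() are normally set from userspace with minherit(2).  A
 * minimal userland sketch, assuming "addr" and "len" describe an existing
 * private mapping:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	// After this call the child of a later fork() receives a fresh,
 *	// zero-filled anonymous mapping for this range, i.e. the
 *	// VM_INHERIT_ZERO arm of the switch above.
 *	if (minherit(addr, len, INHERIT_ZERO) == -1)
 *		warn("minherit");
 *
 * INHERIT_SHARE, INHERIT_COPY and INHERIT_NONE select the other arms.
 */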

/*
 * Create a process's stack for exec_new_vmspace().  This function is never
 * asked to wire the newly created stack.
 */
int
vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_size_t growsize, init_ssize;
	rlim_t vmemlim;
	int rv;

	MPASS((map->flags & MAP_WIREFUTURE) == 0);
	growsize = sgrowsiz;
	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
	vm_map_lock(map);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
	/* If we would blow our VMEM resource limit, no go */
	if (map->size + init_ssize > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
	    max, cow);
out:
	vm_map_unlock(map);
	return (rv);
}
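
/*
 * A hypothetical call from an exec-like path, assuming a downward-growing
 * stack whose lowest address is "addrbos" and whose full reservation is
 * "maxssiz" bytes; the grow direction is piggybacked on the cow argument
 * (see vm_map_stack_locked()):
 *
 *	rv = vm_map_stack(map, addrbos, maxssiz,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));
 */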

static int stack_guard_page = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
    &stack_guard_page, 0,
    "Specifies the number of guard pages for a stack that grows");

static int
vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t gap_entry, new_entry, prev_entry;
	vm_offset_t bot, gap_bot, gap_top, top;
	vm_size_t init_ssize, sgp;
	int orient, rv;

	/*
	 * The stack orientation is piggybacked with the cow argument.
	 * Extract it into orient and mask the cow argument so that we
	 * don't pass it around further.
	 */
	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
	KASSERT(orient != 0, ("No stack grow direction"));
	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
	    ("bi-dir stack"));

	if (max_ssize == 0 ||
	    !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
		return (KERN_INVALID_ADDRESS);
	sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    (vm_size_t)stack_guard_page * PAGE_SIZE;
	if (sgp >= max_ssize)
		return (KERN_INVALID_ARGUMENT);

	init_ssize = growsize;
	if (max_ssize < init_ssize + sgp)
		init_ssize = max_ssize - sgp;

	/* If addr is already mapped, no go */
	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
		return (KERN_NO_SPACE);

	/*
	 * If we can't accommodate max_ssize in the current mapping, no go.
	 */
	if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
		return (KERN_NO_SPACE);

	/*
	 * We initially map a stack of only init_ssize.  We will grow as
	 * needed later.  Depending on the orientation of the stack (i.e.
	 * the grow direction) we either map at the top of the range, the
	 * bottom of the range or in the middle.
	 *
	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
	 * and cow to be 0.  Possibly we should eliminate these as input
	 * parameters, and just pass these values here in the insert call.
	 */
	if (orient == MAP_STACK_GROWS_DOWN) {
		bot = addrbos + max_ssize - init_ssize;
		top = bot + init_ssize;
		gap_bot = addrbos;
		gap_top = bot;
	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
		bot = addrbos;
		top = bot + init_ssize;
		gap_bot = top;
		gap_top = addrbos + max_ssize;
	}
	rv = vm_map_insert1(map, NULL, 0, bot, top, prot, max, cow,
	    &new_entry);
	if (rv != KERN_SUCCESS)
		return (rv);
	KASSERT(new_entry->end == top || new_entry->start == bot,
	    ("Bad entry start/end for new stack entry"));
	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
	if (gap_bot == gap_top)
		return (KERN_SUCCESS);
	rv = vm_map_insert1(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP), &gap_entry);
	if (rv == KERN_SUCCESS) {
		KASSERT((gap_entry->eflags & MAP_ENTRY_GUARD) != 0,
		    ("entry %p not gap %#x", gap_entry, gap_entry->eflags));
		KASSERT((gap_entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
		    MAP_ENTRY_STACK_GAP_UP)) != 0,
		    ("entry %p not stack gap %#x", gap_entry,
		    gap_entry->eflags));

		/*
		 * Gap can never successfully handle a fault, so
		 * read-ahead logic is never used for it.  Re-use
		 * next_read of the gap entry to store
		 * stack_guard_page for vm_map_growstack().
		 * Similarly, since a gap cannot have a backing object,
		 * store the original stack protections in the
		 * object offset.
		 */
		gap_entry->next_read = sgp;
		gap_entry->offset = prot | PROT_MAX(max);
	} else {
		(void)vm_map_delete(map, bot, top);
	}
	return (rv);
}
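
/*
 * The gap entry's "offset" overloading above uses the PROT_MAX() encoding
 * from sys/mman.h, which keeps the maximum protection in the upper bits.
 * As an illustration, with prot = VM_PROT_READ | VM_PROT_WRITE (0x3) and
 * max = VM_PROT_ALL (0x7):
 *
 *	offset = 0x3 | PROT_MAX(0x7) = 0x70003
 *	PROT_EXTRACT(offset)     == 0x3		(original prot)
 *	PROT_MAX_EXTRACT(offset) == 0x7		(original max)
 *
 * vm_map_growstack() relies on this when it recreates the stack entry.
 */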

/*
 * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
 * successfully grow the stack.
 */
static int
vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
{
	vm_map_entry_t stack_entry;
	struct proc *p;
	struct vmspace *vm;
	struct ucred *cred;
	vm_offset_t gap_end, gap_start, grow_start;
	vm_size_t grow_amount, guard, max_grow, sgp;
	vm_prot_t prot, max;
	rlim_t lmemlim, stacklim, vmemlim;
	int rv, rv1 __diagused;
	bool gap_deleted, grow_down, is_procstack;
#ifdef notyet
	uint64_t limit;
#endif
#ifdef RACCT
	int error __diagused;
#endif

	p = curproc;
	vm = p->p_vmspace;

	/*
	 * Disallow stack growth when the access is performed by a
	 * debugger or AIO daemon.  The reason is that the wrong
	 * resource limits are applied.
	 */
	if (p != initproc && (map != &p->p_vmspace->vm_map ||
	    p->p_textvp == NULL))
		return (KERN_FAILURE);

	MPASS(!map->system_map);

	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
	stacklim = lim_cur(curthread, RLIMIT_STACK);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
retry:
	/* If addr is not in a hole for a stack grow area, no need to grow. */
	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
		return (KERN_FAILURE);
	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
		return (KERN_SUCCESS);
	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
		stack_entry = vm_map_entry_succ(gap_entry);
		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
		    stack_entry->start != gap_entry->end)
			return (KERN_FAILURE);
		grow_amount = round_page(stack_entry->start - addr);
		grow_down = true;
	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
		stack_entry = vm_map_entry_pred(gap_entry);
		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
		    stack_entry->end != gap_entry->start)
			return (KERN_FAILURE);
		grow_amount = round_page(addr + 1 - stack_entry->end);
		grow_down = false;
	} else {
		return (KERN_FAILURE);
	}
	guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    gap_entry->next_read;
	max_grow = gap_entry->end - gap_entry->start;
	if (guard > max_grow)
		return (KERN_NO_SPACE);
	max_grow -= guard;
	if (grow_amount > max_grow)
		return (KERN_NO_SPACE);

	/*
	 * If this is the main process stack, see if we're over the stack
	 * limit.
	 */
	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
	    addr < (vm_offset_t)vm->vm_stacktop;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
		return (KERN_NO_SPACE);

#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (is_procstack && racct_set(p, RACCT_STACK,
		    ctob(vm->vm_ssize) + grow_amount)) {
			PROC_UNLOCK(p);
			return (KERN_NO_SPACE);
		}
		PROC_UNLOCK(p);
	}
#endif

	grow_amount = roundup(grow_amount, sgrowsiz);
	if (grow_amount > max_grow)
		grow_amount = max_grow;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
		grow_amount = trunc_page((vm_size_t)stacklim) -
		    ctob(vm->vm_ssize);
	}

#ifdef notyet
	PROC_LOCK(p);
	limit = racct_get_available(p, RACCT_STACK);
	PROC_UNLOCK(p);
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
		grow_amount = limit - ctob(vm->vm_ssize);
#endif

	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
			rv = KERN_NO_SPACE;
			goto out;
		}
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(p);
			if (racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
				PROC_UNLOCK(p);
				rv = KERN_NO_SPACE;
				goto out;
			}
			PROC_UNLOCK(p);
		}
#endif
	}

	/* If we would blow our VMEM resource limit, no go */
	if (map->size + grow_amount > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
			PROC_UNLOCK(p);
			rv = KERN_NO_SPACE;
			goto out;
		}
		PROC_UNLOCK(p);
	}
#endif

	if (vm_map_lock_upgrade(map)) {
		gap_entry = NULL;
		vm_map_lock_read(map);
		goto retry;
	}

	if (grow_down) {
		/*
		 * The gap_entry "offset" field is overloaded.  See
		 * vm_map_stack_locked().
		 */
		prot = PROT_EXTRACT(gap_entry->offset);
		max = PROT_MAX_EXTRACT(gap_entry->offset);
		sgp = gap_entry->next_read;

		grow_start = gap_entry->end - grow_amount;
		if (gap_entry->start + grow_amount == gap_entry->end) {
			gap_start = gap_entry->start;
			gap_end = gap_entry->end;
			vm_map_entry_delete(map, gap_entry);
			gap_deleted = true;
		} else {
			MPASS(gap_entry->start < gap_entry->end - grow_amount);
			vm_map_entry_resize(map, gap_entry, -grow_amount);
			gap_deleted = false;
		}
		rv = vm_map_insert(map, NULL, 0, grow_start,
		    grow_start + grow_amount, prot, max, MAP_STACK_GROWS_DOWN);
		if (rv != KERN_SUCCESS) {
			if (gap_deleted) {
				rv1 = vm_map_insert1(map, NULL, 0, gap_start,
				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN,
				    &gap_entry);
				MPASS(rv1 == KERN_SUCCESS);
				gap_entry->next_read = sgp;
				gap_entry->offset = prot | PROT_MAX(max);
			} else
				vm_map_entry_resize(map, gap_entry,
				    grow_amount);
		}
	} else {
		grow_start = stack_entry->end;
		cred = stack_entry->cred;
		if (cred == NULL && stack_entry->object.vm_object != NULL)
			cred = stack_entry->object.vm_object->cred;
		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
			rv = KERN_NO_SPACE;
		/* Grow the underlying object if applicable. */
		else if (stack_entry->object.vm_object == NULL ||
		    vm_object_coalesce(stack_entry->object.vm_object,
		    stack_entry->offset,
		    (vm_size_t)(stack_entry->end - stack_entry->start),
		    grow_amount, cred != NULL)) {
			if (gap_entry->start + grow_amount == gap_entry->end) {
				vm_map_entry_delete(map, gap_entry);
				vm_map_entry_resize(map, stack_entry,
				    grow_amount);
			} else {
				gap_entry->start += grow_amount;
				stack_entry->end += grow_amount;
			}
			map->size += grow_amount;
			rv = KERN_SUCCESS;
		} else
			rv = KERN_FAILURE;
	}
	if (rv == KERN_SUCCESS && is_procstack)
		vm->vm_ssize += btoc(grow_amount);

	/*
	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
	 */
	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
		rv = vm_map_wire_locked(map, grow_start,
		    grow_start + grow_amount,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	}
	vm_map_lock_downgrade(map);

out:
#ifdef RACCT
	if (racct_enable && rv != KERN_SUCCESS) {
		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, map->size);
		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
		if (!old_mlock) {
			error = racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)));
			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
		}
		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
		PROC_UNLOCK(p);
	}
#endif

	return (rv);
}
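
/*
 * Worked example for the grow-down case above, assuming 4 KB pages: if the
 * stack entry starts at 0x7fffdf000000 and the faulting address is
 * 0x7fffdeffe5a0, then
 *
 *	grow_amount = round_page(0x7fffdf000000 - 0x7fffdeffe5a0)
 *	            = round_page(0x1a60) = 0x2000
 *
 * i.e. two pages, which is then rounded up to sgrowsiz and clipped against
 * the gap size minus the guard, the stack rlimit, and the VMEM/MEMLOCK
 * limits before the new stack pages are inserted.
 */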

/*
 * Unshare the specified VM space for exec.  If other processes are
 * mapped to it, then create a new one.  The new vmspace contains no
 * mappings; the exec code populates it afterwards.
 */
int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;

	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
	    ("vmspace_exec recursed"));
	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
	if (newvmspace == NULL)
		return (ENOMEM);
	newvmspace->vm_swrss = oldvmspace->vm_swrss;
	/*
	 * This code is written like this for prototype purposes.  The
	 * goal is to avoid running down the vmspace here, but let the
	 * other processes that are still using the vmspace finally
	 * run it down.  Even though there is little or no chance of blocking
	 * here, it is a good idea to keep this form for future mods.
	 */
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	curthread->td_pflags |= TDP_EXECVMSPC;
	return (0);
}
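
/*
 * Note: TDP_EXECVMSPC marks the thread so that the exec code, once image
 * activation has finished or failed, knows that the old vmspace replaced
 * here still needs to be released with vmspace_free().
 */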

/*
 * Unshare the specified VM space for forcing COW.  This
 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
 */
int
vmspace_unshare(struct proc *p)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;
	vm_ooffset_t fork_charge;

	/*
	 * The caller is responsible for ensuring that the reference count
	 * cannot concurrently transition 1 -> 2.
	 */
	if (refcount_load(&oldvmspace->vm_refcnt) == 1)
		return (0);
	fork_charge = 0;
	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
	if (newvmspace == NULL)
		return (ENOMEM);
	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
		vmspace_free(newvmspace);
		return (ENOMEM);
	}
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	vmspace_free(oldvmspace);
	return (0);
}
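
/*
 * The practical effect of the fork-and-switch above is that the calling
 * process keeps its mappings according to their inheritance attributes:
 * for the default VM_INHERIT_COPY case the entries become copy-on-write,
 * so later writes by this process are no longer visible through the
 * previously shared vmspace, which keeps running unchanged.
 */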

/*
 *	vm_map_lookup:
 *
 *	Finds the VM object, offset, and
 *	protection for a given virtual address in the
 *	specified map, assuming a page fault of the
 *	type specified.
 *
 *	Leaves the map in question locked for read; return
 *	values are guaranteed until a vm_map_lookup_done
 *	call is performed.  Note that the map argument
 *	is in/out; the returned map must be used in
 *	the call to vm_map_lookup_done.
 *
 *	A handle (out_entry) is returned for use in
 *	vm_map_lookup_done, to make that fast.
 *
 *	If a lookup is requested with "write protection"
 *	specified, the map may be changed to perform virtual
 *	copying operations, although the data referenced will
 *	remain the same.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      vm_object_t *object,		/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      boolean_t *wired)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

RetryLookup:

	vm_map_lock_read(map);

RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	fault_type = fault_typea & VM_PROT_ALL;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;

	/*
	 * If the entry was copy-on-write, we either resolve the copy now
	 * or demote the permissions that are returned.
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain.  Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			eobject = entry->object.vm_object;
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size, entry->cred, false);
			if (eobject == entry->object.vm_object) {
				/*
				 * The object was not shadowed.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
			}
			entry->cred = NULL;
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.
	 */
	if (entry->object.vm_object == NULL && !map->system_map) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		entry->object.vm_object = vm_object_allocate_anon(atop(size),
		    NULL, entry->cred, size);
		entry->offset = 0;
		entry->cred = NULL;
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}
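
/*
 * A sketch of the expected pairing, modelled on the page fault handler
 * (the local variable names are illustrative):
 *
 *	rv = vm_map_lookup(&map, vaddr, fault_type, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... find or allocate the page at pindex in object ...
 *	vm_map_lookup_done(map, entry);
 *
 * The map returned through the in/out pointer must be the one passed to
 * vm_map_lookup_done(), since a submap may have been substituted above.
 */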

/*
 *	vm_map_lookup_locked:
 *
 *	Lookup the faulting address.  A version of vm_map_lookup that returns
 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
 */
int
vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
		     vm_offset_t vaddr,
		     vm_prot_t fault_typea,
		     vm_map_entry_t *out_entry,	/* OUT */
		     vm_object_t *object,	/* OUT */
		     vm_pindex_t *pindex,	/* OUT */
		     vm_prot_t *out_prot,	/* OUT */
		     boolean_t *wired)		/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;

	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry))
		return (KERN_INVALID_ADDRESS);

	entry = *out_entry;

	/*
	 * Fail if the entry refers to a submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
		return (KERN_FAILURE);

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;

	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * Fail if the entry was copy-on-write for a write fault.
		 */
		if (fault_type & VM_PROT_WRITE)
			return (KERN_FAILURE);
		/*
		 * We're attempting to read a copy-on-write page --
		 * don't allow writes.
		 */
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Fail if an object should be created.
	 */
	if (entry->object.vm_object == NULL && !map->system_map)
		return (KERN_FAILURE);

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}
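
/*
 * Unlike vm_map_lookup(), the caller here already holds the map lock and
 * keeps holding it; any case that would require upgrading the lock,
 * sleeping, or allocating (shadowing a COW entry, creating a backing
 * object) is reported as KERN_FAILURE instead of being resolved.
 */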

/*
 *	vm_map_lookup_done:
 *
 *	Releases locks acquired by a vm_map_lookup
 *	(according to the handle returned by that lookup).
 */
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
	/*
	 * Unlock the main-level map
	 */
	vm_map_unlock_read(map);
}

vm_offset_t
vm_map_max_KBI(const struct vm_map *map)
{
	return (vm_map_max(map));
}

vm_offset_t
vm_map_min_KBI(const struct vm_map *map)
{
	return (vm_map_min(map));
}

pmap_t
vm_map_pmap_KBI(vm_map_t map)
{
	return (map->pmap);
}

bool
vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	return (vm_map_range_valid(map, start, end));
}

#ifdef INVARIANTS
static void
_vm_map_assert_consistent(vm_map_t map, int check)
{
	vm_map_entry_t entry, prev;
	vm_map_entry_t cur, header, lbound, ubound;
	vm_size_t max_left, max_right;

#ifdef DIAGNOSTIC
	++map->nupdates;
#endif
	if (enable_vmmap_check != check)
		return;

	header = prev = &map->header;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		KASSERT(prev->end <= entry->start,
		    ("map %p prev->end = %jx, start = %jx", map,
		    (uintmax_t)prev->end, (uintmax_t)entry->start));
		KASSERT(entry->start < entry->end,
		    ("map %p start = %jx, end = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->end));
		KASSERT(entry->left == header ||
		    entry->left->start < entry->start,
		    ("map %p left->start = %jx, start = %jx", map,
		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
		KASSERT(entry->right == header ||
		    entry->start < entry->right->start,
		    ("map %p start = %jx, right->start = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
		cur = map->root;
		lbound = ubound = header;
		for (;;) {
			if (entry->start < cur->start) {
				ubound = cur;
				cur = cur->left;
				KASSERT(cur != lbound,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			} else if (cur->end <= entry->start) {
				lbound = cur;
				cur = cur->right;
				KASSERT(cur != ubound,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			} else {
				KASSERT(cur == entry,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
				break;
			}
		}
		max_left = vm_map_entry_max_free_left(entry, lbound);
		max_right = vm_map_entry_max_free_right(entry, ubound);
		KASSERT(entry->max_free == vm_size_max(max_left, max_right),
		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
		    (uintmax_t)entry->max_free,
		    (uintmax_t)max_left, (uintmax_t)max_right));
		prev = entry;
	}
	KASSERT(prev->end <= entry->start,
	    ("map %p prev->end = %jx, start = %jx", map,
	    (uintmax_t)prev->end, (uintmax_t)entry->start));
}
#endif
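
/*
 * The checks above validate three properties for every entry: the entries
 * are sorted and non-overlapping in address order, the binary search tree
 * rooted at map->root actually finds each entry between its neighbours,
 * and the max_free augmentation equals the larger of the free spans
 * computed over the left and right subtrees.
 */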

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

static void
vm_map_print(vm_map_t map)
{
	vm_map_entry_t entry, prev;

	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	    (void *)map,
	    (void *)map->pmap, map->nentries, map->timestamp);

	db_indent += 2;
	prev = &map->header;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
		    (void *)entry, (void *)entry->start, (void *)entry->end,
		    entry->eflags);
		{
			static const char * const inheritance_name[4] =
			    {"share", "copy", "none", "donate_copy"};

			db_iprintf(" prot=%x/%x/%s",
			    entry->protection,
			    entry->max_protection,
			    inheritance_name[(int)(unsigned char)
			    entry->inheritance]);
			if (entry->wired_count != 0)
				db_printf(", wired");
		}
		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
			db_printf(", share=%p, offset=0x%jx\n",
			    (void *)entry->object.sub_map,
			    (uintmax_t)entry->offset);
			if (prev == &map->header ||
			    prev->object.sub_map !=
				entry->object.sub_map) {
				db_indent += 2;
				vm_map_print((vm_map_t)entry->object.sub_map);
				db_indent -= 2;
			}
		} else {
			if (entry->cred != NULL)
				db_printf(", ruid %d", entry->cred->cr_ruid);
			db_printf(", object=%p, offset=0x%jx",
			    (void *)entry->object.vm_object,
			    (uintmax_t)entry->offset);
			if (entry->object.vm_object && entry->object.vm_object->cred)
				db_printf(", obj ruid %d charge %jx",
				    entry->object.vm_object->cred->cr_ruid,
				    (uintmax_t)entry->object.vm_object->charge);
			if (entry->eflags & MAP_ENTRY_COW)
				db_printf(", copy (%s)",
				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
			db_printf("\n");

			if (prev == &map->header ||
			    prev->object.vm_object !=
				entry->object.vm_object) {
				db_indent += 2;
				vm_object_print((db_expr_t)(intptr_t)
				    entry->object.vm_object,
				    0, 0, (char *)0);
				db_indent -= 2;
			}
		}
		prev = entry;
	}
	db_indent -= 2;
}

DB_SHOW_COMMAND(map, map)
{
	if (!have_addr) {
		db_printf("usage: show map <addr>\n");
		return;
	}
	vm_map_print((vm_map_t)addr);
}

DB_SHOW_COMMAND(procvm, procvm)
{
	struct proc *p;

	if (have_addr) {
		p = db_lookup_proc(addr);
	} else {
		p = curproc;
	}

	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	    (void *)vmspace_pmap(p->p_vmspace));

	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
}
#endif /* DDB */
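
/*
 * Example usage from the in-kernel debugger (the address is illustrative):
 *
 *	db> show map 0xfffff80003bd9000
 *	db> show procvm
 *
 * "show procvm" defaults to curproc when no address is given.
 */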