thread.c 76 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863
  1. /*
  2. * Copyright (c) 2012-2018 Richard Braun.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation, either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. *
  17. *
  18. * The scheduling algorithm implemented by this module, named Distributed
  19. * Group Ratio Round-Robin (DGR3), is based on the following papers :
  20. * - "Group Ratio Round-Robin: O(1) Proportional Share Scheduling for
  21. * Uniprocessor and Multiprocessor Systems" by Bogdan Caprita, Wong Chun
  22. * Chan, Jason Nieh, Clifford Stein and Haoqiang Zheng.
  23. * - "Efficient and Scalable Multiprocessor Fair Scheduling Using Distributed
  24. * Weighted Round-Robin" by Tong li, Dan Baumberger and Scott Hahn.
  25. *
  26. * Note that the Group Ratio Round-Robin (GR3) paper offers a multiprocessor
  27. * extension, but based on a single global queue, which strongly limits its
  28. * scalability on systems with many processors. That extension isn't used in
  29. * this implementation.
  30. *
  31. * The basic idea is to use GR3 for processor-local scheduling, and Distributed
  32. * Weighted Round-Robin (DWRR) for inter-processor load balancing. These
  33. * algorithms were chosen for several reasons. To begin with, they provide
  34. * fair scheduling, a very desirable property for a modern scheduler. Next,
  35. * being based on round-robin, their algorithmic complexity is very low (GR3
  36. * has O(1) scheduling complexity, and O(g) complexity on thread addition
  37. * or removal, g being the number of groups, with one group per priority, a
  38. * low number in practice). Finally, they're very simple to implement, making
  39. * them easy to adjust and maintain.
  40. *
  41. * Both algorithms are actually slightly modified for efficiency. First, this
  42. * version of GR3 is simplified by mapping exactly one group to one priority,
  43. * and in turn, one weight. This is possible because priorities are intended
  44. * to match Unix nice values, and systems commonly provide a constant, small
  45. * set of nice values. This removes the need for accounting deficit. Next,
  46. * round tracking is used to improve the handling of dynamic events : work
  47. * scaling is performed only on thread addition, and not when a thread that
  48. * was removed is added again during the same round. In addition, since GR3
  49. * is itself a round-robin algorithm, it already provides the feature required
  50. * from local scheduling by DWRR, namely round slicing. Consequently, DWRR
  51. * doesn't sit "on top" of GR3, but is actually merged with it. The result is
  52. * an algorithm that shares the same data for both local scheduling and load
  53. * balancing.
  54. *
  55. * A few terms are used by both papers with slightly different meanings. Here
  56. * are the definitions used in this implementation :
  57. * - The time unit is the system timer period (1 / tick frequency)
  58. * - Work is the amount of execution time units consumed
  59. * - Weight is the amount of execution time units allocated
  60. * - A round is the shortest period during which all threads in a run queue
  61. * consume their allocated time (i.e. their work reaches their weight)
  62. *
  63. * TODO Sub-tick accounting.
  64. *
  65. * TODO Take into account the underlying CPU topology (and adjust load
  66. * balancing to access the global highest round less frequently on large
  67. * processor groups, perhaps by applying the load balancing algorithm in a
  68. * bottom-up fashion with one highest round per processor group).
  69. *
  70. * TODO For now, interactivity can not be experimented. The current strategy
  71. * is to always add threads in front of their group queue and track rounds
  72. * so that they don't get more time than they should. A direct consequence
  73. * is that continually spawning threads at short intervals is likely to cause
  74. * starvation. This could be fixed by adding newly created threads at the back
  75. * of their group queue. For now, don't overengineer, and wait until all this
  76. * can actually be tested.
  77. *
  78. * TODO Review weight computation (it may be more appropriate to determine
  79. * weights in a smoother way than a raw scaling).
  80. */
  81. #include <assert.h>
  82. #include <errno.h>
  83. #include <stdalign.h>
  84. #include <stdbool.h>
  85. #include <stddef.h>
  86. #include <stdint.h>
  87. #include <stdio.h>
  88. #include <stdnoreturn.h>
  89. #include <string.h>
  90. #include <kern/atomic.h>
  91. #include <kern/capability.h>
  92. #include <kern/clock.h>
  93. #include <kern/cpumap.h>
  94. #include <kern/init.h>
  95. #include <kern/kmem.h>
  96. #include <kern/list.h>
  97. #include <kern/macros.h>
  98. #include <kern/panic.h>
  99. #include <kern/percpu.h>
  100. #include <kern/perfmon.h>
  101. #include <kern/rcu.h>
  102. #include <kern/shell.h>
  103. #include <kern/sleepq.h>
  104. #include <kern/spinlock.h>
  105. #include <kern/syscnt.h>
  106. #include <kern/task.h>
  107. #include <kern/thread.h>
  108. #include <kern/timer.h>
  109. #include <kern/turnstile.h>
  110. #include <kern/types.h>
  111. #include <kern/user.h>
  112. #include <kern/work.h>
  113. #include <machine/cpu.h>
  114. #include <machine/page.h>
  115. #include <machine/pmap.h>
  116. #include <machine/tcb.h>
  117. #include <vm/kmem.h>
  118. #include <vm/map.h>
  119. /*
  120. * Preemption level of a suspended thread.
  121. *
  122. * The expected interrupt, preemption and run queue lock state when
  123. * dispatching a thread is :
  124. * - interrupts disabled
  125. * - preemption disabled
  126. * - run queue locked
  127. *
  128. * Locking the run queue increases the preemption level once more,
  129. * making its value 2.
  130. */
  131. #define THREAD_SUSPEND_PREEMPT_LEVEL 2
  132. /*
  133. * Scheduling classes.
  134. *
  135. * Classes are sorted by order of priority (lower indexes first). The same
  136. * class can apply to several policies.
  137. *
  138. * The idle class is reserved for the per-CPU idle threads.
  139. */
  140. #define THREAD_SCHED_CLASS_RT 0
  141. #define THREAD_SCHED_CLASS_FS 1
  142. #define THREAD_SCHED_CLASS_IDLE 2
  143. #define THREAD_NR_SCHED_CLASSES 3
  144. /*
  145. * Global priority bases for each scheduling class.
  146. *
  147. * Global priorities are only used to determine which of two threads
  148. * has the higher priority, and should only matter for priority
  149. * inheritance.
  150. *
  151. * In the current configuration, all fair-scheduling threads have the
  152. * same global priority.
  153. */
  154. #define THREAD_SCHED_GLOBAL_PRIO_RT 2
  155. #define THREAD_SCHED_GLOBAL_PRIO_FS 1
  156. #define THREAD_SCHED_GLOBAL_PRIO_IDLE 0
  157. // Default time slice for real-time round-robin scheduling.
  158. #define THREAD_DEFAULT_RR_TIME_SLICE (CLOCK_FREQ / 10)
  159. /*
  160. * Maximum number of threads which can be pulled from a remote run queue
  161. * while interrupts are disabled.
  162. */
  163. #define THREAD_MAX_MIGRATIONS 16
  164. // Delay (in ticks) between two balance attempts when a run queue is idle.
  165. #define THREAD_IDLE_BALANCE_TICKS (CLOCK_FREQ / 2)
  166. // Run queue properties for real-time threads.
  167. struct thread_rt_runq
  168. {
  169. uint32_t bitmap;
  170. struct list threads[THREAD_SCHED_RT_PRIO_MAX + 1];
  171. };
  172. /*
  173. * Initial value of the highest round.
  174. *
  175. * Set to a high value to make sure overflows are correctly handled.
  176. */
  177. #define THREAD_FS_INITIAL_ROUND ((unsigned long)-10)
  178. // Round slice base unit for fair-scheduling threads.
  179. #define THREAD_FS_ROUND_SLICE_BASE (CLOCK_FREQ / 10)
  180. // Group of threads sharing the same weight.
  181. struct thread_fs_group
  182. {
  183. struct list node;
  184. struct list threads;
  185. uint32_t weight;
  186. uint32_t work;
  187. };
  188. /*
  189. * Run queue properties for fair-scheduling threads.
  190. *
  191. * The current group pointer has a valid address only when the run queue isn't
  192. * empty.
  193. */
  194. struct thread_fs_runq
  195. {
  196. struct thread_fs_group group_array[THREAD_SCHED_FS_PRIO_MAX + 1];
  197. struct list groups;
  198. struct list threads;
  199. struct thread_fs_group *current;
  200. uint32_t nr_threads;
  201. uint32_t weight;
  202. uint32_t work;
  203. };
  204. /*
  205. * Per processor run queue.
  206. *
  207. * Locking multiple run queues is done in the ascending order of their CPU
  208. * identifier. Interrupts must be disabled whenever locking a run queue, even
  209. * a remote one, otherwise an interrupt (which invokes the scheduler on its
  210. * return path) may violate the locking order.
  211. */
  212. struct thread_runq
  213. {
  214. __cacheline_aligned struct spinlock lock;
  215. uint32_t cpu;
  216. uint32_t nr_threads;
  217. struct thread *current;
  218. // Real-time related members.
  219. struct thread_rt_runq rt_runq;
  220. /*
  221. * Fair-scheduling related members.
  222. *
  223. * The current round is set when the active run queue becomes non-empty.
  224. * It's not reset when both run queues become empty. As a result, the
  225. * current round has a meaningful value only when at least one thread is
  226. * present, i.e. the global weight isn't zero.
  227. */
  228. size_t fs_round;
  229. uint32_t fs_weight;
  230. struct thread_fs_runq fs_runqs[2];
  231. struct thread_fs_runq *fs_runq_active;
  232. struct thread_fs_runq *fs_runq_expired;
  233. struct thread *balancer;
  234. struct thread *idler;
  235. // Ticks before the next balancing attempt when a run queue is idle.
  236. uint32_t idle_balance_ticks;
  237. struct syscnt sc_schedule_intrs;
  238. struct syscnt sc_boosts;
  239. };
  240. // Operations of a scheduling class.
  241. struct thread_sched_ops
  242. {
  243. struct thread_runq* (*select_runq) (struct thread *);
  244. void (*add) (struct thread_runq *, struct thread *);
  245. void (*remove) (struct thread_runq *, struct thread *);
  246. void (*put_prev) (struct thread_runq *, struct thread *);
  247. struct thread* (*get_next) (struct thread_runq *);
  248. void (*reset_priority) (struct thread *, uint16_t);
  249. void (*update_priority) (struct thread *, uint16_t);
  250. uint32_t (*get_global_priority) (uint16_t);
  251. void (*set_next) (struct thread_runq *, struct thread *);
  252. void (*tick) (struct thread_runq *, struct thread *);
  253. };
  254. static struct thread_runq thread_runq __percpu;
  255. /*
  256. * Statically allocated fake threads that provide thread context to processors
  257. * during bootstrap.
  258. */
  259. static struct thread thread_booters[CONFIG_MAX_CPUS] __initdata;
  260. static struct kmem_cache thread_cache;
  261. #ifndef CONFIG_THREAD_STACK_GUARD
  262. static struct kmem_cache thread_stack_cache;
  263. #endif
  264. static const uint8_t thread_policy_table[THREAD_NR_SCHED_POLICIES] =
  265. {
  266. [THREAD_SCHED_POLICY_FIFO] = THREAD_SCHED_CLASS_RT,
  267. [THREAD_SCHED_POLICY_RR] = THREAD_SCHED_CLASS_RT,
  268. [THREAD_SCHED_POLICY_FS] = THREAD_SCHED_CLASS_FS,
  269. [THREAD_SCHED_POLICY_IDLE] = THREAD_SCHED_CLASS_IDLE,
  270. };
  271. static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES];
  272. // Map of run queues for which a processor is running.
  273. static struct cpumap thread_active_runqs;
  274. /*
  275. * Map of idle run queues.
  276. *
  277. * Access to this map isn't synchronized. It is merely used as a fast hint
  278. * to find run queues that are likely to be idle.
  279. */
  280. static struct cpumap thread_idle_runqs;
  281. /*
  282. * System-wide value of the current highest round.
  283. *
  284. * This global variable is accessed without any synchronization. Its value
  285. * being slightly inaccurate doesn't harm the fairness properties of the
  286. * scheduling and load balancing algorithms.
  287. *
  288. * There can be moderate bouncing on this word so give it its own cache line.
  289. */
  290. static struct
  291. {
  292. __cacheline_aligned volatile size_t value;
  293. } thread_fs_highest_round_struct;
  294. #define thread_fs_highest_round (thread_fs_highest_round_struct.value)
  295. /*
  296. * Number of processors which have requested the scheduler to run.
  297. *
  298. * This value is used to implement a global barrier across the entire
  299. * system at boot time, so that inter-processor requests may not be
  300. * lost in case a processor is slower to initialize.
  301. */
  302. static uint32_t thread_nr_boot_cpus __initdata;
  303. struct thread_zombie
  304. {
  305. struct work work;
  306. struct thread *thread;
  307. };
  308. struct thread_runq_guard_t
  309. {
  310. struct thread_runq *runq;
  311. cpu_flags_t flags;
  312. bool preempt_disabled;
  313. };
  314. static uint8_t
  315. thread_policy_to_class (uint8_t policy)
  316. {
  317. assert (policy < ARRAY_SIZE (thread_policy_table));
  318. return (thread_policy_table[policy]);
  319. }
  320. static void
  321. thread_set_wchan (struct thread *thread, const void *wchan_addr,
  322. const char *wchan_desc)
  323. {
  324. assert (wchan_addr && wchan_desc);
  325. thread->wchan_addr = wchan_addr;
  326. thread->wchan_desc = wchan_desc;
  327. }
  328. static void
  329. thread_clear_wchan (struct thread *thread)
  330. {
  331. thread->wchan_addr = NULL;
  332. thread->wchan_desc = NULL;
  333. }
  334. static const struct thread_sched_ops*
  335. thread_get_sched_ops (uint8_t sched_class)
  336. {
  337. assert (sched_class < ARRAY_SIZE (thread_sched_ops));
  338. return (&thread_sched_ops[sched_class]);
  339. }
  340. static const struct thread_sched_ops*
  341. thread_get_user_sched_ops (const struct thread *thread)
  342. {
  343. return (thread_get_sched_ops (thread_user_sched_class (thread)));
  344. }
  345. static const struct thread_sched_ops*
  346. thread_get_real_sched_ops (const struct thread *thread)
  347. {
  348. return (thread_get_sched_ops (thread_real_sched_class (thread)));
  349. }
  350. static void __init
  351. thread_runq_init_rt (struct thread_runq *runq)
  352. {
  353. runq->rt_runq.bitmap = 0;
  354. for (size_t i = 0; i < ARRAY_SIZE (runq->rt_runq.threads); i++)
  355. list_init (&runq->rt_runq.threads[i]);
  356. }
  357. static void __init
  358. thread_fs_group_init (struct thread_fs_group *group)
  359. {
  360. list_init (&group->threads);
  361. group->weight = 0;
  362. group->work = 0;
  363. }
  364. static void __init
  365. thread_fs_runq_init (struct thread_fs_runq *fs_runq)
  366. {
  367. for (size_t i = 0; i < ARRAY_SIZE (fs_runq->group_array); i++)
  368. thread_fs_group_init (&fs_runq->group_array[i]);
  369. list_init (&fs_runq->groups);
  370. list_init (&fs_runq->threads);
  371. fs_runq->nr_threads = 0;
  372. fs_runq->weight = 0;
  373. fs_runq->work = 0;
  374. }
  375. static void __init
  376. thread_runq_init_fs (struct thread_runq *runq)
  377. {
  378. runq->fs_weight = 0;
  379. runq->fs_runq_active = &runq->fs_runqs[0];
  380. runq->fs_runq_expired = &runq->fs_runqs[1];
  381. thread_fs_runq_init (runq->fs_runq_active);
  382. thread_fs_runq_init (runq->fs_runq_expired);
  383. }
  384. static void __init
  385. thread_runq_init (struct thread_runq *runq, uint32_t cpu,
  386. struct thread *booter)
  387. {
  388. char name[SYSCNT_NAME_SIZE];
  389. spinlock_init (&runq->lock);
  390. runq->cpu = cpu;
  391. runq->nr_threads = 0;
  392. runq->current = booter;
  393. thread_runq_init_rt (runq);
  394. thread_runq_init_fs (runq);
  395. runq->balancer = NULL;
  396. runq->idler = NULL;
  397. runq->idle_balance_ticks = (uint32_t)-1;
  398. snprintf (name, sizeof (name), "thread_schedule_intrs/%u", cpu);
  399. syscnt_register (&runq->sc_schedule_intrs, name);
  400. snprintf (name, sizeof (name), "thread_boosts/%u", cpu);
  401. syscnt_register (&runq->sc_boosts, name);
  402. }
  403. static inline struct thread_runq*
  404. thread_runq_local (void)
  405. {
  406. assert (!thread_preempt_enabled () || thread_pinned ());
  407. return (cpu_local_ptr (thread_runq));
  408. }
  409. static inline uint32_t
  410. thread_runq_cpu (struct thread_runq *runq)
  411. {
  412. return (runq->cpu);
  413. }
  414. static void
  415. thread_runq_add (struct thread_runq *runq, struct thread *thread)
  416. {
  417. assert (!cpu_intr_enabled ());
  418. assert (spinlock_locked (&runq->lock));
  419. assert (!thread->in_runq);
  420. const _Auto ops = thread_get_real_sched_ops (thread);
  421. ops->add (runq, thread);
  422. if (runq->nr_threads == 0)
  423. cpumap_clear_atomic (&thread_idle_runqs, thread_runq_cpu (runq));
  424. ++runq->nr_threads;
  425. if (thread_real_sched_class (thread) <
  426. thread_real_sched_class (runq->current))
  427. thread_set_flag (runq->current, THREAD_YIELD);
  428. atomic_store_rlx (&thread->runq, runq);
  429. thread->in_runq = true;
  430. }
  431. static void
  432. thread_runq_remove (struct thread_runq *runq, struct thread *thread)
  433. {
  434. assert (!cpu_intr_enabled ());
  435. assert (spinlock_locked (&runq->lock));
  436. assert (thread->in_runq);
  437. if (--runq->nr_threads == 0)
  438. cpumap_set_atomic (&thread_idle_runqs, thread_runq_cpu (runq));
  439. const _Auto ops = thread_get_real_sched_ops (thread);
  440. ops->remove (runq, thread);
  441. thread->in_runq = false;
  442. }
  443. static void
  444. thread_runq_put_prev (struct thread_runq *runq, struct thread *thread)
  445. {
  446. assert (!cpu_intr_enabled ());
  447. assert (spinlock_locked (&runq->lock));
  448. const _Auto ops = thread_get_real_sched_ops (thread);
  449. if (ops->put_prev)
  450. ops->put_prev (runq, thread);
  451. }
  452. static struct thread*
  453. thread_runq_get_next (struct thread_runq *runq)
  454. {
  455. assert (!cpu_intr_enabled ());
  456. assert (spinlock_locked (&runq->lock));
  457. for (size_t i = 0; i < ARRAY_SIZE (thread_sched_ops); i++)
  458. {
  459. struct thread *thread = thread_sched_ops[i].get_next (runq);
  460. if (thread)
  461. {
  462. atomic_store_rlx (&runq->current, thread);
  463. return (thread);
  464. }
  465. }
  466. // The idle class should never be empty.
  467. panic ("thread: unable to find next thread");
  468. }
  469. static void
  470. thread_runq_set_next (struct thread_runq *runq, struct thread *thread)
  471. {
  472. const _Auto ops = thread_get_real_sched_ops (thread);
  473. if (ops->set_next)
  474. ops->set_next (runq, thread);
  475. atomic_store_rlx (&runq->current, thread);
  476. }
  477. static void
  478. thread_runq_wakeup (struct thread_runq *runq, struct thread *thread)
  479. {
  480. assert (!cpu_intr_enabled ());
  481. assert (spinlock_locked (&runq->lock));
  482. assert (thread->state == THREAD_RUNNING);
  483. thread_runq_add (runq, thread);
  484. if (runq != thread_runq_local () &&
  485. thread_test_flag (runq->current, THREAD_YIELD))
  486. cpu_send_thread_schedule (thread_runq_cpu (runq));
  487. }
  488. static void
  489. thread_runq_wakeup_balancer (struct thread_runq *runq)
  490. {
  491. if (runq->balancer->state == THREAD_RUNNING)
  492. return;
  493. thread_clear_wchan (runq->balancer);
  494. atomic_store_rlx (&runq->balancer->state, THREAD_RUNNING);
  495. thread_runq_wakeup (runq, runq->balancer);
  496. }
  497. static void
  498. thread_runq_schedule_load (struct thread *thread)
  499. {
  500. pmap_load (thread->xtask->map->pmap);
  501. #ifdef CONFIG_PERFMON
  502. perfmon_td_load (thread_get_perfmon_td (thread));
  503. #endif
  504. }
  505. static void
  506. thread_runq_schedule_unload (struct thread *thread __unused)
  507. {
  508. #ifdef CONFIG_PERFMON
  509. perfmon_td_unload (thread_get_perfmon_td (thread));
  510. #endif
  511. }
  512. static struct thread_runq*
  513. thread_lock_runq (struct thread *thread, cpu_flags_t *flags)
  514. {
  515. while (1)
  516. {
  517. _Auto runq = atomic_load_rlx (&thread->runq);
  518. spinlock_lock_intr_save (&runq->lock, flags);
  519. if (likely (runq == atomic_load_rlx (&thread->runq)))
  520. return (runq);
  521. spinlock_unlock_intr_restore (&runq->lock, *flags);
  522. }
  523. }
  524. static void
  525. thread_unlock_runq (struct thread_runq *runq, cpu_flags_t flags)
  526. {
  527. spinlock_unlock_intr_restore (&runq->lock, flags);
  528. }
  529. static struct thread_runq_guard_t
  530. thread_runq_guard_make (struct thread *thread, bool disable_preempt)
  531. {
  532. struct thread_runq_guard_t ret;
  533. ret.preempt_disabled = disable_preempt;
  534. if (ret.preempt_disabled)
  535. thread_preempt_disable ();
  536. ret.runq = thread_lock_runq (thread, &ret.flags);
  537. return (ret);
  538. }
  539. static void
  540. thread_runq_guard_fini (struct thread_runq_guard_t *guard)
  541. {
  542. thread_unlock_runq (guard->runq, guard->flags);
  543. if (guard->preempt_disabled)
  544. thread_preempt_enable ();
  545. }
  546. #define thread_runq_guard \
  547. thread_runq_guard_t CLEANUP (thread_runq_guard_fini) __unused
  548. static void thread_setsched_impl (struct thread *, uint8_t,
  549. uint16_t, struct thread_runq *);
  550. static void
  551. thread_rcu_handle_switch (struct thread *thread, struct thread_runq *runq)
  552. {
  553. _Auto reader = thread_rcu_reader (thread);
  554. if (!rcu_report_context_switch (reader) || reader->saved_sched)
  555. return;
  556. reader->saved_prio = thread_user_priority (thread);
  557. // Boost to the max priority, but don't change scheduling class.
  558. uint16_t nprio = thread_user_sched_class (thread) == THREAD_SCHED_CLASS_RT ?
  559. THREAD_SCHED_RT_PRIO_MAX : THREAD_SCHED_FS_PRIO_MAX;
  560. _Auto td = thread_turnstile_td (thread);
  561. turnstile_td_lock (td);
  562. thread_setsched_impl (thread, thread_user_sched_policy (thread),
  563. nprio, runq);
  564. turnstile_td_unlock (td);
  565. reader->saved_sched = true;
  566. }
  567. static struct thread_runq*
  568. thread_runq_schedule (struct thread_runq *runq)
  569. {
  570. struct thread *prev = thread_self ();
  571. assert (prev->cur_lpad ||
  572. (__builtin_frame_address (0) >= prev->stack &&
  573. __builtin_frame_address (0) < prev->stack + TCB_STACK_SIZE));
  574. assert (prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
  575. assert (!cpu_intr_enabled ());
  576. assert (spinlock_locked (&runq->lock));
  577. thread_clear_flag (prev, THREAD_YIELD);
  578. thread_runq_put_prev (runq, prev);
  579. if (prev->suspend)
  580. {
  581. prev->state = THREAD_SUSPENDED;
  582. prev->suspend = false;
  583. }
  584. if (prev->state != THREAD_RUNNING)
  585. {
  586. thread_runq_remove (runq, prev);
  587. if (!runq->nr_threads && prev != runq->balancer)
  588. thread_runq_wakeup_balancer (runq);
  589. }
  590. struct thread *next = thread_runq_get_next (runq);
  591. assert (next != runq->idler || !runq->nr_threads);
  592. assert (next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
  593. if (likely (prev != next))
  594. {
  595. thread_runq_schedule_unload (prev);
  596. pmap_context_switch (prev, next);
  597. thread_rcu_handle_switch (prev, runq);
  598. spinlock_transfer_owner (&runq->lock, next);
  599. /*
  600. * This is where the true context switch occurs. The next thread must
  601. * unlock the run queue and reenable preemption. Note that unlocking
  602. * and locking the run queue again is equivalent to a full memory
  603. * barrier.
  604. */
  605. tcb_switch (&prev->tcb, &next->tcb);
  606. /*
  607. * The thread is dispatched on a processor once again.
  608. *
  609. * Keep in mind the system state may have changed a lot since this
  610. * function was called. In particular :
  611. * - The next thread may have been destroyed, and must not be
  612. * referenced any more.
  613. * - The current thread may have been migrated to another processor.
  614. */
  615. barrier ();
  616. thread_runq_schedule_load (prev);
  617. runq = thread_runq_local ();
  618. }
  619. assert (prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
  620. assert (!cpu_intr_enabled ());
  621. assert (spinlock_locked (&runq->lock));
  622. return (runq);
  623. }
  624. static void
  625. thread_runq_double_lock (struct thread_runq *a, struct thread_runq *b)
  626. {
  627. assert (!cpu_intr_enabled ());
  628. assert (!thread_preempt_enabled ());
  629. assert (a != b);
  630. if (a->cpu < b->cpu)
  631. {
  632. spinlock_lock (&a->lock);
  633. spinlock_lock (&b->lock);
  634. }
  635. else
  636. {
  637. spinlock_lock (&b->lock);
  638. spinlock_lock (&a->lock);
  639. }
  640. }
  641. static struct thread_runq*
  642. thread_sched_rt_select_runq (struct thread *thread)
  643. {
  644. /*
  645. * Real-time tasks are commonly configured to run on one specific
  646. * processor only.
  647. */
  648. int i = cpumap_find_first (&thread->cpumap);
  649. assert (i >= 0);
  650. assert (cpumap_test (&thread_active_runqs, i));
  651. struct thread_runq *runq = percpu_ptr (thread_runq, i);
  652. spinlock_lock (&runq->lock);
  653. return (runq);
  654. }
  655. static void
  656. thread_sched_rt_add (struct thread_runq *runq, struct thread *thread)
  657. {
  658. struct thread_rt_runq *rt_runq = &runq->rt_runq;
  659. struct list *threads = &rt_runq->threads[thread_real_priority (thread)];
  660. list_insert_tail (threads, &thread->rt_data.node);
  661. if (list_singular (threads))
  662. rt_runq->bitmap |= (1ULL << thread_real_priority (thread));
  663. if (thread_real_sched_class (thread) ==
  664. thread_real_sched_class (runq->current) &&
  665. thread_real_priority (thread) > thread_real_priority (runq->current))
  666. thread_set_flag (runq->current, THREAD_YIELD);
  667. }
  668. static void
  669. thread_sched_rt_remove (struct thread_runq *runq, struct thread *thread)
  670. {
  671. struct thread_rt_runq *rt_runq = &runq->rt_runq;
  672. assert (thread_real_priority (thread) < ARRAY_SIZE (rt_runq->threads));
  673. struct list *threads = &rt_runq->threads[thread_real_priority (thread)];
  674. list_remove (&thread->rt_data.node);
  675. if (list_empty (threads))
  676. rt_runq->bitmap &= ~(1ULL << thread_real_priority (thread));
  677. }
  678. static void
  679. thread_sched_rt_put_prev (struct thread_runq *runq, struct thread *thread)
  680. {
  681. thread_sched_rt_add (runq, thread);
  682. }
  683. static struct thread*
  684. thread_sched_rt_get_next (struct thread_runq *runq)
  685. {
  686. struct thread_rt_runq *rt_runq = &runq->rt_runq;
  687. if (!rt_runq->bitmap)
  688. return (NULL);
  689. uint32_t priority = THREAD_SCHED_RT_PRIO_MAX -
  690. __builtin_clz (rt_runq->bitmap);
  691. assert (priority < ARRAY_SIZE (rt_runq->threads));
  692. struct list *threads = &rt_runq->threads[priority];
  693. assert (!list_empty (threads));
  694. _Auto thread = list_first_entry (threads, struct thread, rt_data.node);
  695. thread_sched_rt_remove (runq, thread);
  696. return (thread);
  697. }
  698. static void
  699. thread_sched_rt_reset_priority (struct thread *thread, uint16_t priority)
  700. {
  701. assert (priority <= THREAD_SCHED_RT_PRIO_MAX);
  702. thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
  703. }
  704. static uint32_t
  705. thread_sched_rt_get_global_priority (uint16_t priority)
  706. {
  707. return (THREAD_SCHED_GLOBAL_PRIO_RT + priority);
  708. }
  709. static void
  710. thread_sched_rt_set_next (struct thread_runq *runq, struct thread *thread)
  711. {
  712. thread_sched_rt_remove (runq, thread);
  713. }
  714. static void
  715. thread_sched_rt_tick (struct thread_runq *runq __unused, struct thread *thread)
  716. {
  717. if (thread_real_sched_policy (thread) != THREAD_SCHED_POLICY_RR ||
  718. --thread->rt_data.time_slice > 0)
  719. return;
  720. thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
  721. thread_set_flag (thread, THREAD_YIELD);
  722. }
  723. static inline uint16_t
  724. thread_sched_fs_prio2weight (uint16_t priority)
  725. {
  726. return ((priority + 1) * THREAD_FS_ROUND_SLICE_BASE);
  727. }
  728. static struct thread_runq*
  729. thread_sched_fs_select_runq (struct thread *thread)
  730. {
  731. struct thread_runq *runq;
  732. cpumap_for_each (&thread_idle_runqs, i)
  733. {
  734. if (!cpumap_test (&thread->cpumap, i))
  735. continue;
  736. runq = percpu_ptr (thread_runq, i);
  737. spinlock_lock (&runq->lock);
  738. // The run queue really is idle, return it.
  739. if (runq->current == runq->idler)
  740. return (runq);
  741. spinlock_unlock (&runq->lock);
  742. }
  743. runq = NULL;
  744. cpumap_for_each (&thread_active_runqs, i)
  745. {
  746. if (!cpumap_test (&thread->cpumap, i))
  747. continue;
  748. _Auto tmp = percpu_ptr (thread_runq, i);
  749. spinlock_lock (&tmp->lock);
  750. if (! runq)
  751. {
  752. runq = tmp;
  753. continue;
  754. }
  755. // A run queue may have become idle.
  756. if (tmp->current == tmp->idler)
  757. {
  758. spinlock_unlock (&runq->lock);
  759. return (tmp);
  760. }
  761. /*
  762. * The run queue isn't idle, but there are no fair-scheduling thread,
  763. * which means there are real-time threads.
  764. */
  765. if (tmp->fs_weight == 0)
  766. {
  767. spinlock_unlock (&tmp->lock);
  768. continue;
  769. }
  770. ssize_t delta = (ssize_t)(tmp->fs_round - runq->fs_round);
  771. // Look for the least loaded of the run queues in the highest round.
  772. if (delta > 0 ||
  773. (!delta && tmp->fs_weight < runq->fs_weight))
  774. {
  775. spinlock_unlock (&runq->lock);
  776. runq = tmp;
  777. continue;
  778. }
  779. spinlock_unlock (&tmp->lock);
  780. }
  781. assert (runq);
  782. return (runq);
  783. }
  784. static uint32_t
  785. thread_sched_fs_enqueue_scale (uint32_t work, uint32_t old_weight,
  786. uint32_t new_weight)
  787. {
  788. assert (old_weight);
  789. #ifndef __LP64__
  790. if (likely (work < 0x10000 && new_weight < 0x10000))
  791. return ((work * new_weight) / old_weight);
  792. #endif
  793. return ((uint32_t)(((uint64_t)work * new_weight) / old_weight));
  794. }
  795. static void
  796. thread_sched_fs_enqueue (struct thread_fs_runq *fs_runq, size_t round,
  797. struct thread *thread)
  798. {
  799. assert (!thread->fs_data.fs_runq);
  800. assert (thread->fs_data.work <= thread->fs_data.weight);
  801. _Auto group = &fs_runq->group_array[thread_real_priority (thread)];
  802. uint32_t group_weight = group->weight + thread->fs_data.weight,
  803. total_weight = fs_runq->weight + thread->fs_data.weight;
  804. struct list *node = group->weight ?
  805. list_prev (&group->node) : list_last (&fs_runq->groups);
  806. struct list *init_node = node;
  807. while (!list_end (&fs_runq->groups, node))
  808. {
  809. _Auto tmp = list_entry (node, struct thread_fs_group, node);
  810. if (tmp->weight >= group_weight)
  811. break;
  812. node = list_prev (node);
  813. }
  814. if (!group->weight)
  815. list_insert_after (&group->node, node);
  816. else if (node != init_node)
  817. {
  818. list_remove (&group->node);
  819. list_insert_after (&group->node, node);
  820. }
  821. /*
  822. * XXX Unfairness can occur if the run queue round wraps around and the
  823. * thread is "lucky" enough to have the same round value. This should be
  824. * rare and harmless otherwise.
  825. */
  826. if (thread->fs_data.round == round)
  827. {
  828. fs_runq->work += thread->fs_data.work;
  829. group->work += thread->fs_data.work;
  830. }
  831. else
  832. {
  833. uint32_t group_work, thread_work;
  834. if (!fs_runq->weight)
  835. thread_work = 0;
  836. else
  837. {
  838. group_work = group->weight == 0 ?
  839. thread_sched_fs_enqueue_scale (fs_runq->work,
  840. fs_runq->weight,
  841. thread->fs_data.weight) :
  842. thread_sched_fs_enqueue_scale (group->work,
  843. group->weight,
  844. group_weight);
  845. thread_work = group_work - group->work;
  846. fs_runq->work += thread_work;
  847. group->work = group_work;
  848. }
  849. thread->fs_data.round = round;
  850. thread->fs_data.work = thread_work;
  851. }
  852. ++fs_runq->nr_threads;
  853. fs_runq->weight = total_weight;
  854. group->weight = group_weight;
  855. // Insert at the front of the group to improve interactivity.
  856. list_insert_head (&group->threads, &thread->fs_data.group_node);
  857. list_insert_tail (&fs_runq->threads, &thread->fs_data.runq_node);
  858. thread->fs_data.fs_runq = fs_runq;
  859. }
  860. static void
  861. thread_sched_fs_restart (struct thread_runq *runq)
  862. {
  863. _Auto fs_runq = runq->fs_runq_active;
  864. struct list *node = list_first (&fs_runq->groups);
  865. assert (node);
  866. fs_runq->current = list_entry (node, struct thread_fs_group, node);
  867. if (thread_real_sched_class (runq->current) == THREAD_SCHED_CLASS_FS)
  868. thread_set_flag (runq->current, THREAD_YIELD);
  869. }
  870. static void
  871. thread_sched_fs_add (struct thread_runq *runq, struct thread *thread)
  872. {
  873. if (!runq->fs_weight)
  874. runq->fs_round = thread_fs_highest_round;
  875. uint32_t total_weight = runq->fs_weight + thread->fs_data.weight;
  876. // TODO Limit the maximum number of threads to prevent this situation.
  877. if (total_weight < runq->fs_weight)
  878. panic ("thread: weight overflow");
  879. runq->fs_weight = total_weight;
  880. thread_sched_fs_enqueue (runq->fs_runq_active, runq->fs_round, thread);
  881. thread_sched_fs_restart (runq);
  882. }
  883. static void
  884. thread_sched_fs_dequeue (struct thread *thread)
  885. {
  886. assert (thread->fs_data.fs_runq);
  887. _Auto fs_runq = thread->fs_data.fs_runq;
  888. _Auto group = &fs_runq->group_array[thread_real_priority (thread)];
  889. thread->fs_data.fs_runq = NULL;
  890. list_remove (&thread->fs_data.runq_node);
  891. list_remove (&thread->fs_data.group_node);
  892. fs_runq->work -= thread->fs_data.work;
  893. group->work -= thread->fs_data.work;
  894. fs_runq->weight -= thread->fs_data.weight;
  895. group->weight -= thread->fs_data.weight;
  896. --fs_runq->nr_threads;
  897. if (!group->weight)
  898. list_remove (&group->node);
  899. else
  900. {
  901. struct list *node = list_next (&group->node),
  902. *init_node = node;
  903. while (!list_end (&fs_runq->groups, node))
  904. {
  905. _Auto tmp = list_entry (node, struct thread_fs_group, node);
  906. if (tmp->weight <= group->weight)
  907. break;
  908. node = list_next (node);
  909. }
  910. if (node != init_node)
  911. {
  912. list_remove (&group->node);
  913. list_insert_before (&group->node, node);
  914. }
  915. }
  916. }
  917. static void
  918. thread_sched_fs_remove (struct thread_runq *runq, struct thread *thread)
  919. {
  920. runq->fs_weight -= thread->fs_data.weight;
  921. _Auto fs_runq = thread->fs_data.fs_runq;
  922. thread_sched_fs_dequeue (thread);
  923. if (fs_runq != runq->fs_runq_active)
  924. ;
  925. else if (!fs_runq->nr_threads)
  926. thread_runq_wakeup_balancer (runq);
  927. else
  928. thread_sched_fs_restart (runq);
  929. }
  930. static void
  931. thread_sched_fs_deactivate (struct thread_runq *runq, struct thread *thread)
  932. {
  933. assert (thread->fs_data.fs_runq == runq->fs_runq_active);
  934. assert (thread->fs_data.round == runq->fs_round);
  935. thread_sched_fs_dequeue (thread);
  936. ++thread->fs_data.round;
  937. thread->fs_data.work -= thread->fs_data.weight;
  938. thread_sched_fs_enqueue (runq->fs_runq_expired, runq->fs_round + 1, thread);
  939. if (!runq->fs_runq_active->nr_threads)
  940. thread_runq_wakeup_balancer (runq);
  941. }
  942. static void
  943. thread_sched_fs_put_prev (struct thread_runq *runq, struct thread *thread)
  944. {
  945. _Auto fs_runq = runq->fs_runq_active;
  946. assert (thread_real_priority (thread) < ARRAY_SIZE (fs_runq->group_array));
  947. _Auto group = &fs_runq->group_array[thread_real_priority (thread)];
  948. list_insert_tail (&group->threads, &thread->fs_data.group_node);
  949. if (thread->fs_data.work >= thread->fs_data.weight)
  950. thread_sched_fs_deactivate (runq, thread);
  951. }
  952. static int
  953. thread_sched_fs_ratio_exceeded (struct thread_fs_group *current,
  954. struct thread_fs_group *next)
  955. {
  956. #ifndef __LP64__
  957. if (likely (current->weight < 0x10000 && next->weight < 0x10000))
  958. {
  959. uint32_t ia = (current->work + 1) * next->weight,
  960. ib = (next->work + 1) * current->weight;
  961. return (ia > ib);
  962. }
  963. #endif
  964. uint64_t a = ((uint64_t)current->work + 1) * next->weight,
  965. b = ((uint64_t)next->work + 1) * current->weight;
  966. return (a > b);
  967. }
  968. static struct thread*
  969. thread_sched_fs_get_next (struct thread_runq *runq)
  970. {
  971. _Auto fs_runq = runq->fs_runq_active;
  972. if (!fs_runq->nr_threads)
  973. return (NULL);
  974. _Auto group = fs_runq->current;
  975. struct list *node = list_next (&group->node);
  976. if (list_end (&fs_runq->groups, node))
  977. group = list_entry (list_first (&fs_runq->groups),
  978. struct thread_fs_group, node);
  979. else
  980. {
  981. _Auto next = list_entry (node, struct thread_fs_group, node);
  982. group = thread_sched_fs_ratio_exceeded (group, next) ?
  983. next : list_entry (list_first (&fs_runq->groups),
  984. struct thread_fs_group, node);
  985. }
  986. fs_runq->current = group;
  987. return (list_pop (&group->threads, struct thread, fs_data.group_node));
  988. }
  989. static void
  990. thread_sched_fs_reset_priority (struct thread *thread, uint16_t priority)
  991. {
  992. assert (priority <= THREAD_SCHED_FS_PRIO_MAX);
  993. thread->fs_data.fs_runq = NULL;
  994. thread->fs_data.round = 0;
  995. thread->fs_data.weight = thread_sched_fs_prio2weight (priority);
  996. thread->fs_data.work = 0;
  997. }
  998. static void
  999. thread_sched_fs_update_priority (struct thread *thread, uint16_t priority)
  1000. {
  1001. assert (priority <= THREAD_SCHED_FS_PRIO_MAX);
  1002. thread->fs_data.weight = thread_sched_fs_prio2weight (priority);
  1003. if (thread->fs_data.work >= thread->fs_data.weight)
  1004. thread->fs_data.work = thread->fs_data.weight;
  1005. }
  1006. static uint32_t
  1007. thread_sched_fs_get_global_priority (uint16_t priority __unused)
  1008. {
  1009. return (THREAD_SCHED_GLOBAL_PRIO_FS);
  1010. }
  1011. static void
  1012. thread_sched_fs_set_next (struct thread_runq *rq __unused, struct thread *thr)
  1013. {
  1014. list_remove (&thr->fs_data.group_node);
  1015. }
  1016. static void
  1017. thread_sched_fs_tick (struct thread_runq *runq, struct thread *thread)
  1018. {
  1019. _Auto fs_runq = runq->fs_runq_active;
  1020. ++fs_runq->work;
  1021. _Auto group = &fs_runq->group_array[thread_real_priority (thread)];
  1022. ++group->work;
  1023. thread_set_flag (thread, THREAD_YIELD);
  1024. ++thread->fs_data.work;
  1025. }
  1026. static void
  1027. thread_sched_fs_start_next_round (struct thread_runq *runq)
  1028. {
  1029. _Auto tmp = runq->fs_runq_expired;
  1030. runq->fs_runq_expired = runq->fs_runq_active;
  1031. runq->fs_runq_active = tmp;
  1032. if (runq->fs_runq_active->nr_threads)
  1033. {
  1034. ++runq->fs_round;
  1035. ssize_t delta = (ssize_t)(runq->fs_round - thread_fs_highest_round);
  1036. if (delta > 0)
  1037. thread_fs_highest_round = runq->fs_round;
  1038. thread_sched_fs_restart (runq);
  1039. }
  1040. }
  1041. // Check that a remote run queue satisfies the minimum migration requirements.
  1042. static int
  1043. thread_sched_fs_balance_eligible (struct thread_runq *runq,
  1044. size_t highest_round)
  1045. {
  1046. if (!runq->fs_weight ||
  1047. (runq->fs_round != highest_round &&
  1048. runq->fs_round != highest_round - 1))
  1049. return (0);
  1050. uint32_t nr_threads = runq->fs_runq_active->nr_threads +
  1051. runq->fs_runq_expired->nr_threads;
  1052. if (! nr_threads ||
  1053. (nr_threads == 1 &&
  1054. thread_real_sched_class (runq->current) == THREAD_SCHED_CLASS_FS))
  1055. return (0);
  1056. return (1);
  1057. }
  1058. // Try to find the most suitable run queue from which to pull threads.
  1059. static struct thread_runq*
  1060. thread_sched_fs_balance_scan (struct thread_runq *runq,
  1061. size_t highest_round)
  1062. {
  1063. struct thread_runq *remote_runq = NULL;
  1064. cpu_flags_t flags;
  1065. thread_preempt_disable_intr_save (&flags);
  1066. cpumap_for_each (&thread_active_runqs, i)
  1067. {
  1068. _Auto tmp = percpu_ptr (thread_runq, i);
  1069. if (tmp == runq)
  1070. continue;
  1071. spinlock_lock (&tmp->lock);
  1072. if (!thread_sched_fs_balance_eligible (tmp, highest_round))
  1073. {
  1074. spinlock_unlock (&tmp->lock);
  1075. continue;
  1076. }
  1077. else if (! remote_runq)
  1078. {
  1079. remote_runq = tmp;
  1080. continue;
  1081. }
  1082. else if (tmp->fs_weight > remote_runq->fs_weight)
  1083. {
  1084. spinlock_unlock (&remote_runq->lock);
  1085. remote_runq = tmp;
  1086. continue;
  1087. }
  1088. spinlock_unlock (&tmp->lock);
  1089. }
  1090. if (remote_runq)
  1091. spinlock_unlock (&remote_runq->lock);
  1092. thread_preempt_enable_intr_restore (flags);
  1093. return (remote_runq);
  1094. }
  1095. static uint32_t
  1096. thread_sched_fs_balance_pull (struct thread_runq *runq,
  1097. struct thread_runq *remote_runq,
  1098. struct thread_fs_runq *fs_runq,
  1099. uint32_t nr_pulls)
  1100. {
  1101. int cpu = thread_runq_cpu (runq);
  1102. struct thread *thread, *tmp;
  1103. list_for_each_entry_safe (&fs_runq->threads, thread, tmp,
  1104. fs_data.runq_node)
  1105. {
  1106. if (thread == remote_runq->current)
  1107. continue;
  1108. /*
  1109. * The pin level is changed without explicit synchronization.
  1110. * However, it can only be changed by its owning thread. As threads
  1111. * currently running aren't considered for migration, the thread had
  1112. * to be preempted and invoke the scheduler. Since balancer threads
  1113. * acquire the run queue lock, there is strong ordering between
  1114. * changing the pin level and setting the current thread of a
  1115. * run queue.
  1116. *
  1117. * TODO Review comment.
  1118. */
  1119. if (thread->pin_level || !cpumap_test (&thread->cpumap, cpu))
  1120. continue;
  1121. /*
  1122. * Make sure at least one thread is pulled if possible. If one or more
  1123. * thread has already been pulled, take weights into account.
  1124. */
  1125. if (nr_pulls &&
  1126. runq->fs_weight + thread->fs_data.weight >
  1127. remote_runq->fs_weight - thread->fs_data.weight)
  1128. break;
  1129. thread_runq_remove (remote_runq, thread);
  1130. // Don't discard the work already accounted for.
  1131. thread->fs_data.round = runq->fs_round;
  1132. thread_runq_add (runq, thread);
  1133. if (++nr_pulls == THREAD_MAX_MIGRATIONS)
  1134. break;
  1135. }
  1136. return (nr_pulls);
  1137. }
  1138. static uint32_t
  1139. thread_sched_fs_balance_migrate (struct thread_runq *runq,
  1140. struct thread_runq *remote_runq,
  1141. size_t highest_round)
  1142. {
  1143. uint32_t nr_pulls = 0;
  1144. if (!thread_sched_fs_balance_eligible (remote_runq, highest_round))
  1145. return (nr_pulls);
  1146. nr_pulls = thread_sched_fs_balance_pull (runq, remote_runq,
  1147. remote_runq->fs_runq_active, 0);
  1148. if (nr_pulls == THREAD_MAX_MIGRATIONS)
  1149. return (nr_pulls);
  1150. /*
  1151. * Threads in the expired queue of a processor in round highest are
  1152. * actually in round highest + 1.
  1153. */
  1154. if (remote_runq->fs_round != highest_round)
  1155. nr_pulls = thread_sched_fs_balance_pull (runq, remote_runq,
  1156. remote_runq->fs_runq_expired,
  1157. nr_pulls);
  1158. return (nr_pulls);
  1159. }
  1160. /*
  1161. * Inter-processor load balancing for fair-scheduling threads.
  1162. *
  1163. * Preemption must be disabled, and the local run queue must be locked when
  1164. * calling this function. If balancing actually occurs, the lock will be
  1165. * released and preemption enabled when needed.
  1166. */
  1167. static void
  1168. thread_sched_fs_balance (struct thread_runq *runq, cpu_flags_t *flags)
  1169. {
  1170. /*
  1171. * Grab the highest round now and only use the copy so the value is stable
  1172. * during the balancing operation.
  1173. */
  1174. size_t highest_round = thread_fs_highest_round;
  1175. if (runq->fs_round != highest_round &&
  1176. runq->fs_runq_expired->nr_threads)
  1177. goto no_migration;
  1178. spinlock_unlock_intr_restore (&runq->lock, *flags);
  1179. thread_preempt_enable ();
  1180. uint32_t nr_migrations;
  1181. _Auto remote_runq = thread_sched_fs_balance_scan (runq, highest_round);
  1182. if (remote_runq)
  1183. {
  1184. thread_preempt_disable_intr_save (flags);
  1185. thread_runq_double_lock (runq, remote_runq);
  1186. nr_migrations = thread_sched_fs_balance_migrate (runq, remote_runq,
  1187. highest_round);
  1188. spinlock_unlock (&remote_runq->lock);
  1189. if (nr_migrations)
  1190. return;
  1191. spinlock_unlock_intr_restore (&runq->lock, *flags);
  1192. thread_preempt_enable ();
  1193. }
  1194. /*
  1195. * The scan or the migration failed. As a fallback, make another, simpler
  1196. * pass on every run queue, and stop as soon as at least one thread could
  1197. * be successfully pulled.
  1198. */
  1199. cpumap_for_each (&thread_active_runqs, i)
  1200. {
  1201. remote_runq = percpu_ptr (thread_runq, i);
  1202. if (remote_runq == runq)
  1203. continue;
  1204. thread_preempt_disable_intr_save (flags);
  1205. thread_runq_double_lock (runq, remote_runq);
  1206. nr_migrations = thread_sched_fs_balance_migrate (runq, remote_runq,
  1207. highest_round);
  1208. spinlock_unlock (&remote_runq->lock);
  1209. if (nr_migrations != 0)
  1210. return;
  1211. spinlock_unlock_intr_restore (&runq->lock, *flags);
  1212. thread_preempt_enable ();
  1213. }
  1214. thread_preempt_disable ();
  1215. spinlock_lock_intr_save (&runq->lock, flags);
  1216. no_migration:
  1217. /*
  1218. * No thread could be migrated. Check the active run queue, as another
  1219. * processor might have added threads while the balancer was running.
  1220. * If the run queue is still empty, switch to the next round. The run
  1221. * queue lock must remain held until the next scheduling decision to
  1222. * prevent a remote balancer thread from stealing active threads.
  1223. */
  1224. if (!runq->fs_runq_active->nr_threads)
  1225. thread_sched_fs_start_next_round (runq);
  1226. }
  1227. static struct thread_runq*
  1228. thread_sched_idle_select_runq (struct thread *thread __unused)
  1229. {
  1230. panic ("thread: idler threads cannot be awoken");
  1231. }
  1232. static noreturn void
  1233. thread_sched_idle_panic (void)
  1234. {
  1235. panic ("thread: only idle threads are allowed in the idle class");
  1236. }
  1237. static void
  1238. thread_sched_idle_add (struct thread_runq *runq __unused,
  1239. struct thread *thread __unused)
  1240. {
  1241. thread_sched_idle_panic ();
  1242. }
  1243. #define thread_sched_idle_remove thread_sched_idle_add
  1244. static struct thread*
  1245. thread_sched_idle_get_next (struct thread_runq *runq)
  1246. {
  1247. return (runq->idler);
  1248. }
  1249. static uint32_t
  1250. thread_sched_idle_get_global_priority (uint16_t priority __unused)
  1251. {
  1252. return (THREAD_SCHED_GLOBAL_PRIO_IDLE);
  1253. }
  1254. static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] =
  1255. {
  1256. [THREAD_SCHED_CLASS_RT] =
  1257. {
  1258. .select_runq = thread_sched_rt_select_runq,
  1259. .add = thread_sched_rt_add,
  1260. .remove = thread_sched_rt_remove,
  1261. .put_prev = thread_sched_rt_put_prev,
  1262. .get_next = thread_sched_rt_get_next,
  1263. .reset_priority = thread_sched_rt_reset_priority,
  1264. .update_priority = NULL,
  1265. .get_global_priority = thread_sched_rt_get_global_priority,
  1266. .set_next = thread_sched_rt_set_next,
  1267. .tick = thread_sched_rt_tick,
  1268. },
  1269. [THREAD_SCHED_CLASS_FS] =
  1270. {
  1271. .select_runq = thread_sched_fs_select_runq,
  1272. .add = thread_sched_fs_add,
  1273. .remove = thread_sched_fs_remove,
  1274. .put_prev = thread_sched_fs_put_prev,
  1275. .get_next = thread_sched_fs_get_next,
  1276. .reset_priority = thread_sched_fs_reset_priority,
  1277. .update_priority = thread_sched_fs_update_priority,
  1278. .get_global_priority = thread_sched_fs_get_global_priority,
  1279. .set_next = thread_sched_fs_set_next,
  1280. .tick = thread_sched_fs_tick,
  1281. },
  1282. [THREAD_SCHED_CLASS_IDLE] =
  1283. {
  1284. .select_runq = thread_sched_idle_select_runq,
  1285. .add = thread_sched_idle_add,
  1286. .remove = thread_sched_idle_remove,
  1287. .put_prev = NULL,
  1288. .get_next = thread_sched_idle_get_next,
  1289. .reset_priority = NULL,
  1290. .update_priority = NULL,
  1291. .get_global_priority = thread_sched_idle_get_global_priority,
  1292. .set_next = NULL,
  1293. .tick = NULL,
  1294. },
  1295. };
  1296. static void
  1297. thread_set_user_sched_policy (struct thread *thread, uint8_t sched_policy)
  1298. {
  1299. thread->user_sched_data.sched_policy = sched_policy;
  1300. }
  1301. static void
  1302. thread_set_user_sched_class (struct thread *thread, uint8_t sched_class)
  1303. {
  1304. thread->user_sched_data.sched_class = sched_class;
  1305. }
  1306. static void
  1307. thread_set_user_priority (struct thread *thread, uint16_t prio)
  1308. {
  1309. const _Auto ops = thread_get_user_sched_ops (thread);
  1310. thread->user_sched_data.priority = prio;
  1311. thread->user_sched_data.global_priority = ops->get_global_priority (prio);
  1312. }
  1313. static void
  1314. thread_update_user_priority (struct thread *thread, uint16_t priority)
  1315. {
  1316. thread_set_user_priority (thread, priority);
  1317. }
  1318. static void
  1319. thread_set_real_sched_policy (struct thread *thread, uint8_t sched_policy)
  1320. {
  1321. thread->real_sched_data.sched_policy = sched_policy;
  1322. }
  1323. static void
  1324. thread_set_real_sched_class (struct thread *thread, uint8_t sched_class)
  1325. {
  1326. thread->real_sched_data.sched_class = sched_class;
  1327. }
  1328. static void
  1329. thread_set_real_priority (struct thread *thread, uint16_t prio)
  1330. {
  1331. const _Auto ops = thread_get_real_sched_ops (thread);
  1332. thread->real_sched_data.priority = prio;
  1333. thread->real_sched_data.global_priority = ops->get_global_priority (prio);
  1334. if (ops->reset_priority)
  1335. ops->reset_priority (thread, prio);
  1336. }
  1337. static void
  1338. thread_update_real_priority (struct thread *thread, uint16_t prio)
  1339. {
  1340. const _Auto ops = thread_get_real_sched_ops (thread);
  1341. thread->real_sched_data.priority = prio;
  1342. thread->real_sched_data.global_priority = ops->get_global_priority (prio);
  1343. if (ops->update_priority)
  1344. ops->update_priority (thread, prio);
  1345. }
  1346. static void
  1347. thread_reset_real_priority (struct thread *thread)
  1348. {
  1349. thread->real_sched_data = thread->user_sched_data;
  1350. thread->boosted = false;
  1351. const _Auto ops = thread_get_user_sched_ops (thread);
  1352. if (ops->reset_priority)
  1353. ops->reset_priority (thread, thread->real_sched_data.priority);
  1354. }
  1355. static void __init
  1356. thread_init_booter (uint32_t cpu)
  1357. {
  1358. // Initialize only what's needed during bootstrap.
  1359. struct thread *booter = &thread_booters[cpu];
  1360. booter->kuid.id = 0;
  1361. booter->kuid.nr_refs = 0; // Make sure booters aren't destroyed.
  1362. booter->flags = 0;
  1363. booter->intr_level = 0;
  1364. booter->preempt_level = 1;
  1365. booter->pagefault_level = 0;
  1366. rcu_reader_init (&booter->rcu_reader);
  1367. cpumap_fill (&booter->cpumap);
  1368. thread_set_user_sched_policy (booter, THREAD_SCHED_POLICY_IDLE);
  1369. thread_set_user_sched_class (booter, THREAD_SCHED_CLASS_IDLE);
  1370. thread_set_user_priority (booter, 0);
  1371. thread_reset_real_priority (booter);
  1372. booter->task = booter->xtask = task_get_kernel_task ();
  1373. snprintf (booter->name, sizeof (booter->name),
  1374. THREAD_KERNEL_PREFIX "thread_boot/%u", cpu);
  1375. }
  1376. static int __init
  1377. thread_setup_booter (void)
  1378. {
  1379. tcb_set_current (&thread_booters[0].tcb);
  1380. thread_init_booter (0);
  1381. return (0);
  1382. }
  1383. INIT_OP_DEFINE (thread_setup_booter,
  1384. INIT_OP_DEP (tcb_setup, true));
  1385. static int __init
  1386. thread_bootstrap (void)
  1387. {
  1388. cpumap_zero (&thread_active_runqs);
  1389. cpumap_zero (&thread_idle_runqs);
  1390. thread_fs_highest_round = THREAD_FS_INITIAL_ROUND;
  1391. cpumap_set (&thread_active_runqs, 0);
  1392. thread_runq_init (cpu_local_ptr (thread_runq), 0, &thread_booters[0]);
  1393. return (0);
  1394. }
  1395. INIT_OP_DEFINE (thread_bootstrap,
  1396. INIT_OP_DEP (syscnt_setup, true),
  1397. INIT_OP_DEP (thread_setup_booter, true));
  1398. void
  1399. thread_main (void (*fn) (void *), void *arg)
  1400. {
  1401. assert (!cpu_intr_enabled ());
  1402. assert (!thread_preempt_enabled ());
  1403. struct thread *thread = thread_self ();
  1404. thread_runq_schedule_load (thread);
  1405. spinlock_unlock (&thread_runq_local()->lock);
  1406. cpu_intr_enable ();
  1407. thread_preempt_enable ();
  1408. fn (arg);
  1409. thread_exit ();
  1410. }
  1411. static int
  1412. thread_init (struct thread *thread, void *stack,
  1413. const struct thread_attr *attr,
  1414. void (*fn) (void *), void *arg)
  1415. {
  1416. struct thread *caller = thread_self ();
  1417. struct task *task = attr->task ?: caller->task;
  1418. struct cpumap *cpumap = attr->cpumap ?: &caller->cpumap;
  1419. assert (attr->policy < ARRAY_SIZE (thread_policy_table));
  1420. kuid_head_init (&thread->kuid);
  1421. thread->flags = 0;
  1422. thread->runq = NULL;
  1423. thread->in_runq = false;
  1424. thread_set_wchan (thread, thread, "init");
  1425. thread->state = THREAD_SLEEPING;
  1426. thread->priv_sleepq = sleepq_create ();
  1427. int error;
  1428. if (!thread->priv_sleepq)
  1429. {
  1430. error = ENOMEM;
  1431. goto error_sleepq;
  1432. }
  1433. thread->priv_turnstile = turnstile_create ();
  1434. if (!thread->priv_turnstile)
  1435. {
  1436. error = ENOMEM;
  1437. goto error_turnstile;
  1438. }
  1439. turnstile_td_init (&thread->turnstile_td);
  1440. thread->propagate_priority = false;
  1441. thread->suspend = false;
  1442. thread->preempt_level = THREAD_SUSPEND_PREEMPT_LEVEL;
  1443. thread->pin_level = 0;
  1444. thread->intr_level = 0;
  1445. thread->pagefault_level = 0;
  1446. rcu_reader_init (&thread->rcu_reader);
  1447. cpumap_copy (&thread->cpumap, cpumap);
  1448. thread_set_user_sched_policy (thread, attr->policy);
  1449. thread_set_user_sched_class (thread, thread_policy_to_class (attr->policy));
  1450. thread_set_user_priority (thread, attr->priority);
  1451. thread_reset_real_priority (thread);
  1452. thread->join_waiter = NULL;
  1453. spinlock_init (&thread->join_lock);
  1454. thread->terminating = false;
  1455. thread->task = thread->xtask = task;
  1456. thread->stack = stack;
  1457. strlcpy (thread->name, attr->name, sizeof (thread->name));
  1458. thread->fixup = NULL;
  1459. thread->cur_lpad = NULL;
  1460. thread->futex_td = NULL;
  1461. bulletin_init (&thread->dead_subs);
  1462. for (int i = 0; i < (int)ARRAY_SIZE (thread->pmap_windows); ++i)
  1463. thread->pmap_windows[i] = NULL;
  1464. #ifdef CONFIG_PERFMON
  1465. perfmon_td_init (thread_get_perfmon_td (thread));
  1466. #endif
  1467. if (attr->flags & THREAD_ATTR_DETACHED)
  1468. thread->flags |= THREAD_DETACHED;
  1469. error = tcb_build (&thread->tcb, stack, fn, arg);
  1470. if (error)
  1471. goto error_tcb;
  1472. else if (thread->task != task_get_kernel_task ())
  1473. {
  1474. error = kuid_alloc (&thread->kuid, KUID_THREAD);
  1475. if (error)
  1476. goto error_kuid;
  1477. }
  1478. task_add_thread (task, thread);
  1479. return (0);
  1480. error_kuid:
  1481. tcb_cleanup (&thread->tcb);
  1482. error_tcb:
  1483. turnstile_destroy (thread->priv_turnstile);
  1484. error_turnstile:
  1485. sleepq_destroy (thread->priv_sleepq);
  1486. error_sleepq:
  1487. return (error);
  1488. }
  1489. #ifdef CONFIG_THREAD_STACK_GUARD
  1490. #include <machine/pmap.h>
  1491. #include <vm/kmem.h>
  1492. #include <vm/page.h>
  1493. static void*
  1494. thread_alloc_stack (void)
  1495. {
  1496. _Auto kernel_pmap = pmap_get_kernel_pmap ();
  1497. size_t stack_size = vm_page_round (TCB_STACK_SIZE);
  1498. void *mem = vm_kmem_alloc ((PAGE_SIZE * 2) + stack_size);
  1499. if (! mem)
  1500. return (NULL);
  1501. uintptr_t va = (uintptr_t)mem;
  1502. /*
  1503. * TODO Until memory protection is implemented, use the pmap system
  1504. * to remove mappings.
  1505. */
  1506. phys_addr_t first_pa, last_pa;
  1507. int error = pmap_kextract (va, &first_pa);
  1508. assert (! error);
  1509. error = pmap_kextract (va + PAGE_SIZE + stack_size, &last_pa);
  1510. assert (! error);
  1511. _Auto first_page = vm_page_lookup (first_pa);
  1512. assert (first_page);
  1513. _Auto last_page = vm_page_lookup (last_pa);
  1514. assert (last_page);
  1515. pmap_remove (kernel_pmap, va, PMAP_PEF_GLOBAL);
  1516. pmap_remove (kernel_pmap, va + PAGE_SIZE + stack_size,
  1517. PMAP_PEF_GLOBAL | PMAP_IGNORE_ERRORS);
  1518. pmap_update (kernel_pmap);
  1519. return ((char *)va + PAGE_SIZE);
  1520. }
  1521. static void
  1522. thread_free_stack (void *stack)
  1523. {
  1524. size_t stack_size = vm_page_round (TCB_STACK_SIZE);
  1525. void *va = (char *)stack - PAGE_SIZE;
  1526. vm_kmem_free (va, (PAGE_SIZE * 2) + stack_size);
  1527. }
  1528. #else // CONFIG_THREAD_STACK_GUARD
  1529. static void*
  1530. thread_alloc_stack (void)
  1531. {
  1532. return (kmem_cache_alloc (&thread_stack_cache));
  1533. }
  1534. static void
  1535. thread_free_stack (void *stack)
  1536. {
  1537. kmem_cache_free (&thread_stack_cache, stack);
  1538. }
  1539. #endif
  1540. static void
  1541. thread_destroy (struct thread *thread)
  1542. {
  1543. assert (thread != thread_self ());
  1544. assert (thread->state == THREAD_DEAD);
  1545. cap_notify_dead (&thread->dead_subs);
  1546. // See task_info().
  1547. task_remove_thread (thread->task, thread);
  1548. kuid_remove (&thread->kuid, KUID_THREAD);
  1549. turnstile_destroy (thread->priv_turnstile);
  1550. sleepq_destroy (thread->priv_sleepq);
  1551. thread_free_stack (thread->stack);
  1552. tcb_cleanup (&thread->tcb);
  1553. kmem_cache_free (&thread_cache, thread);
  1554. }
  1555. static void
  1556. thread_join_common (struct thread *thread)
  1557. {
  1558. struct thread *self = thread_self ();
  1559. assert (thread != self);
  1560. spinlock_lock (&thread->join_lock);
  1561. assert (!thread->join_waiter);
  1562. thread->join_waiter = self;
  1563. while (!thread->terminating)
  1564. thread_sleep (&thread->join_lock, thread, "exit");
  1565. spinlock_unlock (&thread->join_lock);
  1566. uint32_t state;
  1567. do
  1568. {
  1569. struct thread_runq_guard g = thread_runq_guard_make (thread, false);
  1570. state = thread->state;
  1571. }
  1572. while (state != THREAD_DEAD);
  1573. thread_destroy (thread);
  1574. }
  1575. void
  1576. thread_terminate (struct thread *thread)
  1577. {
  1578. SPINLOCK_GUARD (&thread->join_lock);
  1579. thread->terminating = true;
  1580. thread_wakeup (thread->join_waiter);
  1581. }
  1582. static void
  1583. thread_balance_idle_tick (struct thread_runq *runq)
  1584. {
  1585. assert (runq->idle_balance_ticks != 0);
  1586. /*
  1587. * Interrupts can occur early, at a time the balancer thread hasn't been
  1588. * created yet.
  1589. */
  1590. if (runq->balancer && --runq->idle_balance_ticks == 0)
  1591. thread_runq_wakeup_balancer (runq);
  1592. }
  1593. static void
  1594. thread_balance (void *arg)
  1595. {
  1596. struct thread_runq *runq = arg;
  1597. struct thread *self = runq->balancer;
  1598. assert (self == runq->balancer);
  1599. thread_preempt_disable ();
  1600. cpu_flags_t flags;
  1601. spinlock_lock_intr_save (&runq->lock, &flags);
  1602. while (1)
  1603. {
  1604. runq->idle_balance_ticks = THREAD_IDLE_BALANCE_TICKS;
  1605. thread_set_wchan (self, runq, "runq");
  1606. atomic_store_rlx (&self->state, THREAD_SLEEPING);
  1607. runq = thread_runq_schedule (runq);
  1608. assert (runq == arg);
  1609. /*
  1610. * This function may temporarily enable preemption and release the
  1611. * run queue lock, but on return, the lock must remain held until this
  1612. * balancer thread sleeps.
  1613. */
  1614. thread_sched_fs_balance (runq, &flags);
  1615. }
  1616. }
  1617. static void __init
  1618. thread_setup_balancer (struct thread_runq *runq)
  1619. {
  1620. struct cpumap *cpumap;
  1621. if (cpumap_create (&cpumap) != 0)
  1622. panic ("thread: unable to create balancer thread CPU map");
  1623. cpumap_zero (cpumap);
  1624. cpumap_set (cpumap, thread_runq_cpu (runq));
  1625. char name[THREAD_NAME_SIZE];
  1626. snprintf (name, sizeof (name), THREAD_KERNEL_PREFIX "thread_balance/%u",
  1627. thread_runq_cpu (runq));
  1628. struct thread_attr attr;
  1629. thread_attr_init (&attr, name);
  1630. thread_attr_set_cpumap (&attr, cpumap);
  1631. thread_attr_set_policy (&attr, THREAD_SCHED_POLICY_FIFO);
  1632. thread_attr_set_priority (&attr, THREAD_SCHED_RT_PRIO_MIN);
  1633. int error = thread_create (&runq->balancer, &attr, thread_balance, runq);
  1634. cpumap_destroy (cpumap);
  1635. if (error)
  1636. panic ("thread: unable to create balancer thread");
  1637. }
  1638. static void
  1639. thread_idle (void *arg __unused)
  1640. {
  1641. struct thread *self = thread_self ();
  1642. while (1)
  1643. {
  1644. thread_preempt_disable ();
  1645. while (1)
  1646. {
  1647. cpu_intr_disable ();
  1648. if (thread_test_flag (self, THREAD_YIELD))
  1649. {
  1650. cpu_intr_enable ();
  1651. break;
  1652. }
  1653. cpu_idle ();
  1654. }
  1655. thread_preempt_enable ();
  1656. }
  1657. }
  1658. static void __init
  1659. thread_setup_idler (struct thread_runq *runq)
  1660. {
  1661. struct cpumap *cpumap;
  1662. if (cpumap_create (&cpumap) != 0)
  1663. panic ("thread: unable to allocate idler thread CPU map");
  1664. cpumap_zero (cpumap);
  1665. cpumap_set (cpumap, thread_runq_cpu (runq));
  1666. struct thread *idler = kmem_cache_alloc (&thread_cache);
  1667. if (! idler)
  1668. panic ("thread: unable to allocate idler thread");
  1669. void *stack = thread_alloc_stack ();
  1670. if (! stack)
  1671. panic ("thread: unable to allocate idler thread stack");
  1672. char name[THREAD_NAME_SIZE];
  1673. snprintf (name, sizeof (name), THREAD_KERNEL_PREFIX "thread_idle/%u",
  1674. thread_runq_cpu (runq));
  1675. struct thread_attr attr;
  1676. thread_attr_init (&attr, name);
  1677. thread_attr_set_cpumap (&attr, cpumap);
  1678. thread_attr_set_policy (&attr, THREAD_SCHED_POLICY_IDLE);
  1679. if (thread_init (idler, stack, &attr, thread_idle, NULL) != 0)
  1680. panic ("thread: unable to initialize idler thread");
  1681. cpumap_destroy (cpumap);
  1682. // An idler thread needs special tuning.
  1683. thread_clear_wchan (idler);
  1684. idler->state = THREAD_RUNNING;
  1685. idler->runq = runq;
  1686. runq->idler = idler;
  1687. }
  1688. static void __init
  1689. thread_setup_runq (struct thread_runq *runq)
  1690. {
  1691. thread_setup_balancer (runq);
  1692. thread_setup_idler (runq);
  1693. }
  1694. #ifdef CONFIG_SHELL
  1695. /*
  1696. * This function is meant for debugging only. As a result, it uses a weak
  1697. * locking policy which allows tracing threads which state may mutate during
  1698. * tracing.
  1699. */
  1700. static void
  1701. thread_shell_trace (struct shell *shell, int argc, char **argv)
  1702. {
  1703. if (argc != 3)
  1704. {
  1705. stream_puts (shell->stream, "usage: thread_trace task thread\n");
  1706. return;
  1707. }
  1708. const char *task_name = argv[1], *thread_name = argv[2];
  1709. struct task *task = task_lookup (task_name);
  1710. if (! task)
  1711. {
  1712. fmt_xprintf (shell->stream, "thread_trace: task not found: %s\n",
  1713. task_name);
  1714. return;
  1715. }
  1716. struct thread *thread = task_lookup_thread (task, thread_name);
  1717. task_unref (task);
  1718. if (! thread)
  1719. {
  1720. fmt_xprintf (shell->stream, "thread_trace: thread not found: %s\n",
  1721. thread_name);
  1722. return;
  1723. }
  1724. cpu_flags_t flags;
  1725. _Auto runq = thread_lock_runq (thread, &flags);
  1726. if (thread == runq->current)
  1727. stream_puts (shell->stream, "thread_trace: thread is running\n");
  1728. else
  1729. tcb_trace (&thread->tcb);
  1730. thread_unlock_runq (runq, flags);
  1731. thread_unref (thread);
  1732. }
  1733. static struct shell_cmd thread_shell_cmds[] =
  1734. {
  1735. SHELL_CMD_INITIALIZER ("thread_trace", thread_shell_trace,
  1736. "thread_trace <task_name> <thread_name>",
  1737. "display the stack trace of a given thread"),
  1738. };
  1739. static int __init
  1740. thread_setup_shell (void)
  1741. {
  1742. SHELL_REGISTER_CMDS (thread_shell_cmds, shell_get_main_cmd_set ());
  1743. return (0);
  1744. }
  1745. INIT_OP_DEFINE (thread_setup_shell,
  1746. INIT_OP_DEP (printf_setup, true),
  1747. INIT_OP_DEP (shell_setup, true),
  1748. INIT_OP_DEP (task_setup, true),
  1749. INIT_OP_DEP (thread_setup, true));
  1750. #endif
  1751. static void __init
  1752. thread_setup_common (uint32_t cpu)
  1753. {
  1754. assert (cpu);
  1755. cpumap_set (&thread_active_runqs, cpu);
  1756. thread_init_booter (cpu);
  1757. thread_runq_init (percpu_ptr (thread_runq, cpu), cpu, &thread_booters[cpu]);
  1758. }
  1759. static int __init
  1760. thread_setup (void)
  1761. {
  1762. for (uint32_t cpu = 1; cpu < cpu_count (); ++cpu)
  1763. thread_setup_common (cpu);
  1764. kmem_cache_init (&thread_cache, "thread", sizeof (struct thread),
  1765. CPU_L1_SIZE, NULL, 0);
  1766. #ifndef CONFIG_THREAD_STACK_GUARD
  1767. kmem_cache_init (&thread_stack_cache, "thread_stack", TCB_STACK_SIZE,
  1768. CPU_DATA_ALIGN, NULL, 0);
  1769. #endif
  1770. cpumap_for_each (&thread_active_runqs, cpu)
  1771. thread_setup_runq (percpu_ptr (thread_runq, cpu));
  1772. return (0);
  1773. }
  1774. #ifdef CONFIG_THREAD_STACK_GUARD
  1775. #define THREAD_STACK_GUARD_INIT_OP_DEPS \
  1776. INIT_OP_DEP (vm_kmem_setup, true), \
  1777. INIT_OP_DEP (vm_map_setup, true), \
  1778. INIT_OP_DEP (vm_page_setup, true),
  1779. #else
  1780. #define THREAD_STACK_GUARD_INIT_OP_DEPS
  1781. #endif
  1782. #ifdef CONFIG_PERFMON
  1783. #define THREAD_PERFMON_INIT_OP_DEPS INIT_OP_DEP (perfmon_bootstrap, true),
  1784. #else
  1785. #define THREAD_PERFMON_INIT_OP_DEPS
  1786. #endif
  1787. INIT_OP_DEFINE (thread_setup,
  1788. INIT_OP_DEP (cpumap_setup, true),
  1789. INIT_OP_DEP (kmem_setup, true),
  1790. INIT_OP_DEP (pmap_setup, true),
  1791. INIT_OP_DEP (sleepq_setup, true),
  1792. INIT_OP_DEP (task_setup, true),
  1793. INIT_OP_DEP (thread_bootstrap, true),
  1794. INIT_OP_DEP (turnstile_setup, true),
  1795. THREAD_STACK_GUARD_INIT_OP_DEPS
  1796. THREAD_PERFMON_INIT_OP_DEPS);
  1797. void __init
  1798. thread_ap_setup (void)
  1799. {
  1800. tcb_set_current (&thread_booters[cpu_id ()].tcb);
  1801. }
  1802. int
  1803. thread_create (struct thread **threadp, const struct thread_attr *attr,
  1804. void (*fn) (void *), void *arg)
  1805. {
  1806. int error;
  1807. if (attr->cpumap)
  1808. {
  1809. error = cpumap_check (attr->cpumap);
  1810. if (error)
  1811. return (error);
  1812. }
  1813. struct thread *thread = kmem_cache_alloc (&thread_cache);
  1814. if (! thread)
  1815. {
  1816. error = ENOMEM;
  1817. goto error_thread;
  1818. }
  1819. void *stack = thread_alloc_stack ();
  1820. if (! stack)
  1821. {
  1822. error = ENOMEM;
  1823. goto error_stack;
  1824. }
  1825. error = thread_init (thread, stack, attr, fn, arg);
  1826. if (error)
  1827. goto error_init;
  1828. /*
  1829. * The new thread address must be written before the thread is started
  1830. * in case it's passed to it.
  1831. */
  1832. if (threadp)
  1833. *threadp = thread;
  1834. thread_wakeup (thread);
  1835. return (0);
  1836. error_init:
  1837. thread_free_stack (stack);
  1838. error_stack:
  1839. kmem_cache_free (&thread_cache, thread);
  1840. error_thread:
  1841. return (error);
  1842. }
  1843. static void
  1844. thread_reap (struct work *work)
  1845. {
  1846. _Auto zombie = structof (work, struct thread_zombie, work);
  1847. thread_join_common (zombie->thread);
  1848. }
  1849. void
  1850. thread_exit (void)
  1851. {
  1852. struct thread_zombie zombie;
  1853. struct thread *thread = thread_self ();
  1854. if (likely (thread->task != task_get_kernel_task ()))
  1855. turnstile_td_exit (&thread->turnstile_td);
  1856. futex_td_exit (thread->futex_td);
  1857. if (thread_test_flag (thread, THREAD_DETACHED))
  1858. {
  1859. zombie.thread = thread;
  1860. work_init (&zombie.work, thread_reap);
  1861. work_schedule (&zombie.work, 0);
  1862. }
  1863. /*
  1864. * Disable preemption before dropping the reference, as this may
  1865. * trigger the active state poll of the join operation. Doing so
  1866. * keeps the duration of that active wait minimum.
  1867. */
  1868. thread_preempt_disable ();
  1869. thread_unref (thread);
  1870. _Auto runq = thread_runq_local ();
  1871. cpu_flags_t flags;
  1872. spinlock_lock_intr_save (&runq->lock, &flags);
  1873. atomic_store_rlx (&thread->state, THREAD_DEAD);
  1874. thread_runq_schedule (runq);
  1875. panic ("thread: dead thread walking");
  1876. }
  1877. void
  1878. thread_join (struct thread *thread)
  1879. {
  1880. assert (!thread_test_flag (thread, THREAD_DETACHED));
  1881. thread_join_common (thread);
  1882. }
  1883. static int
  1884. thread_wakeup_common (struct thread *thread, int error, bool resume)
  1885. {
  1886. if (!thread || thread == thread_self ())
  1887. return (EINVAL);
  1888. /*
  1889. * There is at most one reference on threads that were never dispatched,
  1890. * in which case there is no need to lock anything.
  1891. */
  1892. struct thread_runq *runq;
  1893. cpu_flags_t flags;
  1894. if (!thread->runq)
  1895. {
  1896. assert (thread->state != THREAD_RUNNING);
  1897. thread_clear_wchan (thread);
  1898. thread->state = THREAD_RUNNING;
  1899. }
  1900. else
  1901. {
  1902. runq = thread_lock_runq (thread, &flags);
  1903. if (thread->state == THREAD_RUNNING ||
  1904. (thread->state == THREAD_SUSPENDED && !resume))
  1905. {
  1906. thread_unlock_runq (runq, flags);
  1907. return (EINVAL);
  1908. }
  1909. thread_clear_wchan (thread);
  1910. atomic_store_rlx (&thread->state, THREAD_RUNNING);
  1911. thread_unlock_runq (runq, flags);
  1912. }
  1913. thread_preempt_disable_intr_save (&flags);
  1914. if (!thread->pin_level)
  1915. runq = thread_get_real_sched_ops(thread)->select_runq (thread);
  1916. else
  1917. {
  1918. /*
  1919. * This access doesn't need to be atomic, as the current thread is
  1920. * the only one which may update the member.
  1921. */
  1922. runq = thread->runq;
  1923. spinlock_lock (&runq->lock);
  1924. }
  1925. thread->wakeup_error = error;
  1926. thread_runq_wakeup (runq, thread);
  1927. spinlock_unlock (&runq->lock);
  1928. thread_preempt_enable_intr_restore (flags);
  1929. return (0);
  1930. }
  1931. int
  1932. thread_wakeup (struct thread *thread)
  1933. {
  1934. return (thread_wakeup_common (thread, 0, false));
  1935. }
  1936. struct thread_timeout_waiter
  1937. {
  1938. struct thread *thread;
  1939. struct timer timer;
  1940. };
  1941. static void
  1942. thread_timeout (struct timer *timer)
  1943. {
  1944. _Auto waiter = structof (timer, struct thread_timeout_waiter, timer);
  1945. thread_wakeup_common (waiter->thread, ETIMEDOUT, false);
  1946. }
  1947. static int
  1948. thread_sleep_common (struct spinlock *interlock, const void *wchan_addr,
  1949. const char *wchan_desc, bool timed, uint64_t ticks)
  1950. {
  1951. struct thread *thread = thread_self ();
  1952. struct thread_timeout_waiter waiter;
  1953. if (timed)
  1954. {
  1955. waiter.thread = thread;
  1956. timer_init (&waiter.timer, thread_timeout, TIMER_INTR);
  1957. timer_schedule (&waiter.timer, ticks);
  1958. }
  1959. _Auto runq = thread_runq_local ();
  1960. cpu_flags_t flags;
  1961. spinlock_lock_intr_save (&runq->lock, &flags);
  1962. if (interlock)
  1963. {
  1964. thread_preempt_disable ();
  1965. spinlock_unlock (interlock);
  1966. }
  1967. thread_set_wchan (thread, wchan_addr, wchan_desc);
  1968. atomic_store_rlx (&thread->state, THREAD_SLEEPING);
  1969. runq = thread_runq_schedule (runq);
  1970. assert (thread->state == THREAD_RUNNING);
  1971. spinlock_unlock_intr_restore (&runq->lock, flags);
  1972. if (timed)
  1973. timer_cancel (&waiter.timer);
  1974. if (interlock)
  1975. {
  1976. spinlock_lock (interlock);
  1977. thread_preempt_enable_no_resched ();
  1978. }
  1979. return (thread->wakeup_error);
  1980. }
  1981. void
  1982. thread_sleep (struct spinlock *lock, const void *wchan_addr,
  1983. const char *wchan_desc)
  1984. {
  1985. int error = thread_sleep_common (lock, wchan_addr, wchan_desc, false, 0);
  1986. assert (! error);
  1987. }
  1988. int
  1989. thread_timedsleep (struct spinlock *lock, const void *wchan_addr,
  1990. const char *wchan_desc, uint64_t ticks)
  1991. {
  1992. return (thread_sleep_common (lock, wchan_addr, wchan_desc, true, ticks));
  1993. }
  1994. int
  1995. thread_suspend (struct thread *thread)
  1996. {
  1997. if (! thread)
  1998. return (EINVAL);
  1999. struct thread_runq_guard g = thread_runq_guard_make (thread, true);
  2000. if (thread == g.runq->idler ||
  2001. thread == g.runq->balancer ||
  2002. thread->state == THREAD_DEAD)
  2003. return (EINVAL);
  2004. else if (thread->state == THREAD_SUSPENDED || thread->suspend)
  2005. return (0);
  2006. else if (thread->state == THREAD_SLEEPING)
  2007. {
  2008. thread->state = THREAD_SUSPENDED;
  2009. return (0);
  2010. }
  2011. assert (thread->state == THREAD_RUNNING);
  2012. if (thread != g.runq->current)
  2013. {
  2014. thread->state = THREAD_SUSPENDED;
  2015. thread_runq_remove (g.runq, thread);
  2016. }
  2017. else
  2018. {
  2019. thread->suspend = true;
  2020. if (g.runq == thread_runq_local ())
  2021. g.runq = thread_runq_schedule (g.runq);
  2022. else
  2023. {
  2024. thread_set_flag (thread, THREAD_YIELD);
  2025. cpu_send_thread_schedule (thread_runq_cpu (g.runq));
  2026. }
  2027. }
  2028. return (0);
  2029. }
  2030. int
  2031. thread_resume (struct thread *thread)
  2032. {
  2033. return (thread_wakeup_common (thread, 0, true));
  2034. }
  2035. void
  2036. thread_delay (uint64_t ticks, bool absolute)
  2037. {
  2038. thread_preempt_disable ();
  2039. if (! absolute)
  2040. // Add a tick to avoid quantization errors.
  2041. ticks += clock_get_time () + 1;
  2042. thread_timedsleep (NULL, thread_self (), "delay", ticks);
  2043. thread_preempt_enable ();
  2044. }
  2045. static void __init
  2046. thread_boot_barrier_wait (void)
  2047. {
  2048. assert (!cpu_intr_enabled ());
  2049. atomic_add_rlx (&thread_nr_boot_cpus, 1);
  2050. while (atomic_load_seq (&thread_nr_boot_cpus) != cpu_count ())
  2051. atomic_spin_nop ();
  2052. }
  2053. void __init
  2054. thread_run_scheduler (void)
  2055. {
  2056. assert (!cpu_intr_enabled ());
  2057. thread_boot_barrier_wait ();
  2058. _Auto runq = thread_runq_local ();
  2059. struct thread *thread = thread_self ();
  2060. assert (thread == runq->current);
  2061. assert (thread->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL - 1);
  2062. spinlock_lock (&runq->lock);
  2063. thread = thread_runq_get_next (thread_runq_local ());
  2064. spinlock_transfer_owner (&runq->lock, thread);
  2065. tcb_load (&thread->tcb);
  2066. }
  2067. void
  2068. thread_yield (void)
  2069. {
  2070. struct thread *thread = thread_self ();
  2071. if (!thread_preempt_enabled ())
  2072. return;
  2073. do
  2074. {
  2075. thread_preempt_disable ();
  2076. _Auto runq = thread_runq_local ();
  2077. cpu_flags_t flags;
  2078. spinlock_lock_intr_save (&runq->lock, &flags);
  2079. runq = thread_runq_schedule (runq);
  2080. spinlock_unlock_intr_restore (&runq->lock, flags);
  2081. thread_preempt_enable_no_resched ();
  2082. }
  2083. while (thread_test_flag (thread, THREAD_YIELD));
  2084. }
  2085. void
  2086. thread_schedule (void)
  2087. {
  2088. if (unlikely (thread_test_flag (thread_self (), THREAD_YIELD)))
  2089. thread_yield ();
  2090. }
  2091. void
  2092. thread_schedule_intr (void)
  2093. {
  2094. assert (thread_check_intr_context ());
  2095. syscnt_inc (&thread_runq_local()->sc_schedule_intrs);
  2096. }
  2097. void
  2098. thread_report_periodic_event (void)
  2099. {
  2100. assert (thread_check_intr_context ());
  2101. _Auto runq = thread_runq_local ();
  2102. struct thread *thread = thread_self ();
  2103. spinlock_lock (&runq->lock);
  2104. if (!runq->nr_threads)
  2105. thread_balance_idle_tick (runq);
  2106. const _Auto ops = thread_get_real_sched_ops (thread);
  2107. if (ops->tick)
  2108. ops->tick (runq, thread);
  2109. spinlock_unlock (&runq->lock);
  2110. }
  2111. char
  2112. thread_state_to_chr (uint32_t state)
  2113. {
  2114. switch (state)
  2115. {
  2116. case THREAD_RUNNING:
  2117. return ('R');
  2118. case THREAD_SLEEPING:
  2119. return ('S');
  2120. case THREAD_DEAD:
  2121. return ('Z');
  2122. case THREAD_SUSPENDED:
  2123. return ('T');
  2124. default:
  2125. panic ("thread: unknown state");
  2126. }
  2127. }
  2128. const char*
  2129. thread_sched_class_to_str (uint8_t sched_class)
  2130. {
  2131. switch (sched_class)
  2132. {
  2133. case THREAD_SCHED_CLASS_RT:
  2134. return ("rt");
  2135. case THREAD_SCHED_CLASS_FS:
  2136. return ("fs");
  2137. case THREAD_SCHED_CLASS_IDLE:
  2138. return ("idle");
  2139. default:
  2140. panic ("thread: unknown scheduling class");
  2141. }
  2142. }
  2143. static void
  2144. thread_setsched_impl (struct thread *thread, uint8_t policy,
  2145. uint16_t priority, struct thread_runq *runq)
  2146. {
  2147. if (thread_user_sched_policy (thread) == policy &&
  2148. thread_user_priority (thread) == priority)
  2149. return;
  2150. bool current, requeue = thread->in_runq;
  2151. if (! requeue)
  2152. current = false;
  2153. else
  2154. {
  2155. if (thread != runq->current)
  2156. current = false;
  2157. else
  2158. {
  2159. thread_runq_put_prev (runq, thread);
  2160. current = true;
  2161. }
  2162. thread_runq_remove (runq, thread);
  2163. }
  2164. bool update = true;
  2165. if (thread_user_sched_policy (thread) == policy)
  2166. thread_update_user_priority (thread, priority);
  2167. else
  2168. {
  2169. thread_set_user_sched_policy (thread, policy);
  2170. thread_set_user_sched_class (thread, thread_policy_to_class (policy));
  2171. thread_set_user_priority (thread, priority);
  2172. update = false;
  2173. }
  2174. if (thread->boosted)
  2175. {
  2176. if (thread_user_global_priority (thread) >=
  2177. thread_real_global_priority (thread))
  2178. thread_reset_real_priority (thread);
  2179. }
  2180. else if (update)
  2181. thread_update_real_priority (thread, priority);
  2182. else
  2183. {
  2184. thread_set_real_sched_policy (thread, policy);
  2185. thread_set_real_sched_class (thread, thread_policy_to_class (policy));
  2186. thread_set_real_priority (thread, priority);
  2187. }
  2188. if (requeue)
  2189. {
  2190. thread_runq_add (runq, thread);
  2191. if (current)
  2192. thread_runq_set_next (runq, thread);
  2193. }
  2194. }
  2195. void
  2196. thread_setscheduler (struct thread *thread, uint8_t policy, uint16_t prio)
  2197. {
  2198. _Auto td = thread_turnstile_td (thread);
  2199. turnstile_td_lock (td);
  2200. cpu_flags_t flags;
  2201. _Auto runq = thread_lock_runq (thread, &flags);
  2202. thread_setsched_impl (thread, policy, prio, runq);
  2203. turnstile_td_unlock (td);
  2204. thread_unlock_runq (runq, flags);
  2205. turnstile_td_propagate_priority (td);
  2206. }
  2207. static void
  2208. thread_pi_setsched_impl (struct thread_runq *runq, struct thread *thread,
  2209. uint8_t policy, uint16_t prio)
  2210. {
  2211. assert (turnstile_td_locked (thread_turnstile_td (thread)));
  2212. if (thread_real_sched_policy (thread) == policy &&
  2213. thread_real_priority (thread) == prio)
  2214. return;
  2215. const _Auto ops = thread_get_sched_ops (thread_policy_to_class (policy));
  2216. uint32_t global_prio = ops->get_global_priority (prio);
  2217. bool current, requeue = thread->in_runq;
  2218. if (! requeue)
  2219. current = false;
  2220. else
  2221. {
  2222. if (thread != runq->current)
  2223. current = false;
  2224. else
  2225. {
  2226. thread_runq_put_prev (runq, thread);
  2227. current = true;
  2228. }
  2229. thread_runq_remove (runq, thread);
  2230. }
  2231. if (global_prio <= thread_user_global_priority (thread))
  2232. thread_reset_real_priority (thread);
  2233. else
  2234. {
  2235. if (thread_real_sched_policy (thread) == policy)
  2236. thread_update_real_priority (thread, prio);
  2237. else
  2238. {
  2239. thread_set_real_sched_policy (thread, policy);
  2240. thread_set_real_sched_class (thread,
  2241. thread_policy_to_class (policy));
  2242. thread_set_real_priority (thread, prio);
  2243. }
  2244. thread->boosted = true;
  2245. syscnt_inc (&runq->sc_boosts);
  2246. }
  2247. if (requeue)
  2248. {
  2249. thread_runq_add (runq, thread);
  2250. if (current)
  2251. thread_runq_set_next (runq, thread);
  2252. }
  2253. }
  2254. void
  2255. thread_pi_setscheduler (struct thread *thread, uint8_t policy, uint16_t prio)
  2256. {
  2257. struct thread_runq_guard g = thread_runq_guard_make (thread, false);
  2258. thread_pi_setsched_impl (g.runq, thread, policy, prio);
  2259. }
  2260. void
  2261. thread_propagate_priority (void)
  2262. {
  2263. /*
  2264. * Although it's possible to propagate priority with preemption
  2265. * disabled, the operation can be too expensive to allow it.
  2266. */
  2267. if (!thread_preempt_enabled ())
  2268. {
  2269. thread_set_priority_propagation_needed ();
  2270. return;
  2271. }
  2272. struct thread *thread = thread_self ();
  2273. // Clear before propagation to avoid infinite recursion.
  2274. thread->propagate_priority = false;
  2275. turnstile_td_propagate_priority (thread_turnstile_td (thread));
  2276. }
  2277. uint32_t
  2278. thread_cpu (const struct thread *thread)
  2279. {
  2280. const _Auto runq = atomic_load_rlx (&thread->runq);
  2281. return (runq->cpu);
  2282. }
  2283. uint32_t
  2284. thread_state (const struct thread *thread)
  2285. {
  2286. return (atomic_load_rlx (&thread->state));
  2287. }
  2288. bool
  2289. thread_is_running (const struct thread *thread)
  2290. {
  2291. const _Auto runq = atomic_load_rlx (&thread->runq);
  2292. return (runq && atomic_load_rlx (&runq->current) == thread);
  2293. }
  2294. int
  2295. thread_get_affinity (const struct thread *thr, struct cpumap *cpumap)
  2296. {
  2297. if (! thr)
  2298. return (EINVAL);
  2299. struct thread_runq_guard g =
  2300. thread_runq_guard_make ((struct thread *)thr, true);
  2301. cpumap_copy (cpumap, &thr->cpumap);
  2302. return (0);
  2303. }
  2304. int
  2305. thread_set_affinity (struct thread *thread, const struct cpumap *cpumap)
  2306. {
  2307. if (! thread)
  2308. return (EINVAL);
  2309. struct thread_runq_guard g = thread_runq_guard_make (thread, true);
  2310. if (thread == g.runq->idler ||
  2311. thread == g.runq->balancer ||
  2312. thread->state == THREAD_DEAD)
  2313. return (EINVAL);
  2314. else if (cpumap_intersects (&thread->cpumap, cpumap))
  2315. { // The desired CPU map intersects the current one.
  2316. cpumap_copy (&thread->cpumap, cpumap);
  2317. return (0);
  2318. }
  2319. else if (thread->pin_level != 0)
  2320. // The thread is pinned, and cannot be migrated to a different CPU.
  2321. return (EAGAIN);
  2322. // At this point, we know the thread must be migrated.
  2323. cpumap_copy (&thread->cpumap, cpumap);
  2324. if (thread == g.runq->current)
  2325. {
  2326. if (g.runq == thread_runq_local ())
  2327. g.runq = thread_runq_schedule (g.runq);
  2328. else
  2329. {
  2330. thread_set_flag (thread, THREAD_YIELD);
  2331. cpu_send_thread_schedule (thread_runq_cpu (g.runq));
  2332. }
  2333. }
  2334. return (0);
  2335. }
  2336. static ssize_t
  2337. thread_name_impl (struct thread *thread, char *name, bool set)
  2338. {
  2339. SPINLOCK_GUARD (&thread->task->lock);
  2340. if (set)
  2341. memcpy (thread->name, name, sizeof (thread->name));
  2342. else
  2343. memcpy (name, thread->name, sizeof (thread->name));
  2344. return (0);
  2345. }
  2346. static ssize_t
  2347. thread_ipc_affinity_impl (struct thread *thread, void *map,
  2348. uint32_t size, bool set)
  2349. {
  2350. if (! size)
  2351. return (-EINVAL);
  2352. struct cpumap *cpumap;
  2353. if (cpumap_create (&cpumap) != 0)
  2354. return (-ENOMEM);
  2355. cpumap_zero (cpumap);
  2356. size = MIN (size, sizeof (cpumap->cpus));
  2357. int error = user_copy_from (cpumap->cpus, map, size);
  2358. if (error)
  2359. ;
  2360. else if (set)
  2361. error = thread_set_affinity (thread, cpumap);
  2362. else if ((error = thread_get_affinity (thread, cpumap)) == 0)
  2363. error = user_copy_to (map, cpumap->cpus, size);
  2364. cpumap_destroy (cpumap);
  2365. return (-error);
  2366. }
  2367. #define THREAD_IPC_NEEDS_COPY \
  2368. ((1u << THREAD_IPC_GET_NAME) | (1u << THREAD_IPC_GET_AFFINITY) | \
  2369. (1u << THREAD_IPC_GET_ID))
  2370. ssize_t
  2371. thread_handle_msg (struct thread *thr, struct cap_iters *src,
  2372. struct cap_iters *dst, struct ipc_msg_data *data)
  2373. {
  2374. struct thread_ipc_msg tmsg;
  2375. struct ipc_iov_iter k_it;
  2376. ipc_iov_iter_init_buf (&k_it, &tmsg, sizeof (tmsg));
  2377. ssize_t rv = user_copyv_from (&k_it, &src->iov);
  2378. if (rv < 0)
  2379. return (rv);
  2380. switch (tmsg.op)
  2381. {
  2382. case THREAD_IPC_GET_NAME:
  2383. case THREAD_IPC_SET_NAME:
  2384. rv = thread_name_impl (thr, tmsg.name,
  2385. tmsg.op == THREAD_IPC_SET_NAME);
  2386. break;
  2387. case THREAD_IPC_GET_AFFINITY:
  2388. case THREAD_IPC_SET_AFFINITY:
  2389. rv = thread_ipc_affinity_impl (thr, tmsg.cpumap.map, tmsg.cpumap.size,
  2390. tmsg.op == THREAD_IPC_SET_AFFINITY);
  2391. break;
  2392. case THREAD_IPC_GET_ID:
  2393. tmsg.id = thread_id (thr);
  2394. break;
  2395. default:
  2396. return (-EINVAL);
  2397. }
  2398. if (rv == 0 && ((1u << tmsg.op) & THREAD_IPC_NEEDS_COPY))
  2399. {
  2400. ipc_iov_iter_init_buf (&k_it, &tmsg, sizeof (tmsg));
  2401. rv = user_copyv_to (&dst->iov, &k_it);
  2402. }
  2403. (void)data;
  2404. return (rv < 0 ? rv : 0);
  2405. }