// SPDX-License-Identifier: GPL-2.0
/*
 * Per Entity Load Tracking
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 *  Move PELT related code from fair.c into this pelt.c file
 *  Author: Vincent Guittot <vincent.guittot@linaro.org>
 */

#include <linux/sched.h>
#include "sched.h"
#include "sched-pelt.h"
#include "pelt.h"

/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static u64 decay_load(u64 val, u64 n)
{
        unsigned int local_n;

        if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;

        /* after bounds checking we can collapse to 32-bit */
        local_n = n;

        /*
         * As y^PERIOD = 1/2, we can combine
         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
         * With a look-up table which covers y^n (n<PERIOD)
         *
         * To achieve constant time decay_load.
         */
        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
                val >>= local_n / LOAD_AVG_PERIOD;
                local_n %= LOAD_AVG_PERIOD;
        }

        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
        return val;
}

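/*
 * Editor's illustration (not part of the original source): with
 * LOAD_AVG_PERIOD == 32 and the runnable_avg_yN_inv[] table from
 * sched-pelt.h, decay_load() behaves roughly as:
 *
 *   decay_load(1024,   32) ~= 512   (y^32 == 1/2)
 *   decay_load(1024,   64) ~= 256   (y^64 == 1/4)
 *   decay_load(1024, 2017) ==   0   (n > LOAD_AVG_PERIOD * 63, treated
 *                                    as fully decayed)
 */
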
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
        u32 c1, c2, c3 = d3; /* y^0 == 1 */

        /*
         * c1 = d1 y^p
         */
        c1 = decay_load((u64)d1, periods);

        /*
         *            p-1
         * c2 = 1024 \Sum y^n
         *            n=1
         *
         *              inf        inf
         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
         *              n=0        n=p
         */
        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

        return c1 + c2 + c3;
}

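/*
 * Editor's note on the c2 step above: 1024 * \Sum_{n>=0} y^n converges to
 * LOAD_AVG_MAX, so the finite sum 1024 * \Sum_{n=1..p-1} y^n is obtained by
 * taking the full series, removing the tail that starts at n=p (which is
 * just the full series decayed p times, i.e. decay_load(LOAD_AVG_MAX, p))
 * and removing the n=0 term (1024).
 */
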
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

/*
 * Accumulate the three separate parts of the sum; d1 the remainder
 * of the last (incomplete) period, d2 the span of full periods and d3
 * the remainder of the (incomplete) current period.
 *
 *           d1          d2           d3
 *           ^           ^            ^
 *           |           |            |
 *         |<->|<----------------->|<--->|
 * ... |---x---|------| ... |------|-----x (now)
 *
 *                           p-1
 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 *                           n=1
 *
 *    = u y^p +                                 (Step 1)
 *
 *                     p-1
 *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
 *                     n=1
 */
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
               unsigned long load, unsigned long runnable, int running)
{
        unsigned long scale_freq, scale_cpu;
        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
        u64 periods;

        scale_freq = arch_scale_freq_capacity(cpu);
        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);

        delta += sa->period_contrib;
        periods = delta / 1024; /* A period is 1024us (~1ms) */

        /*
         * Step 1: decay old *_sum if we crossed period boundaries.
         */
        if (periods) {
                sa->load_sum = decay_load(sa->load_sum, periods);
                sa->runnable_load_sum =
                        decay_load(sa->runnable_load_sum, periods);
                sa->util_sum = decay_load((u64)(sa->util_sum), periods);

                /*
                 * Step 2
                 */
                delta %= 1024;
                contrib = __accumulate_pelt_segments(periods,
                                1024 - sa->period_contrib, delta);
        }
        sa->period_contrib = delta;

        contrib = cap_scale(contrib, scale_freq);
        if (load)
                sa->load_sum += load * contrib;
        if (runnable)
                sa->runnable_load_sum += runnable * contrib;
        if (running)
                sa->util_sum += contrib * scale_cpu;

        return periods;
}

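/*
 * Editor's illustration of the split above (assumed numbers, not from the
 * original source): with sa->period_contrib == 200 and delta == 3000 us,
 * delta + period_contrib == 3200, so periods == 3 and the new
 * period_contrib is 128.  The contribution is then
 * __accumulate_pelt_segments(3, 1024 - 200, 128): d1 == 824 us left over
 * from the old incomplete period, two full 1024 us periods folded into c2,
 * and d3 == 128 us of the current period.
 */
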
/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */

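/*
 * Editor's illustration of the roll-over above (assumed values): if the
 * previous sum is S = 1000 and the entity was runnable for 512 us of the
 * new period (u_0` == 512), the update is simply S' = 512 + y*1000 ~= 512 +
 * 978, using y ~= 0.97857 (the 32nd root of 0.5).  No per-period history
 * needs to be stored.
 */
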
static __always_inline int
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
                   unsigned long load, unsigned long runnable, int running)
{
        u64 delta;

        delta = now - sa->last_update_time;
        /*
         * This should only happen when time goes backwards, which it
         * unfortunately does during sched clock init when we swap over to TSC.
         */
        if ((s64)delta < 0) {
                sa->last_update_time = now;
                return 0;
        }

        /*
         * Use 1024ns as the unit of measurement since it's a reasonable
         * approximation of 1us and fast to compute.
         */
        delta >>= 10;
        if (!delta)
                return 0;

        sa->last_update_time += delta << 10;

        /*
         * running is a subset of runnable (weight) so running can't be set if
         * runnable is clear. But there are some corner cases where the current
         * se has already been dequeued but cfs_rq->curr still points to it.
         * This means that weight will be 0 while running is still set, both
         * for a sched_entity and for a cfs_rq if the latter becomes idle.
         * As an example, this happens during idle_balance(), which calls
         * update_blocked_averages().
         */
        if (!load)
                runnable = running = 0;

        /*
         * Now we know we crossed measurement unit boundaries. The *_avg
         * accrues by two steps:
         *
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
        if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
                return 0;

        return 1;
}

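/*
 * Editor's illustration of the time handling above (assumed numbers): if
 * 2100 ns have elapsed since last_update_time, delta >>= 10 yields 2 units
 * of ~1 us each; last_update_time then advances by 2048 ns and the
 * remaining 52 ns are carried over to the next update.  A delta below
 * 1024 ns is ignored entirely until more time has accumulated.
 */
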
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
{
        u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;

        /*
         * Step 2: update *_avg.
         */
        sa->load_avg = div_u64(load * sa->load_sum, divider);
        sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
        WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}

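/*
 * Editor's note on the divider above: LOAD_AVG_MAX is the limit of the
 * series 1024 * (1 + y + y^2 + ...), i.e. the largest value a *_sum can
 * reach when the entity has been running continuously.  Since the current
 * period has only contributed period_contrib out of its full 1024 us so
 * far, the maximum attainable sum right now is
 * LOAD_AVG_MAX - 1024 + period_contrib; dividing by it keeps *_avg within
 * its nominal range.
 */
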
/*
 * sched_entity:
 *
 *   task:
 *     se_runnable() == se_weight()
 *
 *   group: [ see update_cfs_group() ]
 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
 *
 *   load_sum := runnable_sum
 *   load_avg = se_weight(se) * runnable_avg
 *
 *   runnable_load_sum := runnable_sum
 *   runnable_load_avg = se_runnable(se) * runnable_avg
 *
 * XXX collapse load_sum and runnable_load_sum
 *
 * cfs_rq:
 *
 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 *   load_avg = \Sum se->avg.load_avg
 *
 *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
 *   runnable_load_avg = \Sum se->avg.runnable_load_avg
 */
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
        if (entity_is_task(se))
                se->runnable_weight = se->load.weight;

        if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
                ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
                return 1;
        }

        return 0;
}

int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        if (entity_is_task(se))
                se->runnable_weight = se->load.weight;

        if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
                               cfs_rq->curr == se)) {
                ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
                cfs_se_util_change(&se->avg);
                return 1;
        }

        return 0;
}

int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
        if (___update_load_sum(now, cpu, &cfs_rq->avg,
                               scale_load_down(cfs_rq->load.weight),
                               scale_load_down(cfs_rq->runnable_weight),
                               cfs_rq->curr != NULL)) {
                ___update_load_avg(&cfs_rq->avg, 1, 1);
                return 1;
        }

        return 0;
}

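/*
 * Editor's summary of the three helpers above: a blocked se passes
 * load == runnable == running == 0, so its sums only decay; a queued se
 * contributes whenever it is on the runqueue (!!se->on_rq) and counts as
 * running only while it is cfs_rq->curr; a cfs_rq contributes its aggregate
 * weights and counts as running whenever it has a current task.
 */
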
/*
 * rt_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_load_sum = load_sum
 *
 *   load_avg and runnable_load_avg are not supported and meaningless.
 *
 */
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
                               running,
                               running,
                               running)) {
                ___update_load_avg(&rq->avg_rt, 1, 1);
                return 1;
        }

        return 0;
}

/*
 * dl_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_load_sum = load_sum
 *
 */
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
                               running,
                               running,
                               running)) {
                ___update_load_avg(&rq->avg_dl, 1, 1);
                return 1;
        }

        return 0;
}

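/*
 * Editor's note: update_rt_rq_load_avg() and update_dl_rq_load_avg() feed
 * load, runnable and running with the same "running" flag, so only the
 * utilization side of rq->avg_rt and rq->avg_dl carries information;
 * consistent with the comments above, load_avg and runnable_load_avg are
 * not meaningful for these classes.
 */
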
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
 * irq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_load_sum = load_sum
 *
 */
int update_irq_load_avg(struct rq *rq, u64 running)
{
        int ret = 0;
        /*
         * We know the time that has been used by interrupts since the last
         * update, but we don't know when. Let's be pessimistic and assume
         * that the interrupt happened just before the update. This is not
         * far from reality because the interrupt will most probably wake up
         * a task and trigger an update of the rq clock, during which the
         * metric is updated.
         * We start by decaying over the normal-context time and then add
         * the interrupt-context time.
         * We can safely subtract running from rq->clock because
         * rq->clock += delta with delta >= running
         */
        ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
                                 0,
                                 0,
                                 0);
        ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
                                  1,
                                  1,
                                  1);

        if (ret)
                ___update_load_avg(&rq->avg_irq, 1, 1);

        return ret;
}

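/*
 * Editor's illustration of the two-step update above (assumed numbers): if
 * rq->clock is at 10,000,000 ns and 50,000 ns were just spent in interrupt
 * context, the first call advances avg_irq to time 9,950,000 with a zero
 * contribution (pure decay of the old signal), and the second call accounts
 * the final 50,000 ns as fully loaded, runnable and running.
 */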
#endif