dm-cache-policy-smq.c

  1. /*
  2. * Copyright (C) 2015 Red Hat. All rights reserved.
  3. *
  4. * This file is released under the GPL.
  5. */
  6. #include "dm-cache-policy.h"
  7. #include "dm-cache-policy-internal.h"
  8. #include "dm.h"
  9. #include <linux/hash.h>
  10. #include <linux/jiffies.h>
  11. #include <linux/module.h>
  12. #include <linux/mutex.h>
  13. #include <linux/vmalloc.h>
  14. #include <linux/math64.h>
  15. #define DM_MSG_PREFIX "cache-policy-smq"
  16. /*----------------------------------------------------------------*/
  17. /*
  18. * Safe division functions that return zero on divide by zero.
  19. */
  20. static unsigned safe_div(unsigned n, unsigned d)
  21. {
  22. return d ? n / d : 0u;
  23. }
  24. static unsigned safe_mod(unsigned n, unsigned d)
  25. {
  26. return d ? n % d : 0u;
  27. }
  28. /*----------------------------------------------------------------*/
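  /*
   * Entries pack their hash and list links as 28-bit indexes into a
   * shared entry_space (see below) rather than as pointers, keeping the
   * per-entry footprint small.
   */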
  29. struct entry {
  30. unsigned hash_next:28;
  31. unsigned prev:28;
  32. unsigned next:28;
  33. unsigned level:7;
  34. bool dirty:1;
  35. bool allocated:1;
  36. bool sentinel:1;
  37. dm_oblock_t oblock;
  38. };
  39. /*----------------------------------------------------------------*/
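  /* The all-ones value of a 28-bit index field, used as the NULL/end-of-list marker. */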
  40. #define INDEXER_NULL ((1u << 28u) - 1u)
  41. /*
  42. * An entry_space manages a set of entries that we use for the queues.
  43. * The clean and dirty queues share entries, so this object is separate
  44. * from the queue itself.
  45. */
  46. struct entry_space {
  47. struct entry *begin;
  48. struct entry *end;
  49. };
  50. static int space_init(struct entry_space *es, unsigned nr_entries)
  51. {
  52. if (!nr_entries) {
  53. es->begin = es->end = NULL;
  54. return 0;
  55. }
  56. es->begin = vzalloc(sizeof(struct entry) * nr_entries);
  57. if (!es->begin)
  58. return -ENOMEM;
  59. es->end = es->begin + nr_entries;
  60. return 0;
  61. }
  62. static void space_exit(struct entry_space *es)
  63. {
  64. vfree(es->begin);
  65. }
  66. static struct entry *__get_entry(struct entry_space *es, unsigned block)
  67. {
  68. struct entry *e;
  69. e = es->begin + block;
  70. BUG_ON(e >= es->end);
  71. return e;
  72. }
  73. static unsigned to_index(struct entry_space *es, struct entry *e)
  74. {
  75. BUG_ON(e < es->begin || e >= es->end);
  76. return e - es->begin;
  77. }
  78. static struct entry *to_entry(struct entry_space *es, unsigned block)
  79. {
  80. if (block == INDEXER_NULL)
  81. return NULL;
  82. return __get_entry(es, block);
  83. }
  84. /*----------------------------------------------------------------*/
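  /*
   * An intrusive list built from indexes: head and tail are indexes into
   * the entry_space, and each entry's prev/next fields form the links.
   */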
  85. struct ilist {
  86. unsigned nr_elts; /* excluding sentinel entries */
  87. unsigned head, tail;
  88. };
  89. static void l_init(struct ilist *l)
  90. {
  91. l->nr_elts = 0;
  92. l->head = l->tail = INDEXER_NULL;
  93. }
  94. static struct entry *l_head(struct entry_space *es, struct ilist *l)
  95. {
  96. return to_entry(es, l->head);
  97. }
  98. static struct entry *l_tail(struct entry_space *es, struct ilist *l)
  99. {
  100. return to_entry(es, l->tail);
  101. }
  102. static struct entry *l_next(struct entry_space *es, struct entry *e)
  103. {
  104. return to_entry(es, e->next);
  105. }
  106. static struct entry *l_prev(struct entry_space *es, struct entry *e)
  107. {
  108. return to_entry(es, e->prev);
  109. }
  110. static bool l_empty(struct ilist *l)
  111. {
  112. return l->head == INDEXER_NULL;
  113. }
  114. static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
  115. {
  116. struct entry *head = l_head(es, l);
  117. e->next = l->head;
  118. e->prev = INDEXER_NULL;
  119. if (head)
  120. head->prev = l->head = to_index(es, e);
  121. else
  122. l->head = l->tail = to_index(es, e);
  123. if (!e->sentinel)
  124. l->nr_elts++;
  125. }
  126. static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
  127. {
  128. struct entry *tail = l_tail(es, l);
  129. e->next = INDEXER_NULL;
  130. e->prev = l->tail;
  131. if (tail)
  132. tail->next = l->tail = to_index(es, e);
  133. else
  134. l->head = l->tail = to_index(es, e);
  135. if (!e->sentinel)
  136. l->nr_elts++;
  137. }
  138. static void l_add_before(struct entry_space *es, struct ilist *l,
  139. struct entry *old, struct entry *e)
  140. {
  141. struct entry *prev = l_prev(es, old);
  142. if (!prev)
  143. l_add_head(es, l, e);
  144. else {
  145. e->prev = old->prev;
  146. e->next = to_index(es, old);
  147. prev->next = old->prev = to_index(es, e);
  148. if (!e->sentinel)
  149. l->nr_elts++;
  150. }
  151. }
  152. static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
  153. {
  154. struct entry *prev = l_prev(es, e);
  155. struct entry *next = l_next(es, e);
  156. if (prev)
  157. prev->next = e->next;
  158. else
  159. l->head = e->next;
  160. if (next)
  161. next->prev = e->prev;
  162. else
  163. l->tail = e->prev;
  164. if (!e->sentinel)
  165. l->nr_elts--;
  166. }
  167. static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
  168. {
  169. struct entry *e;
  170. for (e = l_tail(es, l); e; e = l_prev(es, e))
  171. if (!e->sentinel) {
  172. l_del(es, l, e);
  173. return e;
  174. }
  175. return NULL;
  176. }
  177. /*----------------------------------------------------------------*/
  178. /*
  179. * The stochastic-multi-queue is a set of lru lists stacked into levels.
  180. * Entries are moved up levels when they are used, which loosely orders the
  181. * most accessed entries in the top levels and least in the bottom. This
  182. * structure is *much* better than a single lru list.
  183. */
  184. #define MAX_LEVELS 64u
  185. struct queue {
  186. struct entry_space *es;
  187. unsigned nr_elts;
  188. unsigned nr_levels;
  189. struct ilist qs[MAX_LEVELS];
  190. /*
  191. * We maintain a count of the number of entries we would like in each
  192. * level.
  193. */
  194. unsigned last_target_nr_elts;
  195. unsigned nr_top_levels;
  196. unsigned nr_in_top_levels;
  197. unsigned target_count[MAX_LEVELS];
  198. };
  199. static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
  200. {
  201. unsigned i;
  202. q->es = es;
  203. q->nr_elts = 0;
  204. q->nr_levels = nr_levels;
  205. for (i = 0; i < q->nr_levels; i++) {
  206. l_init(q->qs + i);
  207. q->target_count[i] = 0u;
  208. }
  209. q->last_target_nr_elts = 0u;
  210. q->nr_top_levels = 0u;
  211. q->nr_in_top_levels = 0u;
  212. }
  213. static unsigned q_size(struct queue *q)
  214. {
  215. return q->nr_elts;
  216. }
  217. /*
  218. * Insert an entry to the back of the given level.
  219. */
  220. static void q_push(struct queue *q, struct entry *e)
  221. {
  222. if (!e->sentinel)
  223. q->nr_elts++;
  224. l_add_tail(q->es, q->qs + e->level, e);
  225. }
  226. static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
  227. {
  228. if (!e->sentinel)
  229. q->nr_elts++;
  230. l_add_before(q->es, q->qs + e->level, old, e);
  231. }
  232. static void q_del(struct queue *q, struct entry *e)
  233. {
  234. l_del(q->es, q->qs + e->level, e);
  235. if (!e->sentinel)
  236. q->nr_elts--;
  237. }
  238. /*
  239. * Return the oldest entry of the lowest populated level.
  240. */
  241. static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
  242. {
  243. unsigned level;
  244. struct entry *e;
  245. max_level = min(max_level, q->nr_levels);
  246. for (level = 0; level < max_level; level++)
  247. for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
  248. if (e->sentinel) {
  249. if (can_cross_sentinel)
  250. continue;
  251. else
  252. break;
  253. }
  254. return e;
  255. }
  256. return NULL;
  257. }
  258. static struct entry *q_pop(struct queue *q)
  259. {
  260. struct entry *e = q_peek(q, q->nr_levels, true);
  261. if (e)
  262. q_del(q, e);
  263. return e;
  264. }
  265. /*
  266. * Pops an entry from a level that is not past a sentinel.
  267. */
  268. static struct entry *q_pop_old(struct queue *q, unsigned max_level)
  269. {
  270. struct entry *e = q_peek(q, max_level, false);
  271. if (e)
  272. q_del(q, e);
  273. return e;
  274. }
  275. /*
  276. * This function assumes there is a non-sentinel entry to pop. It's only
  277. * used by redistribute, so we know this is true. It also doesn't adjust
  278. * the q->nr_elts count.
  279. */
  280. static struct entry *__redist_pop_from(struct queue *q, unsigned level)
  281. {
  282. struct entry *e;
  283. for (; level < q->nr_levels; level++)
  284. for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
  285. if (!e->sentinel) {
  286. l_del(q->es, q->qs + e->level, e);
  287. return e;
  288. }
  289. return NULL;
  290. }
  291. static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
  292. {
  293. unsigned level, nr_levels, entries_per_level, remainder;
  294. BUG_ON(lbegin > lend);
  295. BUG_ON(lend > q->nr_levels);
  296. nr_levels = lend - lbegin;
  297. entries_per_level = safe_div(nr_elts, nr_levels);
  298. remainder = safe_mod(nr_elts, nr_levels);
  299. for (level = lbegin; level < lend; level++)
  300. q->target_count[level] =
  301. (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
  302. }
  303. /*
  304. * Typically we have fewer elements in the top few levels which allows us
  305. * to adjust the promote threshold nicely.
  306. */
  307. static void q_set_targets(struct queue *q)
  308. {
  309. if (q->last_target_nr_elts == q->nr_elts)
  310. return;
  311. q->last_target_nr_elts = q->nr_elts;
  312. if (q->nr_top_levels > q->nr_levels)
  313. q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
  314. else {
  315. q_set_targets_subrange_(q, q->nr_in_top_levels,
  316. q->nr_levels - q->nr_top_levels, q->nr_levels);
  317. if (q->nr_in_top_levels < q->nr_elts)
  318. q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
  319. 0, q->nr_levels - q->nr_top_levels);
  320. else
  321. q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
  322. }
  323. }
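  /*
   * Rebalance the levels towards their target counts: a level below its
   * target pulls entries down from the level above, and a level over its
   * target pushes its excess entries up into the level above.
   */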
  324. static void q_redistribute(struct queue *q)
  325. {
  326. unsigned target, level;
  327. struct ilist *l, *l_above;
  328. struct entry *e;
  329. q_set_targets(q);
  330. for (level = 0u; level < q->nr_levels - 1u; level++) {
  331. l = q->qs + level;
  332. target = q->target_count[level];
  333. /*
  334. * Pull down some entries from the level above.
  335. */
  336. while (l->nr_elts < target) {
  337. e = __redist_pop_from(q, level + 1u);
  338. if (!e) {
  339. /* bug in nr_elts */
  340. break;
  341. }
  342. e->level = level;
  343. l_add_tail(q->es, l, e);
  344. }
  345. /*
  346. * Push some entries up.
  347. */
  348. l_above = q->qs + level + 1u;
  349. while (l->nr_elts > target) {
  350. e = l_pop_tail(q->es, l);
  351. if (!e)
  352. /* bug in nr_elts */
  353. break;
  354. e->level = level + 1u;
  355. l_add_head(q->es, l_above, e);
  356. }
  357. }
  358. }
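  /*
   * Promote e by up to extra_levels. One non-sentinel entry from the
   * destination level is moved down to e's old level (just in front of
   * dest, if given), presumably to keep the level populations stable.
   */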
  359. static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
  360. {
  361. struct entry *de;
  362. unsigned new_level;
  363. q_del(q, e);
  364. if (extra_levels && (e->level < q->nr_levels - 1u)) {
  365. new_level = min(q->nr_levels - 1u, e->level + extra_levels);
  366. for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
  367. if (de->sentinel)
  368. continue;
  369. q_del(q, de);
  370. de->level = e->level;
  371. if (dest)
  372. q_push_before(q, dest, de);
  373. else
  374. q_push(q, de);
  375. break;
  376. }
  377. e->level = new_level;
  378. }
  379. q_push(q, e);
  380. }
  381. static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
  382. {
  383. q_requeue_before(q, NULL, e, extra_levels);
  384. }
  385. /*----------------------------------------------------------------*/
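  /*
   * Fixed-point fractions with FP_SHIFT fractional bits: SIXTEENTH and
   * EIGHTH are 1/16 and 1/8 in that representation.
   */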
  386. #define FP_SHIFT 8
  387. #define SIXTEENTH (1u << (FP_SHIFT - 4u))
  388. #define EIGHTH (1u << (FP_SHIFT - 3u))
  389. struct stats {
  390. unsigned hit_threshold;
  391. unsigned hits;
  392. unsigned misses;
  393. };
  394. enum performance {
  395. Q_POOR,
  396. Q_FAIR,
  397. Q_WELL
  398. };
  399. static void stats_init(struct stats *s, unsigned nr_levels)
  400. {
  401. s->hit_threshold = (nr_levels * 3u) / 4u;
  402. s->hits = 0u;
  403. s->misses = 0u;
  404. }
  405. static void stats_reset(struct stats *s)
  406. {
  407. s->hits = s->misses = 0u;
  408. }
  409. static void stats_level_accessed(struct stats *s, unsigned level)
  410. {
  411. if (level >= s->hit_threshold)
  412. s->hits++;
  413. else
  414. s->misses++;
  415. }
  416. static void stats_miss(struct stats *s)
  417. {
  418. s->misses++;
  419. }
  420. /*
  421. * There are times when we don't have any confidence in the hotspot queue.
  422. * Such as when a fresh cache is created and the blocks have been spread
  423. * out across the levels, or if an io load changes. We detect this by
  424. * seeing how often a lookup is in the top levels of the hotspot queue.
  425. */
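  /*
   * Worked example: 10 hits out of 200 lookups gives a confidence of
   * (10 << 8) / 200 = 12, which is below SIXTEENTH (16) and so rates
   * as Q_POOR.
   */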
  426. static enum performance stats_assess(struct stats *s)
  427. {
  428. unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
  429. if (confidence < SIXTEENTH)
  430. return Q_POOR;
  431. else if (confidence < EIGHTH)
  432. return Q_FAIR;
  433. else
  434. return Q_WELL;
  435. }
  436. /*----------------------------------------------------------------*/
  437. struct hash_table {
  438. struct entry_space *es;
  439. unsigned long long hash_bits;
  440. unsigned *buckets;
  441. };
  442. /*
  443. * All cache entries are stored in a chained hash table. To save space we
  444. * use indexing again, and only store indexes to the next entry.
  445. */
  446. static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
  447. {
  448. unsigned i, nr_buckets;
  449. ht->es = es;
  450. nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
  451. ht->hash_bits = ffs(nr_buckets) - 1;
  452. ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
  453. if (!ht->buckets)
  454. return -ENOMEM;
  455. for (i = 0; i < nr_buckets; i++)
  456. ht->buckets[i] = INDEXER_NULL;
  457. return 0;
  458. }
  459. static void h_exit(struct hash_table *ht)
  460. {
  461. vfree(ht->buckets);
  462. }
  463. static struct entry *h_head(struct hash_table *ht, unsigned bucket)
  464. {
  465. return to_entry(ht->es, ht->buckets[bucket]);
  466. }
  467. static struct entry *h_next(struct hash_table *ht, struct entry *e)
  468. {
  469. return to_entry(ht->es, e->hash_next);
  470. }
  471. static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
  472. {
  473. e->hash_next = ht->buckets[bucket];
  474. ht->buckets[bucket] = to_index(ht->es, e);
  475. }
  476. static void h_insert(struct hash_table *ht, struct entry *e)
  477. {
  478. unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
  479. __h_insert(ht, h, e);
  480. }
  481. static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
  482. struct entry **prev)
  483. {
  484. struct entry *e;
  485. *prev = NULL;
  486. for (e = h_head(ht, h); e; e = h_next(ht, e)) {
  487. if (e->oblock == oblock)
  488. return e;
  489. *prev = e;
  490. }
  491. return NULL;
  492. }
  493. static void __h_unlink(struct hash_table *ht, unsigned h,
  494. struct entry *e, struct entry *prev)
  495. {
  496. if (prev)
  497. prev->hash_next = e->hash_next;
  498. else
  499. ht->buckets[h] = e->hash_next;
  500. }
  501. /*
  502. * Also moves each entry to the front of the bucket.
  503. */
  504. static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
  505. {
  506. struct entry *e, *prev;
  507. unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
  508. e = __h_lookup(ht, h, oblock, &prev);
  509. if (e && prev) {
  510. /*
  511. * Move to the front because this entry is likely
  512. * to be hit again.
  513. */
  514. __h_unlink(ht, h, e, prev);
  515. __h_insert(ht, h, e);
  516. }
  517. return e;
  518. }
  519. static void h_remove(struct hash_table *ht, struct entry *e)
  520. {
  521. unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
  522. struct entry *prev;
  523. /*
  524. * The down side of using a singly linked list is we have to
  525. * iterate the bucket to remove an item.
  526. */
  527. e = __h_lookup(ht, h, e->oblock, &prev);
  528. if (e)
  529. __h_unlink(ht, h, e, prev);
  530. }
  531. /*----------------------------------------------------------------*/
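  /*
   * A simple allocator managing a contiguous range [begin, end) of the
   * entry space; free entries are kept on an ilist.
   */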
  532. struct entry_alloc {
  533. struct entry_space *es;
  534. unsigned begin;
  535. unsigned nr_allocated;
  536. struct ilist free;
  537. };
  538. static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
  539. unsigned begin, unsigned end)
  540. {
  541. unsigned i;
  542. ea->es = es;
  543. ea->nr_allocated = 0u;
  544. ea->begin = begin;
  545. l_init(&ea->free);
  546. for (i = begin; i != end; i++)
  547. l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
  548. }
  549. static void init_entry(struct entry *e)
  550. {
  551. /*
  552. * We can't memset because that would clear the hotspot and
  553. * sentinel bits which remain constant.
  554. */
  555. e->hash_next = INDEXER_NULL;
  556. e->next = INDEXER_NULL;
  557. e->prev = INDEXER_NULL;
  558. e->level = 0u;
  559. e->allocated = true;
  560. }
  561. static struct entry *alloc_entry(struct entry_alloc *ea)
  562. {
  563. struct entry *e;
  564. if (l_empty(&ea->free))
  565. return NULL;
  566. e = l_pop_tail(ea->es, &ea->free);
  567. init_entry(e);
  568. ea->nr_allocated++;
  569. return e;
  570. }
  571. /*
  572. * This assumes the cblock hasn't already been allocated.
  573. */
  574. static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
  575. {
  576. struct entry *e = __get_entry(ea->es, ea->begin + i);
  577. BUG_ON(e->allocated);
  578. l_del(ea->es, &ea->free, e);
  579. init_entry(e);
  580. ea->nr_allocated++;
  581. return e;
  582. }
  583. static void free_entry(struct entry_alloc *ea, struct entry *e)
  584. {
  585. BUG_ON(!ea->nr_allocated);
  586. BUG_ON(!e->allocated);
  587. ea->nr_allocated--;
  588. e->allocated = false;
  589. l_add_tail(ea->es, &ea->free, e);
  590. }
  591. static bool allocator_empty(struct entry_alloc *ea)
  592. {
  593. return l_empty(&ea->free);
  594. }
  595. static unsigned get_index(struct entry_alloc *ea, struct entry *e)
  596. {
  597. return to_index(ea->es, e) - ea->begin;
  598. }
  599. static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
  600. {
  601. return __get_entry(ea->es, ea->begin + index);
  602. }
  603. /*----------------------------------------------------------------*/
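  /* Periods, in jiffies, driving sentinel rotation and queue housekeeping. */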
  604. #define NR_HOTSPOT_LEVELS 64u
  605. #define NR_CACHE_LEVELS 64u
  606. #define WRITEBACK_PERIOD (10 * HZ)
  607. #define DEMOTE_PERIOD (60 * HZ)
  608. #define HOTSPOT_UPDATE_PERIOD (HZ)
  609. #define CACHE_UPDATE_PERIOD (10u * HZ)
  610. struct smq_policy {
  611. struct dm_cache_policy policy;
  612. /* protects everything */
  613. struct mutex lock;
  614. dm_cblock_t cache_size;
  615. sector_t cache_block_size;
  616. sector_t hotspot_block_size;
  617. unsigned nr_hotspot_blocks;
  618. unsigned cache_blocks_per_hotspot_block;
  619. unsigned hotspot_level_jump;
  620. struct entry_space es;
  621. struct entry_alloc writeback_sentinel_alloc;
  622. struct entry_alloc demote_sentinel_alloc;
  623. struct entry_alloc hotspot_alloc;
  624. struct entry_alloc cache_alloc;
  625. unsigned long *hotspot_hit_bits;
  626. unsigned long *cache_hit_bits;
  627. /*
  628. * We maintain three queues of entries. The cache proper consists
  629. * of a clean queue and a dirty queue holding the currently
  630. * active mappings. The hotspot queue uses a larger block size to
  631. * track blocks that are being hit frequently and potential
  632. * candidates for promotion to the cache.
  633. */
  634. struct queue hotspot;
  635. struct queue clean;
  636. struct queue dirty;
  637. struct stats hotspot_stats;
  638. struct stats cache_stats;
  639. /*
  640. * Keeps track of time, incremented by the core. We use this to
  641. * avoid attributing multiple hits within the same tick.
  642. *
  643. * Access to tick_protected should be done with the spin lock held.
  644. * It's copied to tick at the start of the map function (within the
  645. * mutex).
  646. */
  647. spinlock_t tick_lock;
  648. unsigned tick_protected;
  649. unsigned tick;
  650. /*
  651. * The hash tables allow us to quickly find an entry by origin
  652. * block.
  653. */
  654. struct hash_table table;
  655. struct hash_table hotspot_table;
  656. bool current_writeback_sentinels;
  657. unsigned long next_writeback_period;
  658. bool current_demote_sentinels;
  659. unsigned long next_demote_period;
  660. unsigned write_promote_level;
  661. unsigned read_promote_level;
  662. unsigned long next_hotspot_period;
  663. unsigned long next_cache_period;
  664. };
  665. /*----------------------------------------------------------------*/
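  /*
   * Sentinel entries sit in the clean/dirty queue levels to mark period
   * boundaries for the demotion and writeback logic. Two alternating
   * sets are kept per queue; 'which' selects the set for a level.
   */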
  666. static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
  667. {
  668. return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
  669. }
  670. static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
  671. {
  672. return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
  673. }
  674. static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
  675. {
  676. return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
  677. }
  678. static void __update_writeback_sentinels(struct smq_policy *mq)
  679. {
  680. unsigned level;
  681. struct queue *q = &mq->dirty;
  682. struct entry *sentinel;
  683. for (level = 0; level < q->nr_levels; level++) {
  684. sentinel = writeback_sentinel(mq, level);
  685. q_del(q, sentinel);
  686. q_push(q, sentinel);
  687. }
  688. }
  689. static void __update_demote_sentinels(struct smq_policy *mq)
  690. {
  691. unsigned level;
  692. struct queue *q = &mq->clean;
  693. struct entry *sentinel;
  694. for (level = 0; level < q->nr_levels; level++) {
  695. sentinel = demote_sentinel(mq, level);
  696. q_del(q, sentinel);
  697. q_push(q, sentinel);
  698. }
  699. }
  700. static void update_sentinels(struct smq_policy *mq)
  701. {
  702. if (time_after(jiffies, mq->next_writeback_period)) {
  703. __update_writeback_sentinels(mq);
  704. mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
  705. mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
  706. }
  707. if (time_after(jiffies, mq->next_demote_period)) {
  708. __update_demote_sentinels(mq);
  709. mq->next_demote_period = jiffies + DEMOTE_PERIOD;
  710. mq->current_demote_sentinels = !mq->current_demote_sentinels;
  711. }
  712. }
  713. static void __sentinels_init(struct smq_policy *mq)
  714. {
  715. unsigned level;
  716. struct entry *sentinel;
  717. for (level = 0; level < NR_CACHE_LEVELS; level++) {
  718. sentinel = writeback_sentinel(mq, level);
  719. sentinel->level = level;
  720. q_push(&mq->dirty, sentinel);
  721. sentinel = demote_sentinel(mq, level);
  722. sentinel->level = level;
  723. q_push(&mq->clean, sentinel);
  724. }
  725. }
  726. static void sentinels_init(struct smq_policy *mq)
  727. {
  728. mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
  729. mq->next_demote_period = jiffies + DEMOTE_PERIOD;
  730. mq->current_writeback_sentinels = false;
  731. mq->current_demote_sentinels = false;
  732. __sentinels_init(mq);
  733. mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
  734. mq->current_demote_sentinels = !mq->current_demote_sentinels;
  735. __sentinels_init(mq);
  736. }
  737. /*----------------------------------------------------------------*/
  738. /*
  739. * These methods tie together the dirty queue, clean queue and hash table.
  740. */
  741. static void push_new(struct smq_policy *mq, struct entry *e)
  742. {
  743. struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
  744. h_insert(&mq->table, e);
  745. q_push(q, e);
  746. }
  747. static void push(struct smq_policy *mq, struct entry *e)
  748. {
  749. struct entry *sentinel;
  750. h_insert(&mq->table, e);
  751. /*
  752. * Punch this into the queue just in front of the sentinel, to
  753. * ensure it's cleaned straight away.
  754. */
  755. if (e->dirty) {
  756. sentinel = writeback_sentinel(mq, e->level);
  757. q_push_before(&mq->dirty, sentinel, e);
  758. } else {
  759. sentinel = demote_sentinel(mq, e->level);
  760. q_push_before(&mq->clean, sentinel, e);
  761. }
  762. }
  763. /*
  764. * Removes an entry from its cache queue and from the hash table.
  765. */
  766. static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
  767. {
  768. q_del(q, e);
  769. h_remove(&mq->table, e);
  770. }
  771. static void del(struct smq_policy *mq, struct entry *e)
  772. {
  773. __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
  774. }
  775. static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
  776. {
  777. struct entry *e = q_pop_old(q, max_level);
  778. if (e)
  779. h_remove(&mq->table, e);
  780. return e;
  781. }
  782. static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
  783. {
  784. return to_cblock(get_index(&mq->cache_alloc, e));
  785. }
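  /*
   * Called on a cache hit: the entry is only requeued (and its level
   * bumped) the first time it is hit within a cache period, courtesy of
   * the cache hit bitset.
   */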
  786. static void requeue(struct smq_policy *mq, struct entry *e)
  787. {
  788. struct entry *sentinel;
  789. if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
  790. if (e->dirty) {
  791. sentinel = writeback_sentinel(mq, e->level);
  792. q_requeue_before(&mq->dirty, sentinel, e, 1u);
  793. } else {
  794. sentinel = demote_sentinel(mq, e->level);
  795. q_requeue_before(&mq->clean, sentinel, e, 1u);
  796. }
  797. }
  798. }
  799. static unsigned default_promote_level(struct smq_policy *mq)
  800. {
  801. /*
  802. * The promote level depends on the current performance of the
  803. * cache.
  804. *
  805. * If the cache is performing badly, then we can't afford
  806. * to promote much without causing performance to drop below that
  807. * of the origin device.
  808. *
  809. * If the cache is performing well, then we don't need to promote
  810. * much. If it isn't broken, don't fix it.
  811. *
  812. * If the cache is middling then we promote more.
  813. *
  814. * This scheme reminds me of a graph of entropy vs probability of a
  815. * binary variable.
  816. */
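  /*
   * Worked example: 300 hits and 100 misses gives an index of
   * (300 << 4) / 400 = 12, so the promote level is table[12] = 3.
   */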
  817. static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
  818. unsigned hits = mq->cache_stats.hits;
  819. unsigned misses = mq->cache_stats.misses;
  820. unsigned index = safe_div(hits << 4u, hits + misses);
  821. return table[index];
  822. }
  823. static void update_promote_levels(struct smq_policy *mq)
  824. {
  825. /*
  826. * If there are unused cache entries then we want to be really
  827. * eager to promote.
  828. */
  829. unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
  830. default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
  831. /*
  832. * If the hotspot queue is performing badly then we have little
  833. * confidence that we know which blocks to promote. So we cut down
  834. * the number of promotions.
  835. */
  836. switch (stats_assess(&mq->hotspot_stats)) {
  837. case Q_POOR:
  838. threshold_level /= 4u;
  839. break;
  840. case Q_FAIR:
  841. threshold_level /= 2u;
  842. break;
  843. case Q_WELL:
  844. break;
  845. }
  846. mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
  847. mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
  848. }
  849. /*
  850. * If the hotspot queue is performing badly, then we try and move entries
  851. * around more quickly.
  852. */
  853. static void update_level_jump(struct smq_policy *mq)
  854. {
  855. switch (stats_assess(&mq->hotspot_stats)) {
  856. case Q_POOR:
  857. mq->hotspot_level_jump = 4u;
  858. break;
  859. case Q_FAIR:
  860. mq->hotspot_level_jump = 2u;
  861. break;
  862. case Q_WELL:
  863. mq->hotspot_level_jump = 1u;
  864. break;
  865. }
  866. }
  867. static void end_hotspot_period(struct smq_policy *mq)
  868. {
  869. clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
  870. update_promote_levels(mq);
  871. if (time_after(jiffies, mq->next_hotspot_period)) {
  872. update_level_jump(mq);
  873. q_redistribute(&mq->hotspot);
  874. stats_reset(&mq->hotspot_stats);
  875. mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
  876. }
  877. }
  878. static void end_cache_period(struct smq_policy *mq)
  879. {
  880. if (time_after(jiffies, mq->next_cache_period)) {
  881. clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
  882. q_redistribute(&mq->dirty);
  883. q_redistribute(&mq->clean);
  884. stats_reset(&mq->cache_stats);
  885. mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
  886. }
  887. }
  888. static int demote_cblock(struct smq_policy *mq,
  889. struct policy_locker *locker,
  890. dm_oblock_t *oblock)
  891. {
  892. struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
  893. if (!demoted)
  894. /*
  895. * We could get a block from mq->dirty, but that
  896. * would add extra latency to the triggering bio as it
  897. * waits for the writeback. Better to not promote this
  898. * time and hope there's a clean block next time this block
  899. * is hit.
  900. */
  901. return -ENOSPC;
  902. if (locker->fn(locker, demoted->oblock))
  903. /*
  904. * We couldn't lock this block.
  905. */
  906. return -EBUSY;
  907. del(mq, demoted);
  908. *oblock = demoted->oblock;
  909. free_entry(&mq->cache_alloc, demoted);
  910. return 0;
  911. }
  912. enum promote_result {
  913. PROMOTE_NOT,
  914. PROMOTE_TEMPORARY,
  915. PROMOTE_PERMANENT
  916. };
  917. /*
  918. * Converts a boolean into a promote result.
  919. */
  920. static enum promote_result maybe_promote(bool promote)
  921. {
  922. return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
  923. }
  924. static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
  925. bool fast_promote)
  926. {
  927. if (bio_data_dir(bio) == WRITE) {
  928. if (!allocator_empty(&mq->cache_alloc) && fast_promote)
  929. return PROMOTE_TEMPORARY;
  930. else
  931. return maybe_promote(hs_e->level >= mq->write_promote_level);
  932. } else
  933. return maybe_promote(hs_e->level >= mq->read_promote_level);
  934. }
  935. static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
  936. struct policy_locker *locker,
  937. struct policy_result *result, enum promote_result pr)
  938. {
  939. int r;
  940. struct entry *e;
  941. if (allocator_empty(&mq->cache_alloc)) {
  942. result->op = POLICY_REPLACE;
  943. r = demote_cblock(mq, locker, &result->old_oblock);
  944. if (r) {
  945. result->op = POLICY_MISS;
  946. return;
  947. }
  948. } else
  949. result->op = POLICY_NEW;
  950. e = alloc_entry(&mq->cache_alloc);
  951. BUG_ON(!e);
  952. e->oblock = oblock;
  953. if (pr == PROMOTE_TEMPORARY)
  954. push(mq, e);
  955. else
  956. push_new(mq, e);
  957. result->cblock = infer_cblock(mq, e);
  958. }
  959. static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
  960. {
  961. sector_t r = from_oblock(b);
  962. (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
  963. return to_oblock(r);
  964. }
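  /*
   * Record a hit on the hotspot block containing origin block b. On a
   * hit the entry is requeued, jumping up hotspot_level_jump levels the
   * first time it is hit in a period (the hit bitset suppresses further
   * jumps). On a miss a free hotspot entry is allocated, or the oldest
   * one recycled, to track this hotspot block.
   */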
  965. static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
  966. {
  967. unsigned hi;
  968. dm_oblock_t hb = to_hblock(mq, b);
  969. struct entry *e = h_lookup(&mq->hotspot_table, hb);
  970. if (e) {
  971. stats_level_accessed(&mq->hotspot_stats, e->level);
  972. hi = get_index(&mq->hotspot_alloc, e);
  973. q_requeue(&mq->hotspot, e,
  974. test_and_set_bit(hi, mq->hotspot_hit_bits) ?
  975. 0u : mq->hotspot_level_jump);
  976. } else {
  977. stats_miss(&mq->hotspot_stats);
  978. e = alloc_entry(&mq->hotspot_alloc);
  979. if (!e) {
  980. e = q_pop(&mq->hotspot);
  981. if (e) {
  982. h_remove(&mq->hotspot_table, e);
  983. hi = get_index(&mq->hotspot_alloc, e);
  984. clear_bit(hi, mq->hotspot_hit_bits);
  985. }
  986. }
  987. if (e) {
  988. e->oblock = hb;
  989. q_push(&mq->hotspot, e);
  990. h_insert(&mq->hotspot_table, e);
  991. }
  992. }
  993. return e;
  994. }
  995. /*
  996. * Looks up the oblock in the hash table, then decides whether to
  997. * promote it to the cache.
  998. */
  999. static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
  1000. bool can_migrate, bool fast_promote,
  1001. struct policy_locker *locker, struct policy_result *result)
  1002. {
  1003. struct entry *e, *hs_e;
  1004. enum promote_result pr;
  1005. hs_e = update_hotspot_queue(mq, oblock, bio);
  1006. e = h_lookup(&mq->table, oblock);
  1007. if (e) {
  1008. stats_level_accessed(&mq->cache_stats, e->level);
  1009. requeue(mq, e);
  1010. result->op = POLICY_HIT;
  1011. result->cblock = infer_cblock(mq, e);
  1012. } else {
  1013. stats_miss(&mq->cache_stats);
  1014. pr = should_promote(mq, hs_e, bio, fast_promote);
  1015. if (pr == PROMOTE_NOT)
  1016. result->op = POLICY_MISS;
  1017. else {
  1018. if (!can_migrate) {
  1019. result->op = POLICY_MISS;
  1020. return -EWOULDBLOCK;
  1021. }
  1022. insert_in_cache(mq, oblock, locker, result, pr);
  1023. }
  1024. }
  1025. return 0;
  1026. }
  1027. /*----------------------------------------------------------------*/
  1028. /*
  1029. * Public interface, via the policy struct. See dm-cache-policy.h for a
  1030. * description of these.
  1031. */
  1032. static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
  1033. {
  1034. return container_of(p, struct smq_policy, policy);
  1035. }
  1036. static void smq_destroy(struct dm_cache_policy *p)
  1037. {
  1038. struct smq_policy *mq = to_smq_policy(p);
  1039. h_exit(&mq->hotspot_table);
  1040. h_exit(&mq->table);
  1041. free_bitset(mq->hotspot_hit_bits);
  1042. free_bitset(mq->cache_hit_bits);
  1043. space_exit(&mq->es);
  1044. kfree(mq);
  1045. }
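  /*
   * Performs the once-per-tick housekeeping (sentinel rotation, end of
   * hotspot/cache periods) if the tick has advanced since the last call.
   */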
  1046. static void copy_tick(struct smq_policy *mq)
  1047. {
  1048. unsigned long flags, tick;
  1049. spin_lock_irqsave(&mq->tick_lock, flags);
  1050. tick = mq->tick_protected;
  1051. if (tick != mq->tick) {
  1052. update_sentinels(mq);
  1053. end_hotspot_period(mq);
  1054. end_cache_period(mq);
  1055. mq->tick = tick;
  1056. }
  1057. spin_unlock_irqrestore(&mq->tick_lock, flags);
  1058. }
  1059. static bool maybe_lock(struct smq_policy *mq, bool can_block)
  1060. {
  1061. if (can_block) {
  1062. mutex_lock(&mq->lock);
  1063. return true;
  1064. } else
  1065. return mutex_trylock(&mq->lock);
  1066. }
  1067. static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
  1068. bool can_block, bool can_migrate, bool fast_promote,
  1069. struct bio *bio, struct policy_locker *locker,
  1070. struct policy_result *result)
  1071. {
  1072. int r;
  1073. struct smq_policy *mq = to_smq_policy(p);
  1074. result->op = POLICY_MISS;
  1075. if (!maybe_lock(mq, can_block))
  1076. return -EWOULDBLOCK;
  1077. copy_tick(mq);
  1078. r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
  1079. mutex_unlock(&mq->lock);
  1080. return r;
  1081. }
  1082. static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
  1083. {
  1084. int r;
  1085. struct smq_policy *mq = to_smq_policy(p);
  1086. struct entry *e;
  1087. if (!mutex_trylock(&mq->lock))
  1088. return -EWOULDBLOCK;
  1089. e = h_lookup(&mq->table, oblock);
  1090. if (e) {
  1091. *cblock = infer_cblock(mq, e);
  1092. r = 0;
  1093. } else
  1094. r = -ENOENT;
  1095. mutex_unlock(&mq->lock);
  1096. return r;
  1097. }
  1098. static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
  1099. {
  1100. struct entry *e;
  1101. e = h_lookup(&mq->table, oblock);
  1102. BUG_ON(!e);
  1103. del(mq, e);
  1104. e->dirty = set;
  1105. push(mq, e);
  1106. }
  1107. static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
  1108. {
  1109. struct smq_policy *mq = to_smq_policy(p);
  1110. mutex_lock(&mq->lock);
  1111. __smq_set_clear_dirty(mq, oblock, true);
  1112. mutex_unlock(&mq->lock);
  1113. }
  1114. static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
  1115. {
  1116. struct smq_policy *mq = to_smq_policy(p);
  1117. mutex_lock(&mq->lock);
  1118. __smq_set_clear_dirty(mq, oblock, false);
  1119. mutex_unlock(&mq->lock);
  1120. }
  1121. static int smq_load_mapping(struct dm_cache_policy *p,
  1122. dm_oblock_t oblock, dm_cblock_t cblock,
  1123. uint32_t hint, bool hint_valid)
  1124. {
  1125. struct smq_policy *mq = to_smq_policy(p);
  1126. struct entry *e;
  1127. e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
  1128. e->oblock = oblock;
  1129. e->dirty = false; /* this gets corrected in a minute */
  1130. e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
  1131. push(mq, e);
  1132. return 0;
  1133. }
  1134. static int smq_save_hints(struct smq_policy *mq, struct queue *q,
  1135. policy_walk_fn fn, void *context)
  1136. {
  1137. int r;
  1138. unsigned level;
  1139. struct entry *e;
  1140. for (level = 0; level < q->nr_levels; level++)
  1141. for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
  1142. if (!e->sentinel) {
  1143. r = fn(context, infer_cblock(mq, e),
  1144. e->oblock, e->level);
  1145. if (r)
  1146. return r;
  1147. }
  1148. }
  1149. return 0;
  1150. }
  1151. static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
  1152. void *context)
  1153. {
  1154. struct smq_policy *mq = to_smq_policy(p);
  1155. int r = 0;
  1156. mutex_lock(&mq->lock);
  1157. r = smq_save_hints(mq, &mq->clean, fn, context);
  1158. if (!r)
  1159. r = smq_save_hints(mq, &mq->dirty, fn, context);
  1160. mutex_unlock(&mq->lock);
  1161. return r;
  1162. }
  1163. static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
  1164. {
  1165. struct entry *e;
  1166. e = h_lookup(&mq->table, oblock);
  1167. BUG_ON(!e);
  1168. del(mq, e);
  1169. free_entry(&mq->cache_alloc, e);
  1170. }
  1171. static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
  1172. {
  1173. struct smq_policy *mq = to_smq_policy(p);
  1174. mutex_lock(&mq->lock);
  1175. __remove_mapping(mq, oblock);
  1176. mutex_unlock(&mq->lock);
  1177. }
  1178. static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
  1179. {
  1180. struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
  1181. if (!e || !e->allocated)
  1182. return -ENODATA;
  1183. del(mq, e);
  1184. free_entry(&mq->cache_alloc, e);
  1185. return 0;
  1186. }
  1187. static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
  1188. {
  1189. int r;
  1190. struct smq_policy *mq = to_smq_policy(p);
  1191. mutex_lock(&mq->lock);
  1192. r = __remove_cblock(mq, cblock);
  1193. mutex_unlock(&mq->lock);
  1194. return r;
  1195. }
  1196. #define CLEAN_TARGET_CRITICAL 5u /* percent */
  1197. static bool clean_target_met(struct smq_policy *mq, bool critical)
  1198. {
  1199. if (critical) {
  1200. /*
  1201. * Cache entries may not be populated, so we cannot rely on the
  1202. * size of the clean queue.
  1203. */
  1204. unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
  1205. unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
  1206. return nr_clean >= target;
  1207. } else
  1208. return !q_size(&mq->dirty);
  1209. }
  1210. static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
  1211. dm_cblock_t *cblock, bool critical_only)
  1212. {
  1213. struct entry *e = NULL;
  1214. bool target_met = clean_target_met(mq, critical_only);
  1215. if (critical_only)
  1216. /*
  1217. * Always try and keep the bottom level clean.
  1218. */
  1219. e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
  1220. else
  1221. e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
  1222. if (!e)
  1223. return -ENODATA;
  1224. *oblock = e->oblock;
  1225. *cblock = infer_cblock(mq, e);
  1226. e->dirty = false;
  1227. push_new(mq, e);
  1228. return 0;
  1229. }
  1230. static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
  1231. dm_cblock_t *cblock, bool critical_only)
  1232. {
  1233. int r;
  1234. struct smq_policy *mq = to_smq_policy(p);
  1235. mutex_lock(&mq->lock);
  1236. r = __smq_writeback_work(mq, oblock, cblock, critical_only);
  1237. mutex_unlock(&mq->lock);
  1238. return r;
  1239. }
  1240. static void __force_mapping(struct smq_policy *mq,
  1241. dm_oblock_t current_oblock, dm_oblock_t new_oblock)
  1242. {
  1243. struct entry *e = h_lookup(&mq->table, current_oblock);
  1244. if (e) {
  1245. del(mq, e);
  1246. e->oblock = new_oblock;
  1247. e->dirty = true;
  1248. push(mq, e);
  1249. }
  1250. }
  1251. static void smq_force_mapping(struct dm_cache_policy *p,
  1252. dm_oblock_t current_oblock, dm_oblock_t new_oblock)
  1253. {
  1254. struct smq_policy *mq = to_smq_policy(p);
  1255. mutex_lock(&mq->lock);
  1256. __force_mapping(mq, current_oblock, new_oblock);
  1257. mutex_unlock(&mq->lock);
  1258. }
  1259. static dm_cblock_t smq_residency(struct dm_cache_policy *p)
  1260. {
  1261. dm_cblock_t r;
  1262. struct smq_policy *mq = to_smq_policy(p);
  1263. mutex_lock(&mq->lock);
  1264. r = to_cblock(mq->cache_alloc.nr_allocated);
  1265. mutex_unlock(&mq->lock);
  1266. return r;
  1267. }
  1268. static void smq_tick(struct dm_cache_policy *p, bool can_block)
  1269. {
  1270. struct smq_policy *mq = to_smq_policy(p);
  1271. unsigned long flags;
  1272. spin_lock_irqsave(&mq->tick_lock, flags);
  1273. mq->tick_protected++;
  1274. spin_unlock_irqrestore(&mq->tick_lock, flags);
  1275. if (can_block) {
  1276. mutex_lock(&mq->lock);
  1277. copy_tick(mq);
  1278. mutex_unlock(&mq->lock);
  1279. }
  1280. }
  1281. /* Init the policy plugin interface function pointers. */
  1282. static void init_policy_functions(struct smq_policy *mq)
  1283. {
  1284. mq->policy.destroy = smq_destroy;
  1285. mq->policy.map = smq_map;
  1286. mq->policy.lookup = smq_lookup;
  1287. mq->policy.set_dirty = smq_set_dirty;
  1288. mq->policy.clear_dirty = smq_clear_dirty;
  1289. mq->policy.load_mapping = smq_load_mapping;
  1290. mq->policy.walk_mappings = smq_walk_mappings;
  1291. mq->policy.remove_mapping = smq_remove_mapping;
  1292. mq->policy.remove_cblock = smq_remove_cblock;
  1293. mq->policy.writeback_work = smq_writeback_work;
  1294. mq->policy.force_mapping = smq_force_mapping;
  1295. mq->policy.residency = smq_residency;
  1296. mq->policy.tick = smq_tick;
  1297. }
  1298. static bool too_many_hotspot_blocks(sector_t origin_size,
  1299. sector_t hotspot_block_size,
  1300. unsigned nr_hotspot_blocks)
  1301. {
  1302. return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
  1303. }
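  /*
   * Hotspot blocks start at 16x the cache block size and are halved
   * (down to the cache block size at most) until they no longer cover
   * more than the origin device.
   */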
  1304. static void calc_hotspot_params(sector_t origin_size,
  1305. sector_t cache_block_size,
  1306. unsigned nr_cache_blocks,
  1307. sector_t *hotspot_block_size,
  1308. unsigned *nr_hotspot_blocks)
  1309. {
  1310. *hotspot_block_size = cache_block_size * 16u;
  1311. *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
  1312. while ((*hotspot_block_size > cache_block_size) &&
  1313. too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
  1314. *hotspot_block_size /= 2u;
  1315. }
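  /*
   * The entry space is laid out as:
   *   [writeback sentinels][demote sentinels][hotspot entries][cache entries]
   * with one allocator managing each region.
   */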
  1316. static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
  1317. sector_t origin_size,
  1318. sector_t cache_block_size)
  1319. {
  1320. unsigned i;
  1321. unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
  1322. unsigned total_sentinels = 2u * nr_sentinels_per_queue;
  1323. struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
  1324. if (!mq)
  1325. return NULL;
  1326. init_policy_functions(mq);
  1327. mq->cache_size = cache_size;
  1328. mq->cache_block_size = cache_block_size;
  1329. calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
  1330. &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
  1331. mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
  1332. mq->hotspot_level_jump = 1u;
  1333. if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
  1334. DMERR("couldn't initialize entry space");
  1335. goto bad_pool_init;
  1336. }
  1337. init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
  1338. for (i = 0; i < nr_sentinels_per_queue; i++)
  1339. get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
  1340. init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
  1341. for (i = 0; i < nr_sentinels_per_queue; i++)
  1342. get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
  1343. init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
  1344. total_sentinels + mq->nr_hotspot_blocks);
  1345. init_allocator(&mq->cache_alloc, &mq->es,
  1346. total_sentinels + mq->nr_hotspot_blocks,
  1347. total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
  1348. mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
  1349. if (!mq->hotspot_hit_bits) {
  1350. DMERR("couldn't allocate hotspot hit bitset");
  1351. goto bad_hotspot_hit_bits;
  1352. }
  1353. clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
  1354. if (from_cblock(cache_size)) {
  1355. mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
  1356. if (!mq->cache_hit_bits) {
  1357. DMERR("couldn't allocate cache hit bitset");
  1358. goto bad_cache_hit_bits;
  1359. }
  1360. clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
  1361. } else
  1362. mq->cache_hit_bits = NULL;
  1363. mq->tick_protected = 0;
  1364. mq->tick = 0;
  1365. mutex_init(&mq->lock);
  1366. spin_lock_init(&mq->tick_lock);
  1367. q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
  1368. mq->hotspot.nr_top_levels = 8;
  1369. mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
  1370. from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
  1371. q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
  1372. q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
  1373. stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
  1374. stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
  1375. if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
  1376. goto bad_alloc_table;
  1377. if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
  1378. goto bad_alloc_hotspot_table;
  1379. sentinels_init(mq);
  1380. mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
  1381. mq->next_hotspot_period = jiffies;
  1382. mq->next_cache_period = jiffies;
  1383. return &mq->policy;
  1384. bad_alloc_hotspot_table:
  1385. h_exit(&mq->table);
  1386. bad_alloc_table:
  1387. free_bitset(mq->cache_hit_bits);
  1388. bad_cache_hit_bits:
  1389. free_bitset(mq->hotspot_hit_bits);
  1390. bad_hotspot_hit_bits:
  1391. space_exit(&mq->es);
  1392. bad_pool_init:
  1393. kfree(mq);
  1394. return NULL;
  1395. }
  1396. /*----------------------------------------------------------------*/
  1397. static struct dm_cache_policy_type smq_policy_type = {
  1398. .name = "smq",
  1399. .version = {1, 0, 0},
  1400. .hint_size = 4,
  1401. .owner = THIS_MODULE,
  1402. .create = smq_create
  1403. };
  1404. static struct dm_cache_policy_type default_policy_type = {
  1405. .name = "default",
  1406. .version = {1, 4, 0},
  1407. .hint_size = 4,
  1408. .owner = THIS_MODULE,
  1409. .create = smq_create,
  1410. .real = &smq_policy_type
  1411. };
  1412. static int __init smq_init(void)
  1413. {
  1414. int r;
  1415. r = dm_cache_policy_register(&smq_policy_type);
  1416. if (r) {
  1417. DMERR("register failed %d", r);
  1418. return -ENOMEM;
  1419. }
  1420. r = dm_cache_policy_register(&default_policy_type);
  1421. if (r) {
  1422. DMERR("register failed (as default) %d", r);
  1423. dm_cache_policy_unregister(&smq_policy_type);
  1424. return -ENOMEM;
  1425. }
  1426. return 0;
  1427. }
  1428. static void __exit smq_exit(void)
  1429. {
  1430. dm_cache_policy_unregister(&smq_policy_type);
  1431. dm_cache_policy_unregister(&default_policy_type);
  1432. }
  1433. module_init(smq_init);
  1434. module_exit(smq_exit);
  1435. MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
  1436. MODULE_LICENSE("GPL");
  1437. MODULE_DESCRIPTION("smq cache policy");