dm-cache-policy-smq.c
  1. /*
  2. * Copyright (C) 2015 Red Hat. All rights reserved.
  3. *
  4. * This file is released under the GPL.
  5. */
  6. #include "dm-cache-background-tracker.h"
  7. #include "dm-cache-policy-internal.h"
  8. #include "dm-cache-policy.h"
  9. #include "dm.h"
  10. #include <linux/hash.h>
  11. #include <linux/jiffies.h>
  12. #include <linux/module.h>
  13. #include <linux/mutex.h>
  14. #include <linux/vmalloc.h>
  15. #include <linux/math64.h>
  16. #define DM_MSG_PREFIX "cache-policy-smq"
  17. /*----------------------------------------------------------------*/
  18. /*
  19. * Safe division functions that return zero on divide by zero.
  20. */
  21. static unsigned safe_div(unsigned n, unsigned d)
  22. {
  23. return d ? n / d : 0u;
  24. }
  25. static unsigned safe_mod(unsigned n, unsigned d)
  26. {
  27. return d ? n % d : 0u;
  28. }
  29. /*----------------------------------------------------------------*/
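/*
 * Entries live in one big array (see entry_space below) and refer to each
 * other by 28-bit array indices rather than pointers, with INDEXER_NULL
 * standing in for NULL.  The same small struct is used for cache entries,
 * hotspot entries and the sentinel markers.
 */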
  30. struct entry {
  31. unsigned hash_next:28;
  32. unsigned prev:28;
  33. unsigned next:28;
  34. unsigned level:6;
  35. bool dirty:1;
  36. bool allocated:1;
  37. bool sentinel:1;
  38. bool pending_work:1;
  39. dm_oblock_t oblock;
  40. };
  41. /*----------------------------------------------------------------*/
  42. #define INDEXER_NULL ((1u << 28u) - 1u)
  43. /*
  44. * An entry_space manages a set of entries that we use for the queues.
  45. * The clean and dirty queues share entries, so this object is separate
  46. * from the queue itself.
  47. */
  48. struct entry_space {
  49. struct entry *begin;
  50. struct entry *end;
  51. };
  52. static int space_init(struct entry_space *es, unsigned nr_entries)
  53. {
  54. if (!nr_entries) {
  55. es->begin = es->end = NULL;
  56. return 0;
  57. }
  58. es->begin = vzalloc(sizeof(struct entry) * nr_entries);
  59. if (!es->begin)
  60. return -ENOMEM;
  61. es->end = es->begin + nr_entries;
  62. return 0;
  63. }
  64. static void space_exit(struct entry_space *es)
  65. {
  66. vfree(es->begin);
  67. }
  68. static struct entry *__get_entry(struct entry_space *es, unsigned block)
  69. {
  70. struct entry *e;
  71. e = es->begin + block;
  72. BUG_ON(e >= es->end);
  73. return e;
  74. }
  75. static unsigned to_index(struct entry_space *es, struct entry *e)
  76. {
  77. BUG_ON(e < es->begin || e >= es->end);
  78. return e - es->begin;
  79. }
  80. static struct entry *to_entry(struct entry_space *es, unsigned block)
  81. {
  82. if (block == INDEXER_NULL)
  83. return NULL;
  84. return __get_entry(es, block);
  85. }
  86. /*----------------------------------------------------------------*/
  87. struct ilist {
  88. unsigned nr_elts; /* excluding sentinel entries */
  89. unsigned head, tail;
  90. };
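/*
 * A doubly linked list threaded through the entries by index.  nr_elts
 * deliberately excludes sentinel entries, so the level-target arithmetic
 * in the queue code only counts real entries.
 *
 * Illustrative use (assuming an initialised entry_space *es; not part of
 * the driver):
 *
 *	struct ilist l;
 *	struct entry *e;
 *
 *	l_init(&l);
 *	l_add_tail(es, &l, __get_entry(es, 0));
 *	e = l_pop_tail(es, &l);		which hands back entry 0
 */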
  91. static void l_init(struct ilist *l)
  92. {
  93. l->nr_elts = 0;
  94. l->head = l->tail = INDEXER_NULL;
  95. }
  96. static struct entry *l_head(struct entry_space *es, struct ilist *l)
  97. {
  98. return to_entry(es, l->head);
  99. }
  100. static struct entry *l_tail(struct entry_space *es, struct ilist *l)
  101. {
  102. return to_entry(es, l->tail);
  103. }
  104. static struct entry *l_next(struct entry_space *es, struct entry *e)
  105. {
  106. return to_entry(es, e->next);
  107. }
  108. static struct entry *l_prev(struct entry_space *es, struct entry *e)
  109. {
  110. return to_entry(es, e->prev);
  111. }
  112. static bool l_empty(struct ilist *l)
  113. {
  114. return l->head == INDEXER_NULL;
  115. }
  116. static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
  117. {
  118. struct entry *head = l_head(es, l);
  119. e->next = l->head;
  120. e->prev = INDEXER_NULL;
  121. if (head)
  122. head->prev = l->head = to_index(es, e);
  123. else
  124. l->head = l->tail = to_index(es, e);
  125. if (!e->sentinel)
  126. l->nr_elts++;
  127. }
  128. static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
  129. {
  130. struct entry *tail = l_tail(es, l);
  131. e->next = INDEXER_NULL;
  132. e->prev = l->tail;
  133. if (tail)
  134. tail->next = l->tail = to_index(es, e);
  135. else
  136. l->head = l->tail = to_index(es, e);
  137. if (!e->sentinel)
  138. l->nr_elts++;
  139. }
  140. static void l_add_before(struct entry_space *es, struct ilist *l,
  141. struct entry *old, struct entry *e)
  142. {
  143. struct entry *prev = l_prev(es, old);
  144. if (!prev)
  145. l_add_head(es, l, e);
  146. else {
  147. e->prev = old->prev;
  148. e->next = to_index(es, old);
  149. prev->next = old->prev = to_index(es, e);
  150. if (!e->sentinel)
  151. l->nr_elts++;
  152. }
  153. }
  154. static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
  155. {
  156. struct entry *prev = l_prev(es, e);
  157. struct entry *next = l_next(es, e);
  158. if (prev)
  159. prev->next = e->next;
  160. else
  161. l->head = e->next;
  162. if (next)
  163. next->prev = e->prev;
  164. else
  165. l->tail = e->prev;
  166. if (!e->sentinel)
  167. l->nr_elts--;
  168. }
  169. static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
  170. {
  171. struct entry *e;
  172. for (e = l_tail(es, l); e; e = l_prev(es, e))
  173. if (!e->sentinel) {
  174. l_del(es, l, e);
  175. return e;
  176. }
  177. return NULL;
  178. }
  179. /*----------------------------------------------------------------*/
  180. /*
  181. * The stochastic-multi-queue is a set of lru lists stacked into levels.
  182. * Entries are moved up levels when they are used, which loosely orders the
  183. * most accessed entries in the top levels and least in the bottom. This
  184. * structure is *much* better than a single lru list.
  185. */
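/*
 * MAX_LEVELS must fit in the 6-bit 'level' field of struct entry
 * (1 << 6 == 64).
 */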
  186. #define MAX_LEVELS 64u
  187. struct queue {
  188. struct entry_space *es;
  189. unsigned nr_elts;
  190. unsigned nr_levels;
  191. struct ilist qs[MAX_LEVELS];
  192. /*
  193. * We maintain a count of the number of entries we would like in each
  194. * level.
  195. */
  196. unsigned last_target_nr_elts;
  197. unsigned nr_top_levels;
  198. unsigned nr_in_top_levels;
  199. unsigned target_count[MAX_LEVELS];
  200. };
  201. static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
  202. {
  203. unsigned i;
  204. q->es = es;
  205. q->nr_elts = 0;
  206. q->nr_levels = nr_levels;
  207. for (i = 0; i < q->nr_levels; i++) {
  208. l_init(q->qs + i);
  209. q->target_count[i] = 0u;
  210. }
  211. q->last_target_nr_elts = 0u;
  212. q->nr_top_levels = 0u;
  213. q->nr_in_top_levels = 0u;
  214. }
  215. static unsigned q_size(struct queue *q)
  216. {
  217. return q->nr_elts;
  218. }
  219. /*
  220. * Insert an entry to the back of the given level.
  221. */
  222. static void q_push(struct queue *q, struct entry *e)
  223. {
  224. BUG_ON(e->pending_work);
  225. if (!e->sentinel)
  226. q->nr_elts++;
  227. l_add_tail(q->es, q->qs + e->level, e);
  228. }
  229. static void q_push_front(struct queue *q, struct entry *e)
  230. {
  231. BUG_ON(e->pending_work);
  232. if (!e->sentinel)
  233. q->nr_elts++;
  234. l_add_head(q->es, q->qs + e->level, e);
  235. }
  236. static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
  237. {
  238. BUG_ON(e->pending_work);
  239. if (!e->sentinel)
  240. q->nr_elts++;
  241. l_add_before(q->es, q->qs + e->level, old, e);
  242. }
  243. static void q_del(struct queue *q, struct entry *e)
  244. {
  245. l_del(q->es, q->qs + e->level, e);
  246. if (!e->sentinel)
  247. q->nr_elts--;
  248. }
  249. /*
  250. * Return the oldest entry of the lowest populated level.
  251. */
  252. static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
  253. {
  254. unsigned level;
  255. struct entry *e;
  256. max_level = min(max_level, q->nr_levels);
  257. for (level = 0; level < max_level; level++)
  258. for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
  259. if (e->sentinel) {
  260. if (can_cross_sentinel)
  261. continue;
  262. else
  263. break;
  264. }
  265. return e;
  266. }
  267. return NULL;
  268. }
  269. static struct entry *q_pop(struct queue *q)
  270. {
  271. struct entry *e = q_peek(q, q->nr_levels, true);
  272. if (e)
  273. q_del(q, e);
  274. return e;
  275. }
  276. /*
  277. * This function assumes there is a non-sentinel entry to pop. It's only
  278. * used by redistribute, so we know this is true. It also doesn't adjust
  279. * the q->nr_elts count.
  280. */
  281. static struct entry *__redist_pop_from(struct queue *q, unsigned level)
  282. {
  283. struct entry *e;
  284. for (; level < q->nr_levels; level++)
  285. for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
  286. if (!e->sentinel) {
  287. l_del(q->es, q->qs + e->level, e);
  288. return e;
  289. }
  290. return NULL;
  291. }
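/*
 * Spread nr_elts as evenly as possible across the levels [lbegin, lend),
 * giving the first 'remainder' levels one extra entry.  For example, 10
 * entries across 4 levels yields targets of 3, 3, 2, 2.
 */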
  292. static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
  293. {
  294. unsigned level, nr_levels, entries_per_level, remainder;
  295. BUG_ON(lbegin > lend);
  296. BUG_ON(lend > q->nr_levels);
  297. nr_levels = lend - lbegin;
  298. entries_per_level = safe_div(nr_elts, nr_levels);
  299. remainder = safe_mod(nr_elts, nr_levels);
  300. for (level = lbegin; level < lend; level++)
  301. q->target_count[level] =
  302. (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
  303. }
  304. /*
  305. * Typically we have fewer elements in the top few levels which allows us
  306. * to adjust the promote threshold nicely.
  307. */
  308. static void q_set_targets(struct queue *q)
  309. {
  310. if (q->last_target_nr_elts == q->nr_elts)
  311. return;
  312. q->last_target_nr_elts = q->nr_elts;
  313. if (q->nr_top_levels > q->nr_levels)
  314. q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
  315. else {
  316. q_set_targets_subrange_(q, q->nr_in_top_levels,
  317. q->nr_levels - q->nr_top_levels, q->nr_levels);
  318. if (q->nr_in_top_levels < q->nr_elts)
  319. q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
  320. 0, q->nr_levels - q->nr_top_levels);
  321. else
  322. q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
  323. }
  324. }
  325. static void q_redistribute(struct queue *q)
  326. {
  327. unsigned target, level;
  328. struct ilist *l, *l_above;
  329. struct entry *e;
  330. q_set_targets(q);
  331. for (level = 0u; level < q->nr_levels - 1u; level++) {
  332. l = q->qs + level;
  333. target = q->target_count[level];
  334. /*
  335. * Pull down some entries from the level above.
  336. */
  337. while (l->nr_elts < target) {
  338. e = __redist_pop_from(q, level + 1u);
  339. if (!e) {
  340. /* bug in nr_elts */
  341. break;
  342. }
  343. e->level = level;
  344. l_add_tail(q->es, l, e);
  345. }
  346. /*
  347. * Push some entries up.
  348. */
  349. l_above = q->qs + level + 1u;
  350. while (l->nr_elts > target) {
  351. e = l_pop_tail(q->es, l);
  352. if (!e)
  353. /* bug in nr_elts */
  354. break;
  355. e->level = level + 1u;
  356. l_add_tail(q->es, l_above, e);
  357. }
  358. }
  359. }
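/*
 * Requeueing moves an entry up 'extra_levels' levels (capped at the top
 * level).  To keep the per-level populations roughly stable, an entry
 * from the destination level is pulled down into the vacated level first;
 * the s1/s2 sentinels let that displaced entry keep its position relative
 * to the writeback sentinels.
 */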
  360. static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
  361. struct entry *s1, struct entry *s2)
  362. {
  363. struct entry *de;
  364. unsigned sentinels_passed = 0;
  365. unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
  366. /* try and find an entry to swap with */
  367. if (extra_levels && (e->level < q->nr_levels - 1u)) {
  368. for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
  369. sentinels_passed++;
  370. if (de) {
  371. q_del(q, de);
  372. de->level = e->level;
  373. if (s1) {
  374. switch (sentinels_passed) {
  375. case 0:
  376. q_push_before(q, s1, de);
  377. break;
  378. case 1:
  379. q_push_before(q, s2, de);
  380. break;
  381. default:
  382. q_push(q, de);
  383. }
  384. } else
  385. q_push(q, de);
  386. }
  387. }
  388. q_del(q, e);
  389. e->level = new_level;
  390. q_push(q, e);
  391. }
  392. /*----------------------------------------------------------------*/
  393. #define FP_SHIFT 8
  394. #define SIXTEENTH (1u << (FP_SHIFT - 4u))
  395. #define EIGHTH (1u << (FP_SHIFT - 3u))
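/*
 * Fixed point fractions: with FP_SHIFT == 8 the value 256 represents 1.0,
 * so SIXTEENTH and EIGHTH are 1/16 and 1/8 respectively.
 */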
  396. struct stats {
  397. unsigned hit_threshold;
  398. unsigned hits;
  399. unsigned misses;
  400. };
  401. enum performance {
  402. Q_POOR,
  403. Q_FAIR,
  404. Q_WELL
  405. };
  406. static void stats_init(struct stats *s, unsigned nr_levels)
  407. {
  408. s->hit_threshold = (nr_levels * 3u) / 4u;
  409. s->hits = 0u;
  410. s->misses = 0u;
  411. }
  412. static void stats_reset(struct stats *s)
  413. {
  414. s->hits = s->misses = 0u;
  415. }
  416. static void stats_level_accessed(struct stats *s, unsigned level)
  417. {
  418. if (level >= s->hit_threshold)
  419. s->hits++;
  420. else
  421. s->misses++;
  422. }
  423. static void stats_miss(struct stats *s)
  424. {
  425. s->misses++;
  426. }
  427. /*
  428. * There are times when we don't have any confidence in the hotspot queue.
  429. * Such as when a fresh cache is created and the blocks have been spread
  430. * out across the levels, or if an io load changes. We detect this by
  431. * seeing how often a lookup is in the top levels of the hotspot queue.
  432. */
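/*
 * Worked example: an access counts as a hit when it lands in the top
 * quarter of levels (hit_threshold above).  With 10 hits and 90 misses,
 * confidence = (10 << 8) / 100 = 25, which sits between SIXTEENTH (16)
 * and EIGHTH (32), so the queue is assessed as Q_FAIR.
 */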
  433. static enum performance stats_assess(struct stats *s)
  434. {
  435. unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
  436. if (confidence < SIXTEENTH)
  437. return Q_POOR;
  438. else if (confidence < EIGHTH)
  439. return Q_FAIR;
  440. else
  441. return Q_WELL;
  442. }
  443. /*----------------------------------------------------------------*/
  444. struct smq_hash_table {
  445. struct entry_space *es;
  446. unsigned long long hash_bits;
  447. unsigned *buckets;
  448. };
  449. /*
  450. * All cache entries are stored in a chained hash table. To save space we
  451. * use indexing again, and only store indexes to the next entry.
  452. */
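/*
 * The bucket count is a quarter of the entry count, rounded up to a power
 * of two with a floor of 16; hash_bits is its log2 (via __ffs on a power
 * of two).  e.g. 10000 entries -> 4096 buckets -> 12 hash bits.
 */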
  453. static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
  454. {
  455. unsigned i, nr_buckets;
  456. ht->es = es;
  457. nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
  458. ht->hash_bits = __ffs(nr_buckets);
  459. ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
  460. if (!ht->buckets)
  461. return -ENOMEM;
  462. for (i = 0; i < nr_buckets; i++)
  463. ht->buckets[i] = INDEXER_NULL;
  464. return 0;
  465. }
  466. static void h_exit(struct smq_hash_table *ht)
  467. {
  468. vfree(ht->buckets);
  469. }
  470. static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
  471. {
  472. return to_entry(ht->es, ht->buckets[bucket]);
  473. }
  474. static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
  475. {
  476. return to_entry(ht->es, e->hash_next);
  477. }
  478. static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
  479. {
  480. e->hash_next = ht->buckets[bucket];
  481. ht->buckets[bucket] = to_index(ht->es, e);
  482. }
  483. static void h_insert(struct smq_hash_table *ht, struct entry *e)
  484. {
  485. unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
  486. __h_insert(ht, h, e);
  487. }
  488. static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
  489. struct entry **prev)
  490. {
  491. struct entry *e;
  492. *prev = NULL;
  493. for (e = h_head(ht, h); e; e = h_next(ht, e)) {
  494. if (e->oblock == oblock)
  495. return e;
  496. *prev = e;
  497. }
  498. return NULL;
  499. }
  500. static void __h_unlink(struct smq_hash_table *ht, unsigned h,
  501. struct entry *e, struct entry *prev)
  502. {
  503. if (prev)
  504. prev->hash_next = e->hash_next;
  505. else
  506. ht->buckets[h] = e->hash_next;
  507. }
  508. /*
  509. * Also moves each entry to the front of the bucket.
  510. */
  511. static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
  512. {
  513. struct entry *e, *prev;
  514. unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
  515. e = __h_lookup(ht, h, oblock, &prev);
  516. if (e && prev) {
  517. /*
  518. * Move to the front because this entry is likely
  519. * to be hit again.
  520. */
  521. __h_unlink(ht, h, e, prev);
  522. __h_insert(ht, h, e);
  523. }
  524. return e;
  525. }
  526. static void h_remove(struct smq_hash_table *ht, struct entry *e)
  527. {
  528. unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
  529. struct entry *prev;
  530. /*
  531. * The down side of using a singly linked list is we have to
  532. * iterate the bucket to remove an item.
  533. */
  534. e = __h_lookup(ht, h, e->oblock, &prev);
  535. if (e)
  536. __h_unlink(ht, h, e, prev);
  537. }
  538. /*----------------------------------------------------------------*/
  539. struct entry_alloc {
  540. struct entry_space *es;
  541. unsigned begin;
  542. unsigned nr_allocated;
  543. struct ilist free;
  544. };
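/*
 * An allocator owns the sub-range [begin, end) of the shared entry space
 * and keeps unused entries on a free list.  alloc_particular_entry()
 * claims a specific slot, which is how a known cblock is repopulated when
 * mappings are loaded.
 */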
  545. static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
  546. unsigned begin, unsigned end)
  547. {
  548. unsigned i;
  549. ea->es = es;
  550. ea->nr_allocated = 0u;
  551. ea->begin = begin;
  552. l_init(&ea->free);
  553. for (i = begin; i != end; i++)
  554. l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
  555. }
  556. static void init_entry(struct entry *e)
  557. {
  558. /*
  559. * We can't memset because that would clear the hotspot and
  560. * sentinel bits which remain constant.
  561. */
  562. e->hash_next = INDEXER_NULL;
  563. e->next = INDEXER_NULL;
  564. e->prev = INDEXER_NULL;
  565. e->level = 0u;
  566. e->dirty = true; /* FIXME: audit */
  567. e->allocated = true;
  568. e->sentinel = false;
  569. e->pending_work = false;
  570. }
  571. static struct entry *alloc_entry(struct entry_alloc *ea)
  572. {
  573. struct entry *e;
  574. if (l_empty(&ea->free))
  575. return NULL;
  576. e = l_pop_tail(ea->es, &ea->free);
  577. init_entry(e);
  578. ea->nr_allocated++;
  579. return e;
  580. }
  581. /*
  582. * This assumes the cblock hasn't already been allocated.
  583. */
  584. static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
  585. {
  586. struct entry *e = __get_entry(ea->es, ea->begin + i);
  587. BUG_ON(e->allocated);
  588. l_del(ea->es, &ea->free, e);
  589. init_entry(e);
  590. ea->nr_allocated++;
  591. return e;
  592. }
  593. static void free_entry(struct entry_alloc *ea, struct entry *e)
  594. {
  595. BUG_ON(!ea->nr_allocated);
  596. BUG_ON(!e->allocated);
  597. ea->nr_allocated--;
  598. e->allocated = false;
  599. l_add_tail(ea->es, &ea->free, e);
  600. }
  601. static bool allocator_empty(struct entry_alloc *ea)
  602. {
  603. return l_empty(&ea->free);
  604. }
  605. static unsigned get_index(struct entry_alloc *ea, struct entry *e)
  606. {
  607. return to_index(ea->es, e) - ea->begin;
  608. }
  609. static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
  610. {
  611. return __get_entry(ea->es, ea->begin + index);
  612. }
  613. /*----------------------------------------------------------------*/
  614. #define NR_HOTSPOT_LEVELS 64u
  615. #define NR_CACHE_LEVELS 64u
  616. #define WRITEBACK_PERIOD (10ul * HZ)
  617. #define DEMOTE_PERIOD (60ul * HZ)
  618. #define HOTSPOT_UPDATE_PERIOD (HZ)
  619. #define CACHE_UPDATE_PERIOD (60ul * HZ)
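/*
 * All periods are in jiffies: the writeback and demote sentinels rotate
 * every 10s and 60s respectively, the hotspot queue is re-tuned every
 * second, and the cache queues every minute.
 */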
  620. struct smq_policy {
  621. struct dm_cache_policy policy;
  622. /* protects everything */
  623. spinlock_t lock;
  624. dm_cblock_t cache_size;
  625. sector_t cache_block_size;
  626. sector_t hotspot_block_size;
  627. unsigned nr_hotspot_blocks;
  628. unsigned cache_blocks_per_hotspot_block;
  629. unsigned hotspot_level_jump;
  630. struct entry_space es;
  631. struct entry_alloc writeback_sentinel_alloc;
  632. struct entry_alloc demote_sentinel_alloc;
  633. struct entry_alloc hotspot_alloc;
  634. struct entry_alloc cache_alloc;
  635. unsigned long *hotspot_hit_bits;
  636. unsigned long *cache_hit_bits;
  637. /*
  638. * We maintain three queues of entries. The cache proper,
  639. * consisting of a clean and dirty queue, containing the currently
  640. * active mappings. The hotspot queue uses a larger block size to
  641. * track blocks that are being hit frequently and potential
  642. * candidates for promotion to the cache.
  643. */
  644. struct queue hotspot;
  645. struct queue clean;
  646. struct queue dirty;
  647. struct stats hotspot_stats;
  648. struct stats cache_stats;
  649. /*
  650. * Keeps track of time, incremented by the core. We use this to
  651. * avoid attributing multiple hits within the same tick.
  652. */
  653. unsigned tick;
  654. /*
655. * The hash tables allow us to quickly find an entry by origin

  656. * block.
  657. */
  658. struct smq_hash_table table;
  659. struct smq_hash_table hotspot_table;
  660. bool current_writeback_sentinels;
  661. unsigned long next_writeback_period;
  662. bool current_demote_sentinels;
  663. unsigned long next_demote_period;
  664. unsigned write_promote_level;
  665. unsigned read_promote_level;
  666. unsigned long next_hotspot_period;
  667. unsigned long next_cache_period;
  668. struct background_tracker *bg_work;
  669. bool migrations_allowed;
  670. };
  671. /*----------------------------------------------------------------*/
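/*
 * Each of the dirty and clean queues carries two interleaved sets of
 * per-level sentinels.  update_sentinels() below periodically flips which
 * set is current and re-pushes it to the back of each level, so, roughly
 * speaking, the entries sitting between the head of a level and the first
 * sentinel have been idle for at least one period.
 */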
  672. static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
  673. {
  674. return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
  675. }
  676. static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
  677. {
  678. return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
  679. }
  680. static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
  681. {
  682. return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
  683. }
  684. static void __update_writeback_sentinels(struct smq_policy *mq)
  685. {
  686. unsigned level;
  687. struct queue *q = &mq->dirty;
  688. struct entry *sentinel;
  689. for (level = 0; level < q->nr_levels; level++) {
  690. sentinel = writeback_sentinel(mq, level);
  691. q_del(q, sentinel);
  692. q_push(q, sentinel);
  693. }
  694. }
  695. static void __update_demote_sentinels(struct smq_policy *mq)
  696. {
  697. unsigned level;
  698. struct queue *q = &mq->clean;
  699. struct entry *sentinel;
  700. for (level = 0; level < q->nr_levels; level++) {
  701. sentinel = demote_sentinel(mq, level);
  702. q_del(q, sentinel);
  703. q_push(q, sentinel);
  704. }
  705. }
  706. static void update_sentinels(struct smq_policy *mq)
  707. {
  708. if (time_after(jiffies, mq->next_writeback_period)) {
  709. mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
  710. mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
  711. __update_writeback_sentinels(mq);
  712. }
  713. if (time_after(jiffies, mq->next_demote_period)) {
  714. mq->next_demote_period = jiffies + DEMOTE_PERIOD;
  715. mq->current_demote_sentinels = !mq->current_demote_sentinels;
  716. __update_demote_sentinels(mq);
  717. }
  718. }
  719. static void __sentinels_init(struct smq_policy *mq)
  720. {
  721. unsigned level;
  722. struct entry *sentinel;
  723. for (level = 0; level < NR_CACHE_LEVELS; level++) {
  724. sentinel = writeback_sentinel(mq, level);
  725. sentinel->level = level;
  726. q_push(&mq->dirty, sentinel);
  727. sentinel = demote_sentinel(mq, level);
  728. sentinel->level = level;
  729. q_push(&mq->clean, sentinel);
  730. }
  731. }
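/*
 * sentinels_init() runs __sentinels_init() twice, toggling the current_*
 * flags in between, so that both sentinel sets start out on the queues.
 */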
  732. static void sentinels_init(struct smq_policy *mq)
  733. {
  734. mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
  735. mq->next_demote_period = jiffies + DEMOTE_PERIOD;
  736. mq->current_writeback_sentinels = false;
  737. mq->current_demote_sentinels = false;
  738. __sentinels_init(mq);
  739. mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
  740. mq->current_demote_sentinels = !mq->current_demote_sentinels;
  741. __sentinels_init(mq);
  742. }
  743. /*----------------------------------------------------------------*/
  744. static void del_queue(struct smq_policy *mq, struct entry *e)
  745. {
  746. q_del(e->dirty ? &mq->dirty : &mq->clean, e);
  747. }
  748. static void push_queue(struct smq_policy *mq, struct entry *e)
  749. {
  750. if (e->dirty)
  751. q_push(&mq->dirty, e);
  752. else
  753. q_push(&mq->clean, e);
  754. }
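/*
 * The single-letter annotations below track entry state: h = present in
 * the hash table, q = on the clean/dirty queues, a = allocated.
 */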
  755. // !h, !q, a -> h, q, a
  756. static void push(struct smq_policy *mq, struct entry *e)
  757. {
  758. h_insert(&mq->table, e);
  759. if (!e->pending_work)
  760. push_queue(mq, e);
  761. }
  762. static void push_queue_front(struct smq_policy *mq, struct entry *e)
  763. {
  764. if (e->dirty)
  765. q_push_front(&mq->dirty, e);
  766. else
  767. q_push_front(&mq->clean, e);
  768. }
  769. static void push_front(struct smq_policy *mq, struct entry *e)
  770. {
  771. h_insert(&mq->table, e);
  772. if (!e->pending_work)
  773. push_queue_front(mq, e);
  774. }
  775. static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
  776. {
  777. return to_cblock(get_index(&mq->cache_alloc, e));
  778. }
  779. static void requeue(struct smq_policy *mq, struct entry *e)
  780. {
  781. /*
  782. * Pending work has temporarily been taken out of the queues.
  783. */
  784. if (e->pending_work)
  785. return;
  786. if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
  787. if (!e->dirty) {
  788. q_requeue(&mq->clean, e, 1u, NULL, NULL);
  789. return;
  790. }
  791. q_requeue(&mq->dirty, e, 1u,
  792. get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
  793. get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
  794. }
  795. }
  796. static unsigned default_promote_level(struct smq_policy *mq)
  797. {
  798. /*
  799. * The promote level depends on the current performance of the
  800. * cache.
  801. *
  802. * If the cache is performing badly, then we can't afford
  803. * to promote much without causing performance to drop below that
  804. * of the origin device.
  805. *
  806. * If the cache is performing well, then we don't need to promote
  807. * much. If it isn't broken, don't fix it.
  808. *
  809. * If the cache is middling then we promote more.
  810. *
  811. * This scheme reminds me of a graph of entropy vs probability of a
  812. * binary variable.
  813. */
  814. static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
  815. unsigned hits = mq->cache_stats.hits;
  816. unsigned misses = mq->cache_stats.misses;
  817. unsigned index = safe_div(hits << 4u, hits + misses);
  818. return table[index];
  819. }
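/*
 * should_promote() compares a hotspot entry's level against
 * read/write_promote_level, which is NR_HOTSPOT_LEVELS minus the
 * threshold computed below.  A larger threshold therefore means promotion
 * triggers from cooler levels (more promotion); a poor hotspot assessment
 * shrinks the threshold so only the hottest blocks promote.
 */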
  820. static void update_promote_levels(struct smq_policy *mq)
  821. {
  822. /*
  823. * If there are unused cache entries then we want to be really
  824. * eager to promote.
  825. */
  826. unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
  827. default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
  828. threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
  829. /*
  830. * If the hotspot queue is performing badly then we have little
  831. * confidence that we know which blocks to promote. So we cut down
  832. * the amount of promotions.
  833. */
  834. switch (stats_assess(&mq->hotspot_stats)) {
  835. case Q_POOR:
  836. threshold_level /= 4u;
  837. break;
  838. case Q_FAIR:
  839. threshold_level /= 2u;
  840. break;
  841. case Q_WELL:
  842. break;
  843. }
  844. mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
  845. mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
  846. }
  847. /*
  848. * If the hotspot queue is performing badly, then we try and move entries
  849. * around more quickly.
  850. */
  851. static void update_level_jump(struct smq_policy *mq)
  852. {
  853. switch (stats_assess(&mq->hotspot_stats)) {
  854. case Q_POOR:
  855. mq->hotspot_level_jump = 4u;
  856. break;
  857. case Q_FAIR:
  858. mq->hotspot_level_jump = 2u;
  859. break;
  860. case Q_WELL:
  861. mq->hotspot_level_jump = 1u;
  862. break;
  863. }
  864. }
  865. static void end_hotspot_period(struct smq_policy *mq)
  866. {
  867. clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
  868. update_promote_levels(mq);
  869. if (time_after(jiffies, mq->next_hotspot_period)) {
  870. update_level_jump(mq);
  871. q_redistribute(&mq->hotspot);
  872. stats_reset(&mq->hotspot_stats);
  873. mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
  874. }
  875. }
  876. static void end_cache_period(struct smq_policy *mq)
  877. {
  878. if (time_after(jiffies, mq->next_cache_period)) {
  879. clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
  880. q_redistribute(&mq->dirty);
  881. q_redistribute(&mq->clean);
  882. stats_reset(&mq->cache_stats);
  883. mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
  884. }
  885. }
  886. /*----------------------------------------------------------------*/
  887. /*
  888. * Targets are given as a percentage.
  889. */
  890. #define CLEAN_TARGET 25u
  891. #define FREE_TARGET 25u
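/*
 * percent_to_target() converts these percentages into a block count,
 * e.g. a 10000 block cache gives a FREE_TARGET of 2500 blocks; queued
 * demotions are counted as free when checking the target.
 */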
  892. static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
  893. {
  894. return from_cblock(mq->cache_size) * p / 100u;
  895. }
  896. static bool clean_target_met(struct smq_policy *mq, bool idle)
  897. {
  898. /*
  899. * Cache entries may not be populated. So we cannot rely on the
  900. * size of the clean queue.
  901. */
  902. if (idle) {
  903. /*
  904. * We'd like to clean everything.
  905. */
  906. return q_size(&mq->dirty) == 0u;
  907. }
  908. /*
  909. * If we're busy we don't worry about cleaning at all.
  910. */
  911. return true;
  912. }
  913. static bool free_target_met(struct smq_policy *mq)
  914. {
  915. unsigned nr_free;
  916. nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
  917. return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
  918. percent_to_target(mq, FREE_TARGET);
  919. }
  920. /*----------------------------------------------------------------*/
  921. static void mark_pending(struct smq_policy *mq, struct entry *e)
  922. {
  923. BUG_ON(e->sentinel);
  924. BUG_ON(!e->allocated);
  925. BUG_ON(e->pending_work);
  926. e->pending_work = true;
  927. }
  928. static void clear_pending(struct smq_policy *mq, struct entry *e)
  929. {
  930. BUG_ON(!e->pending_work);
  931. e->pending_work = false;
  932. }
  933. static void queue_writeback(struct smq_policy *mq)
  934. {
  935. int r;
  936. struct policy_work work;
  937. struct entry *e;
  938. e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
  939. if (e) {
  940. mark_pending(mq, e);
  941. q_del(&mq->dirty, e);
  942. work.op = POLICY_WRITEBACK;
  943. work.oblock = e->oblock;
  944. work.cblock = infer_cblock(mq, e);
  945. r = btracker_queue(mq->bg_work, &work, NULL);
  946. WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
  947. }
  948. }
  949. static void queue_demotion(struct smq_policy *mq)
  950. {
  951. struct policy_work work;
  952. struct entry *e;
  953. if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
  954. return;
  955. e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
  956. if (!e) {
  957. if (!clean_target_met(mq, true))
  958. queue_writeback(mq);
  959. return;
  960. }
  961. mark_pending(mq, e);
  962. q_del(&mq->clean, e);
  963. work.op = POLICY_DEMOTE;
  964. work.oblock = e->oblock;
  965. work.cblock = infer_cblock(mq, e);
  966. btracker_queue(mq->bg_work, &work, NULL);
  967. }
  968. static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
  969. struct policy_work **workp)
  970. {
  971. struct entry *e;
  972. struct policy_work work;
  973. if (!mq->migrations_allowed)
  974. return;
  975. if (allocator_empty(&mq->cache_alloc)) {
  976. /*
  977. * We always claim to be 'idle' to ensure some demotions happen
  978. * with continuous loads.
  979. */
  980. if (!free_target_met(mq))
  981. queue_demotion(mq);
  982. return;
  983. }
  984. if (btracker_promotion_already_present(mq->bg_work, oblock))
  985. return;
  986. /*
  987. * We allocate the entry now to reserve the cblock. If the
  988. * background work is aborted we must remember to free it.
  989. */
  990. e = alloc_entry(&mq->cache_alloc);
  991. BUG_ON(!e);
  992. e->pending_work = true;
  993. work.op = POLICY_PROMOTE;
  994. work.oblock = oblock;
  995. work.cblock = infer_cblock(mq, e);
  996. btracker_queue(mq->bg_work, &work, workp);
  997. }
  998. /*----------------------------------------------------------------*/
  999. enum promote_result {
  1000. PROMOTE_NOT,
  1001. PROMOTE_TEMPORARY,
  1002. PROMOTE_PERMANENT
  1003. };
  1004. /*
  1005. * Converts a boolean into a promote result.
  1006. */
  1007. static enum promote_result maybe_promote(bool promote)
  1008. {
  1009. return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
  1010. }
  1011. static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
  1012. int data_dir, bool fast_promote)
  1013. {
  1014. if (data_dir == WRITE) {
  1015. if (!allocator_empty(&mq->cache_alloc) && fast_promote)
  1016. return PROMOTE_TEMPORARY;
  1017. return maybe_promote(hs_e->level >= mq->write_promote_level);
  1018. } else
  1019. return maybe_promote(hs_e->level >= mq->read_promote_level);
  1020. }
  1021. static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
  1022. {
  1023. sector_t r = from_oblock(b);
  1024. (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
  1025. return to_oblock(r);
  1026. }
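/*
 * A hotspot block covers cache_blocks_per_hotspot_block consecutive
 * origin blocks (up to 16 cache blocks, see calc_hotspot_params()), so
 * to_hblock() simply divides the origin block number.
 */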
  1027. static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
  1028. {
  1029. unsigned hi;
  1030. dm_oblock_t hb = to_hblock(mq, b);
  1031. struct entry *e = h_lookup(&mq->hotspot_table, hb);
  1032. if (e) {
  1033. stats_level_accessed(&mq->hotspot_stats, e->level);
  1034. hi = get_index(&mq->hotspot_alloc, e);
  1035. q_requeue(&mq->hotspot, e,
  1036. test_and_set_bit(hi, mq->hotspot_hit_bits) ?
  1037. 0u : mq->hotspot_level_jump,
  1038. NULL, NULL);
  1039. } else {
  1040. stats_miss(&mq->hotspot_stats);
  1041. e = alloc_entry(&mq->hotspot_alloc);
  1042. if (!e) {
  1043. e = q_pop(&mq->hotspot);
  1044. if (e) {
  1045. h_remove(&mq->hotspot_table, e);
  1046. hi = get_index(&mq->hotspot_alloc, e);
  1047. clear_bit(hi, mq->hotspot_hit_bits);
  1048. }
  1049. }
  1050. if (e) {
  1051. e->oblock = hb;
  1052. q_push(&mq->hotspot, e);
  1053. h_insert(&mq->hotspot_table, e);
  1054. }
  1055. }
  1056. return e;
  1057. }
  1058. /*----------------------------------------------------------------*/
  1059. /*
  1060. * Public interface, via the policy struct. See dm-cache-policy.h for a
  1061. * description of these.
  1062. */
  1063. static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
  1064. {
  1065. return container_of(p, struct smq_policy, policy);
  1066. }
  1067. static void smq_destroy(struct dm_cache_policy *p)
  1068. {
  1069. struct smq_policy *mq = to_smq_policy(p);
  1070. btracker_destroy(mq->bg_work);
  1071. h_exit(&mq->hotspot_table);
  1072. h_exit(&mq->table);
  1073. free_bitset(mq->hotspot_hit_bits);
  1074. free_bitset(mq->cache_hit_bits);
  1075. space_exit(&mq->es);
  1076. kfree(mq);
  1077. }
  1078. /*----------------------------------------------------------------*/
  1079. static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
  1080. int data_dir, bool fast_copy,
  1081. struct policy_work **work, bool *background_work)
  1082. {
  1083. struct entry *e, *hs_e;
  1084. enum promote_result pr;
  1085. *background_work = false;
  1086. e = h_lookup(&mq->table, oblock);
  1087. if (e) {
  1088. stats_level_accessed(&mq->cache_stats, e->level);
  1089. requeue(mq, e);
  1090. *cblock = infer_cblock(mq, e);
  1091. return 0;
  1092. } else {
  1093. stats_miss(&mq->cache_stats);
  1094. /*
  1095. * The hotspot queue only gets updated with misses.
  1096. */
  1097. hs_e = update_hotspot_queue(mq, oblock);
  1098. pr = should_promote(mq, hs_e, data_dir, fast_copy);
  1099. if (pr != PROMOTE_NOT) {
  1100. queue_promotion(mq, oblock, work);
  1101. *background_work = true;
  1102. }
  1103. return -ENOENT;
  1104. }
  1105. }
  1106. static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
  1107. int data_dir, bool fast_copy,
  1108. bool *background_work)
  1109. {
  1110. int r;
  1111. unsigned long flags;
  1112. struct smq_policy *mq = to_smq_policy(p);
  1113. spin_lock_irqsave(&mq->lock, flags);
  1114. r = __lookup(mq, oblock, cblock,
  1115. data_dir, fast_copy,
  1116. NULL, background_work);
  1117. spin_unlock_irqrestore(&mq->lock, flags);
  1118. return r;
  1119. }
  1120. static int smq_lookup_with_work(struct dm_cache_policy *p,
  1121. dm_oblock_t oblock, dm_cblock_t *cblock,
  1122. int data_dir, bool fast_copy,
  1123. struct policy_work **work)
  1124. {
  1125. int r;
  1126. bool background_queued;
  1127. unsigned long flags;
  1128. struct smq_policy *mq = to_smq_policy(p);
  1129. spin_lock_irqsave(&mq->lock, flags);
  1130. r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
  1131. spin_unlock_irqrestore(&mq->lock, flags);
  1132. return r;
  1133. }
  1134. static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
  1135. struct policy_work **result)
  1136. {
  1137. int r;
  1138. unsigned long flags;
  1139. struct smq_policy *mq = to_smq_policy(p);
  1140. spin_lock_irqsave(&mq->lock, flags);
  1141. r = btracker_issue(mq->bg_work, result);
  1142. if (r == -ENODATA) {
  1143. if (!clean_target_met(mq, idle)) {
  1144. queue_writeback(mq);
  1145. r = btracker_issue(mq->bg_work, result);
  1146. }
  1147. }
  1148. spin_unlock_irqrestore(&mq->lock, flags);
  1149. return r;
  1150. }
  1151. /*
  1152. * We need to clear any pending work flags that have been set, and in the
  1153. * case of promotion free the entry for the destination cblock.
  1154. */
  1155. static void __complete_background_work(struct smq_policy *mq,
  1156. struct policy_work *work,
  1157. bool success)
  1158. {
  1159. struct entry *e = get_entry(&mq->cache_alloc,
  1160. from_cblock(work->cblock));
  1161. switch (work->op) {
  1162. case POLICY_PROMOTE:
  1163. // !h, !q, a
  1164. clear_pending(mq, e);
  1165. if (success) {
  1166. e->oblock = work->oblock;
  1167. e->level = NR_CACHE_LEVELS - 1;
  1168. push(mq, e);
  1169. // h, q, a
  1170. } else {
  1171. free_entry(&mq->cache_alloc, e);
  1172. // !h, !q, !a
  1173. }
  1174. break;
  1175. case POLICY_DEMOTE:
  1176. // h, !q, a
  1177. if (success) {
  1178. h_remove(&mq->table, e);
  1179. free_entry(&mq->cache_alloc, e);
  1180. // !h, !q, !a
  1181. } else {
  1182. clear_pending(mq, e);
  1183. push_queue(mq, e);
  1184. // h, q, a
  1185. }
  1186. break;
  1187. case POLICY_WRITEBACK:
  1188. // h, !q, a
  1189. clear_pending(mq, e);
  1190. push_queue(mq, e);
  1191. // h, q, a
  1192. break;
  1193. }
  1194. btracker_complete(mq->bg_work, work);
  1195. }
  1196. static void smq_complete_background_work(struct dm_cache_policy *p,
  1197. struct policy_work *work,
  1198. bool success)
  1199. {
  1200. unsigned long flags;
  1201. struct smq_policy *mq = to_smq_policy(p);
  1202. spin_lock_irqsave(&mq->lock, flags);
  1203. __complete_background_work(mq, work, success);
  1204. spin_unlock_irqrestore(&mq->lock, flags);
  1205. }
  1206. // in_hash(oblock) -> in_hash(oblock)
  1207. static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
  1208. {
  1209. struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
  1210. if (e->pending_work)
  1211. e->dirty = set;
  1212. else {
  1213. del_queue(mq, e);
  1214. e->dirty = set;
  1215. push_queue(mq, e);
  1216. }
  1217. }
  1218. static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
  1219. {
  1220. unsigned long flags;
  1221. struct smq_policy *mq = to_smq_policy(p);
  1222. spin_lock_irqsave(&mq->lock, flags);
  1223. __smq_set_clear_dirty(mq, cblock, true);
  1224. spin_unlock_irqrestore(&mq->lock, flags);
  1225. }
  1226. static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
  1227. {
  1228. struct smq_policy *mq = to_smq_policy(p);
  1229. unsigned long flags;
  1230. spin_lock_irqsave(&mq->lock, flags);
  1231. __smq_set_clear_dirty(mq, cblock, false);
  1232. spin_unlock_irqrestore(&mq->lock, flags);
  1233. }
  1234. static unsigned random_level(dm_cblock_t cblock)
  1235. {
  1236. return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
  1237. }
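/*
 * When a mapping is loaded without a valid hint, random_level() hashes
 * the cblock down to a level in [0, NR_CACHE_LEVELS) so that loaded
 * entries are spread across the levels rather than all piling into one.
 */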
  1238. static int smq_load_mapping(struct dm_cache_policy *p,
  1239. dm_oblock_t oblock, dm_cblock_t cblock,
  1240. bool dirty, uint32_t hint, bool hint_valid)
  1241. {
  1242. struct smq_policy *mq = to_smq_policy(p);
  1243. struct entry *e;
  1244. e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
  1245. e->oblock = oblock;
  1246. e->dirty = dirty;
  1247. e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
  1248. e->pending_work = false;
  1249. /*
  1250. * When we load mappings we push ahead of both sentinels in order to
  1251. * allow demotions and cleaning to occur immediately.
  1252. */
  1253. push_front(mq, e);
  1254. return 0;
  1255. }
  1256. static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
  1257. {
  1258. struct smq_policy *mq = to_smq_policy(p);
  1259. struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
  1260. if (!e->allocated)
  1261. return -ENODATA;
  1262. // FIXME: what if this block has pending background work?
  1263. del_queue(mq, e);
  1264. h_remove(&mq->table, e);
  1265. free_entry(&mq->cache_alloc, e);
  1266. return 0;
  1267. }
  1268. static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
  1269. {
  1270. struct smq_policy *mq = to_smq_policy(p);
  1271. struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
  1272. if (!e->allocated)
  1273. return 0;
  1274. return e->level;
  1275. }
  1276. static dm_cblock_t smq_residency(struct dm_cache_policy *p)
  1277. {
  1278. dm_cblock_t r;
  1279. unsigned long flags;
  1280. struct smq_policy *mq = to_smq_policy(p);
  1281. spin_lock_irqsave(&mq->lock, flags);
  1282. r = to_cblock(mq->cache_alloc.nr_allocated);
  1283. spin_unlock_irqrestore(&mq->lock, flags);
  1284. return r;
  1285. }
  1286. static void smq_tick(struct dm_cache_policy *p, bool can_block)
  1287. {
  1288. struct smq_policy *mq = to_smq_policy(p);
  1289. unsigned long flags;
  1290. spin_lock_irqsave(&mq->lock, flags);
  1291. mq->tick++;
  1292. update_sentinels(mq);
  1293. end_hotspot_period(mq);
  1294. end_cache_period(mq);
  1295. spin_unlock_irqrestore(&mq->lock, flags);
  1296. }
  1297. static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
  1298. {
  1299. struct smq_policy *mq = to_smq_policy(p);
  1300. mq->migrations_allowed = allow;
  1301. }
  1302. /*
  1303. * smq has no config values, but the old mq policy did. To avoid breaking
  1304. * software we continue to accept these configurables for the mq policy,
  1305. * but they have no effect.
  1306. */
  1307. static int mq_set_config_value(struct dm_cache_policy *p,
  1308. const char *key, const char *value)
  1309. {
  1310. unsigned long tmp;
  1311. if (kstrtoul(value, 10, &tmp))
  1312. return -EINVAL;
  1313. if (!strcasecmp(key, "random_threshold") ||
  1314. !strcasecmp(key, "sequential_threshold") ||
  1315. !strcasecmp(key, "discard_promote_adjustment") ||
  1316. !strcasecmp(key, "read_promote_adjustment") ||
  1317. !strcasecmp(key, "write_promote_adjustment")) {
  1318. DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
  1319. return 0;
  1320. }
  1321. return -EINVAL;
  1322. }
  1323. static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
  1324. unsigned maxlen, ssize_t *sz_ptr)
  1325. {
  1326. ssize_t sz = *sz_ptr;
  1327. DMEMIT("10 random_threshold 0 "
  1328. "sequential_threshold 0 "
  1329. "discard_promote_adjustment 0 "
  1330. "read_promote_adjustment 0 "
  1331. "write_promote_adjustment 0 ");
  1332. *sz_ptr = sz;
  1333. return 0;
  1334. }
  1335. /* Init the policy plugin interface function pointers. */
  1336. static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
  1337. {
  1338. mq->policy.destroy = smq_destroy;
  1339. mq->policy.lookup = smq_lookup;
  1340. mq->policy.lookup_with_work = smq_lookup_with_work;
  1341. mq->policy.get_background_work = smq_get_background_work;
  1342. mq->policy.complete_background_work = smq_complete_background_work;
  1343. mq->policy.set_dirty = smq_set_dirty;
  1344. mq->policy.clear_dirty = smq_clear_dirty;
  1345. mq->policy.load_mapping = smq_load_mapping;
  1346. mq->policy.invalidate_mapping = smq_invalidate_mapping;
  1347. mq->policy.get_hint = smq_get_hint;
  1348. mq->policy.residency = smq_residency;
  1349. mq->policy.tick = smq_tick;
  1350. mq->policy.allow_migrations = smq_allow_migrations;
  1351. if (mimic_mq) {
  1352. mq->policy.set_config_value = mq_set_config_value;
  1353. mq->policy.emit_config_values = mq_emit_config_values;
  1354. }
  1355. }
  1356. static bool too_many_hotspot_blocks(sector_t origin_size,
  1357. sector_t hotspot_block_size,
  1358. unsigned nr_hotspot_blocks)
  1359. {
  1360. return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
  1361. }
  1362. static void calc_hotspot_params(sector_t origin_size,
  1363. sector_t cache_block_size,
  1364. unsigned nr_cache_blocks,
  1365. sector_t *hotspot_block_size,
  1366. unsigned *nr_hotspot_blocks)
  1367. {
  1368. *hotspot_block_size = cache_block_size * 16u;
  1369. *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
  1370. while ((*hotspot_block_size > cache_block_size) &&
  1371. too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
  1372. *hotspot_block_size /= 2u;
  1373. }
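/*
 * Hotspot blocks start at 16x the cache block size and are halved while
 * their total coverage would exceed the origin, never dropping below the
 * cache block size.  The number of hotspot blocks is a quarter of the
 * cache blocks, with a floor of 1024.
 */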
  1374. static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
  1375. sector_t origin_size,
  1376. sector_t cache_block_size,
  1377. bool mimic_mq,
  1378. bool migrations_allowed)
  1379. {
  1380. unsigned i;
  1381. unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
  1382. unsigned total_sentinels = 2u * nr_sentinels_per_queue;
  1383. struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
  1384. if (!mq)
  1385. return NULL;
  1386. init_policy_functions(mq, mimic_mq);
  1387. mq->cache_size = cache_size;
  1388. mq->cache_block_size = cache_block_size;
  1389. calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
  1390. &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
  1391. mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
  1392. mq->hotspot_level_jump = 1u;
  1393. if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
  1394. DMERR("couldn't initialize entry space");
  1395. goto bad_pool_init;
  1396. }
  1397. init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
  1398. for (i = 0; i < nr_sentinels_per_queue; i++)
  1399. get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
  1400. init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
  1401. for (i = 0; i < nr_sentinels_per_queue; i++)
  1402. get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
  1403. init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
  1404. total_sentinels + mq->nr_hotspot_blocks);
  1405. init_allocator(&mq->cache_alloc, &mq->es,
  1406. total_sentinels + mq->nr_hotspot_blocks,
  1407. total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
  1408. mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
  1409. if (!mq->hotspot_hit_bits) {
  1410. DMERR("couldn't allocate hotspot hit bitset");
  1411. goto bad_hotspot_hit_bits;
  1412. }
  1413. clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
  1414. if (from_cblock(cache_size)) {
  1415. mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
  1416. if (!mq->cache_hit_bits) {
  1417. DMERR("couldn't allocate cache hit bitset");
  1418. goto bad_cache_hit_bits;
  1419. }
  1420. clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
  1421. } else
  1422. mq->cache_hit_bits = NULL;
  1423. mq->tick = 0;
  1424. spin_lock_init(&mq->lock);
  1425. q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
  1426. mq->hotspot.nr_top_levels = 8;
  1427. mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
  1428. from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
  1429. q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
  1430. q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
  1431. stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
  1432. stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
  1433. if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
  1434. goto bad_alloc_table;
  1435. if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
  1436. goto bad_alloc_hotspot_table;
  1437. sentinels_init(mq);
  1438. mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
  1439. mq->next_hotspot_period = jiffies;
  1440. mq->next_cache_period = jiffies;
  1441. mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
  1442. if (!mq->bg_work)
  1443. goto bad_btracker;
  1444. mq->migrations_allowed = migrations_allowed;
  1445. return &mq->policy;
  1446. bad_btracker:
  1447. h_exit(&mq->hotspot_table);
  1448. bad_alloc_hotspot_table:
  1449. h_exit(&mq->table);
  1450. bad_alloc_table:
  1451. free_bitset(mq->cache_hit_bits);
  1452. bad_cache_hit_bits:
  1453. free_bitset(mq->hotspot_hit_bits);
  1454. bad_hotspot_hit_bits:
  1455. space_exit(&mq->es);
  1456. bad_pool_init:
  1457. kfree(mq);
  1458. return NULL;
  1459. }
  1460. static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
  1461. sector_t origin_size,
  1462. sector_t cache_block_size)
  1463. {
  1464. return __smq_create(cache_size, origin_size, cache_block_size, false, true);
  1465. }
  1466. static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
  1467. sector_t origin_size,
  1468. sector_t cache_block_size)
  1469. {
  1470. return __smq_create(cache_size, origin_size, cache_block_size, true, true);
  1471. }
  1472. static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
  1473. sector_t origin_size,
  1474. sector_t cache_block_size)
  1475. {
  1476. return __smq_create(cache_size, origin_size, cache_block_size, false, false);
  1477. }
  1478. /*----------------------------------------------------------------*/
  1479. static struct dm_cache_policy_type smq_policy_type = {
  1480. .name = "smq",
  1481. .version = {2, 0, 0},
  1482. .hint_size = 4,
  1483. .owner = THIS_MODULE,
  1484. .create = smq_create
  1485. };
  1486. static struct dm_cache_policy_type mq_policy_type = {
  1487. .name = "mq",
  1488. .version = {2, 0, 0},
  1489. .hint_size = 4,
  1490. .owner = THIS_MODULE,
  1491. .create = mq_create,
  1492. };
  1493. static struct dm_cache_policy_type cleaner_policy_type = {
  1494. .name = "cleaner",
  1495. .version = {2, 0, 0},
  1496. .hint_size = 4,
  1497. .owner = THIS_MODULE,
  1498. .create = cleaner_create,
  1499. };
  1500. static struct dm_cache_policy_type default_policy_type = {
  1501. .name = "default",
  1502. .version = {2, 0, 0},
  1503. .hint_size = 4,
  1504. .owner = THIS_MODULE,
  1505. .create = smq_create,
  1506. .real = &smq_policy_type
  1507. };
  1508. static int __init smq_init(void)
  1509. {
  1510. int r;
  1511. r = dm_cache_policy_register(&smq_policy_type);
  1512. if (r) {
  1513. DMERR("register failed %d", r);
  1514. return -ENOMEM;
  1515. }
  1516. r = dm_cache_policy_register(&mq_policy_type);
  1517. if (r) {
  1518. DMERR("register failed (as mq) %d", r);
  1519. goto out_mq;
  1520. }
  1521. r = dm_cache_policy_register(&cleaner_policy_type);
  1522. if (r) {
  1523. DMERR("register failed (as cleaner) %d", r);
  1524. goto out_cleaner;
  1525. }
  1526. r = dm_cache_policy_register(&default_policy_type);
  1527. if (r) {
  1528. DMERR("register failed (as default) %d", r);
  1529. goto out_default;
  1530. }
  1531. return 0;
  1532. out_default:
  1533. dm_cache_policy_unregister(&cleaner_policy_type);
  1534. out_cleaner:
  1535. dm_cache_policy_unregister(&mq_policy_type);
  1536. out_mq:
  1537. dm_cache_policy_unregister(&smq_policy_type);
  1538. return -ENOMEM;
  1539. }
  1540. static void __exit smq_exit(void)
  1541. {
  1542. dm_cache_policy_unregister(&cleaner_policy_type);
  1543. dm_cache_policy_unregister(&smq_policy_type);
  1544. dm_cache_policy_unregister(&mq_policy_type);
  1545. dm_cache_policy_unregister(&default_policy_type);
  1546. }
  1547. module_init(smq_init);
  1548. module_exit(smq_exit);
  1549. MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
  1550. MODULE_LICENSE("GPL");
  1551. MODULE_DESCRIPTION("smq cache policy");
  1552. MODULE_ALIAS("dm-cache-default");
  1553. MODULE_ALIAS("dm-cache-mq");
  1554. MODULE_ALIAS("dm-cache-cleaner");