kyber-iosched.c

/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
        KYBER_READ,
        KYBER_SYNC_WRITE,
        KYBER_OTHER, /* Async writes, discard, etc. */
        KYBER_NUM_DOMAINS,
};

enum {
        KYBER_MIN_DEPTH = 256,

        /*
         * In order to prevent starvation of synchronous requests by a flood of
         * asynchronous requests, we reserve 25% of requests for synchronous
         * operations.
         */
        KYBER_ASYNC_PERCENT = 75,
};

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
        [KYBER_READ] = 256,
        [KYBER_SYNC_WRITE] = 128,
        [KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
        [KYBER_READ] = 16,
        [KYBER_SYNC_WRITE] = 8,
        [KYBER_OTHER] = 8,
};

/*
 * There is the same mapping between ctx & hctx and kcq & khd; we use
 * request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
        /*
         * Used to ensure that operations on rq_list and kcq_map are atomic.
         * Also protects the requests on rq_list during merging.
         */
        spinlock_t lock;
        struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
        struct request_queue *q;

        struct blk_stat_callback *cb;

        /*
         * The device is divided into multiple scheduling domains based on the
         * request type. Each domain has a fixed number of in-flight requests
         * of that type device-wide, limited by these tokens.
         */
        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

        /*
         * Async request percentage, converted to per-word depth for
         * sbitmap_get_shallow().
         */
        unsigned int async_depth;

        /* Target latencies in nanoseconds. */
        u64 read_lat_nsec, write_lat_nsec;
};

struct kyber_hctx_data {
        spinlock_t lock;
        struct list_head rqs[KYBER_NUM_DOMAINS];
        unsigned int cur_domain;
        unsigned int batching;
        struct kyber_ctx_queue *kcqs;
        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
        atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
        if ((op & REQ_OP_MASK) == REQ_OP_READ)
                return KYBER_READ;
        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
                return KYBER_SYNC_WRITE;
        else
                return KYBER_OTHER;
}
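
/*
 * For example: a plain read (REQ_OP_READ) maps to KYBER_READ; an O_DSYNC
 * write, which carries REQ_SYNC, maps to KYBER_SYNC_WRITE; and anything
 * else, such as REQ_OP_DISCARD or a buffered writeback write, falls into
 * KYBER_OTHER.
 */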

enum {
        NONE = 0,
        GOOD = 1,
        GREAT = 2,
        BAD = -1,
        AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
                            unsigned int sched_domain, u64 target)
{
        u64 latency;

        if (!cb->stat[sched_domain].nr_samples)
                return NONE;

        latency = cb->stat[sched_domain].mean;
        if (latency >= 2 * target)
                return AWFUL;
        else if (latency > target)
                return BAD;
        else if (latency <= target / 2)
                return GREAT;
        else /* (latency <= target) */
                return GOOD;
}
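
/*
 * For example, with the default 2 ms read target: a mean read latency of
 * 5 ms is AWFUL (>= 2 * target), 3 ms is BAD (> target), 0.8 ms is GREAT
 * (<= target / 2), and 1.5 ms is GOOD.
 */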

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
                                  unsigned int sched_domain, int this_status,
                                  int other_status)
{
        unsigned int orig_depth, depth;

        /*
         * If this domain had no samples, or reads and writes are both good or
         * both bad, don't adjust the depth.
         */
        if (this_status == NONE ||
            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
            (IS_BAD(this_status) && IS_BAD(other_status)))
                return;

        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

        if (other_status == NONE) {
                depth++;
        } else {
                switch (this_status) {
                case GOOD:
                        if (other_status == AWFUL)
                                depth -= max(depth / 4, 1U);
                        else
                                depth -= max(depth / 8, 1U);
                        break;
                case GREAT:
                        if (other_status == AWFUL)
                                depth /= 2;
                        else
                                depth -= max(depth / 4, 1U);
                        break;
                case BAD:
                        depth++;
                        break;
                case AWFUL:
                        if (other_status == GREAT)
                                depth += 2;
                        else
                                depth++;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
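
/*
 * For example, if reads are GREAT while sync writes are AWFUL, the read
 * token depth is halved (e.g. 256 -> 128) so that writes can catch up; if a
 * domain is BAD while the other is doing fine, its depth is bumped by one
 * token per evaluation. The result is always clamped to
 * [1, kyber_depth[domain]].
 */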

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
                                     int read_status, int write_status,
                                     bool have_samples)
{
        unsigned int orig_depth, depth;
        int status;

        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

        if (read_status == NONE && write_status == NONE) {
                depth += 2;
        } else if (have_samples) {
                if (read_status == NONE)
                        status = write_status;
                else if (write_status == NONE)
                        status = read_status;
                else
                        status = max(read_status, write_status);

                switch (status) {
                case GREAT:
                        depth += 2;
                        break;
                case GOOD:
                        depth++;
                        break;
                case BAD:
                        depth -= max(depth / 4, 1U);
                        break;
                case AWFUL:
                        depth /= 2;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
        struct kyber_queue_data *kqd = cb->data;
        int read_status, write_status;

        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
        kyber_adjust_other_depth(kqd, read_status, write_status,
                                 cb->stat[KYBER_OTHER].nr_samples != 0);

        /*
         * Continue monitoring latencies if we aren't hitting the targets or
         * we're still throttling other requests.
         */
        if (!blk_stat_is_active(kqd->cb) &&
            ((IS_BAD(read_status) || IS_BAD(write_status) ||
              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
                blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
        /*
         * All of the hardware queues have the same depth, so we can just grab
         * the shift of the first one.
         */
        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static int kyber_bucket_fn(const struct request *rq)
{
        return kyber_sched_domain(rq->cmd_flags);
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
        struct kyber_queue_data *kqd;
        unsigned int max_tokens;
        unsigned int shift;
        int ret = -ENOMEM;
        int i;

        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
        if (!kqd)
                goto err;
        kqd->q = q;

        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
                                          KYBER_NUM_DOMAINS, kqd);
        if (!kqd->cb)
                goto err_kqd;

        /*
         * The maximum number of tokens for any scheduling domain is at least
         * the queue depth of a single hardware queue. If the hardware doesn't
         * have many tags, still provide a reasonable number.
         */
        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
                           KYBER_MIN_DEPTH);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                WARN_ON(!kyber_depth[i]);
                WARN_ON(!kyber_batch_size[i]);
                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
                                              max_tokens, -1, false, GFP_KERNEL,
                                              q->node);
                if (ret) {
                        while (--i >= 0)
                                sbitmap_queue_free(&kqd->domain_tokens[i]);
                        goto err_cb;
                }
                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
        }

        shift = kyber_sched_tags_shift(kqd);
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
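        /*
         * For example, with a sched_tags shift of 6 (64 tags per sbitmap
         * word), async_depth = 64 * 75 / 100 = 48, so asynchronous requests
         * may only use roughly the first 48 tags of each word via
         * sbitmap_get_shallow().
         */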
        kqd->read_lat_nsec = 2000000ULL;
        kqd->write_lat_nsec = 10000000ULL;

        return kqd;

err_cb:
        blk_stat_free_callback(kqd->cb);
err_kqd:
        kfree(kqd);
err:
        return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct kyber_queue_data *kqd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        kqd = kyber_queue_data_alloc(q);
        if (IS_ERR(kqd)) {
                kobject_put(&eq->kobj);
                return PTR_ERR(kqd);
        }

        eq->elevator_data = kqd;
        q->elevator = eq;

        blk_stat_add_callback(q, kqd->cb);

        return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
        struct kyber_queue_data *kqd = e->elevator_data;
        struct request_queue *q = kqd->q;
        int i;

        blk_stat_remove_callback(q, kqd->cb);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_queue_free(&kqd->domain_tokens[i]);
        blk_stat_free_callback(kqd->cb);
        kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
        unsigned int i;

        spin_lock_init(&kcq->lock);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd;
        int i;

        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
        if (!khd)
                return -ENOMEM;

        khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
                                       sizeof(struct kyber_ctx_queue),
                                       GFP_KERNEL, hctx->numa_node);
        if (!khd->kcqs)
                goto err_khd;

        for (i = 0; i < hctx->nr_ctx; i++)
                kyber_ctx_queue_init(&khd->kcqs[i]);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
                                      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
                        while (--i >= 0)
                                sbitmap_free(&khd->kcq_map[i]);
                        goto err_kcqs;
                }
        }

        spin_lock_init(&khd->lock);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                INIT_LIST_HEAD(&khd->rqs[i]);
                init_waitqueue_func_entry(&khd->domain_wait[i],
                                          kyber_domain_wake);
                khd->domain_wait[i].private = hctx;
                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
                atomic_set(&khd->wait_index[i], 0);
        }

        khd->cur_domain = 0;
        khd->batching = 0;

        hctx->sched_data = khd;
        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
                                        kqd->async_depth);

        return 0;

err_kcqs:
        kfree(khd->kcqs);
err_khd:
        kfree(khd);
        return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_free(&khd->kcq_map[i]);
        kfree(khd->kcqs);
        kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
        return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
        rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
                                  struct request *rq)
{
        unsigned int sched_domain;
        int nr;

        nr = rq_get_domain_token(rq);
        if (nr != -1) {
                sched_domain = kyber_sched_domain(rq->cmd_flags);
                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
                                    rq->mq_ctx->cpu);
        }
}

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
        /*
         * We use the scheduler tags as per-hardware queue queueing tokens.
         * Async requests can be limited at this stage.
         */
        if (!op_is_sync(op)) {
                struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

                data->shallow_depth = kqd->async_depth;
        }
}
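
/*
 * Note: only asynchronous requests are limited here; synchronous requests
 * keep the full scheduler tag depth, which is how the 25% reservation
 * implied by KYBER_ASYNC_PERCENT is enforced at tag-allocation time.
 */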

static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
        unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
        struct list_head *rq_list = &kcq->rq_list[sched_domain];
        bool merged;

        spin_lock(&kcq->lock);
        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
        spin_unlock(&kcq->lock);
        blk_mq_put_ctx(ctx);

        return merged;
}

static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
        rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
                                  struct list_head *rq_list, bool at_head)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq, *next;

        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
                struct list_head *head = &kcq->rq_list[sched_domain];

                spin_lock(&kcq->lock);
                if (at_head)
                        list_move(&rq->queuelist, head);
                else
                        list_move_tail(&rq->queuelist, head);
                sbitmap_set_bit(&khd->kcq_map[sched_domain],
                                rq->mq_ctx->index_hw);
                blk_mq_sched_request_inserted(rq);
                spin_unlock(&kcq->lock);
        }
}

static void kyber_finish_request(struct request *rq)
{
        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

        rq_clear_domain_token(kqd, rq);
}

static void kyber_completed_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;
        unsigned int sched_domain;
        u64 now, latency, target;

        /*
         * Check if this request met our latency goal. If not, quickly gather
         * some statistics and start throttling.
         */
        sched_domain = kyber_sched_domain(rq->cmd_flags);
        switch (sched_domain) {
        case KYBER_READ:
                target = kqd->read_lat_nsec;
                break;
        case KYBER_SYNC_WRITE:
                target = kqd->write_lat_nsec;
                break;
        default:
                return;
        }

        /* If we are already monitoring latencies, don't check again. */
        if (blk_stat_is_active(kqd->cb))
                return;

        now = ktime_get_ns();
        if (now < rq->io_start_time_ns)
                return;

        latency = now - rq->io_start_time_ns;

        if (latency > target)
                blk_stat_activate_msecs(kqd->cb, 10);
}

struct flush_kcq_data {
        struct kyber_hctx_data *khd;
        unsigned int sched_domain;
        struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_kcq_data *flush_data = data;
        struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

        spin_lock(&kcq->lock);
        list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
                              flush_data->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&kcq->lock);

        return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
                                  unsigned int sched_domain,
                                  struct list_head *list)
{
        struct flush_kcq_data data = {
                .khd = khd,
                .sched_domain = sched_domain,
                .list = list,
        };

        sbitmap_for_each_set(&khd->kcq_map[sched_domain],
                             flush_busy_kcq, &data);
}

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key)
{
        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

        list_del_init(&wait->entry);
        blk_mq_run_hw_queue(hctx, true);
        return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
                                  struct kyber_hctx_data *khd,
                                  struct blk_mq_hw_ctx *hctx)
{
        unsigned int sched_domain = khd->cur_domain;
        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
        struct sbq_wait_state *ws;
        int nr;

        nr = __sbitmap_queue_get(domain_tokens);

        /*
         * If we failed to get a domain token, make sure the hardware queue is
         * run when one becomes available. Note that this is serialized on
         * khd->lock, but we still need to be careful about the waker.
         */
        if (nr < 0 && list_empty_careful(&wait->entry)) {
                ws = sbq_wait_ptr(domain_tokens,
                                  &khd->wait_index[sched_domain]);
                khd->domain_ws[sched_domain] = ws;
                add_wait_queue(&ws->wait, wait);

                /*
                 * Try again in case a token was freed before we got on the
                 * wait queue.
                 */
                nr = __sbitmap_queue_get(domain_tokens);
        }

        /*
         * If we got a token while we were on the wait queue, remove ourselves
         * from the wait queue to ensure that all wake ups make forward
         * progress. It's possible that the waker already deleted the entry
         * between the !list_empty_careful() check and us grabbing the lock,
         * but list_del_init() is okay with that.
         */
        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
                ws = khd->domain_ws[sched_domain];
                spin_lock_irq(&ws->wait.lock);
                list_del_init(&wait->entry);
                spin_unlock_irq(&ws->wait.lock);
        }

        return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                          struct kyber_hctx_data *khd,
                          struct blk_mq_hw_ctx *hctx)
{
        struct list_head *rqs;
        struct request *rq;
        int nr;

        rqs = &khd->rqs[khd->cur_domain];

        /*
         * If we already have a flushed request, then we just need to get a
         * token for it. Otherwise, if there are pending requests in the kcqs,
         * flush the kcqs, but only if we can get a token. If not, we should
         * leave the requests in the kcqs so that they can be merged. Note that
         * khd->lock serializes the flushes, so if we observed any bit set in
         * the kcq_map, we will always get a request.
         */
        rq = list_first_entry_or_null(rqs, struct request, queuelist);
        if (rq) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                }
        } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
                        rq = list_first_entry(rqs, struct request, queuelist);
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                }
        }

        /* There were either no pending requests or no tokens. */
        return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq;
        int i;

        spin_lock(&khd->lock);

        /*
         * First, if we are still entitled to batch, try to dispatch a request
         * from the batch.
         */
        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        /*
         * Either,
         * 1. We were no longer entitled to a batch.
         * 2. The domain we were batching didn't have any requests.
         * 3. The domain we were batching was out of tokens.
         *
         * Start another batch. Note that this wraps back around to the
         * original domain if no other domains have requests or tokens.
         */
        khd->batching = 0;
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
                        khd->cur_domain = 0;
                else
                        khd->cur_domain++;

                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        rq = NULL;
out:
        spin_unlock(&khd->lock);
        return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (!list_empty_careful(&khd->rqs[i]) ||
                    sbitmap_any_bit_set(&khd->kcq_map[i]))
                        return true;
        }

        return false;
}

#define KYBER_LAT_SHOW_STORE(op)                                        \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,         \
                                     char *page)                       \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;               \
                                                                        \
        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);            \
}                                                                       \
                                                                        \
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,        \
                                      const char *page, size_t count)  \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;               \
        unsigned long long nsec;                                        \
        int ret;                                                        \
                                                                        \
        ret = kstrtoull(page, 10, &nsec);                               \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        kqd->op##_lat_nsec = nsec;                                      \
                                                                        \
        return count;                                                   \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
        KYBER_LAT_ATTR(read),
        KYBER_LAT_ATTR(write),
        __ATTR_NULL
};
#undef KYBER_LAT_ATTR
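
/*
 * Example usage: the latency targets are exposed per device through sysfs
 * (the device name below is only a placeholder), e.g.
 *
 *   cat /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 *   echo 1000000 > /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 *
 * would tighten the read target from the 2 ms default to 1 ms.
 */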

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)  \
{                                                                       \
        struct request_queue *q = data;                                 \
        struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
                                                                        \
        sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
        return 0;                                                       \
}                                                                       \
                                                                        \
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
        __acquires(&khd->lock)                                          \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        spin_lock(&khd->lock);                                          \
        return seq_list_start(&khd->rqs[domain], *pos);                 \
}                                                                       \
                                                                        \
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,      \
                                     loff_t *pos)                       \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        return seq_list_next(v, &khd->rqs[domain], pos);                \
}                                                                       \
                                                                        \
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)       \
        __releases(&khd->lock)                                          \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        spin_unlock(&khd->lock);                                        \
}                                                                       \
                                                                        \
static const struct seq_operations kyber_##name##_rqs_seq_ops = {      \
        .start  = kyber_##name##_rqs_start,                             \
        .next   = kyber_##name##_rqs_next,                              \
        .stop   = kyber_##name##_rqs_stop,                              \
        .show   = blk_mq_debugfs_rq_show,                               \
};                                                                      \
                                                                        \
static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = data;                              \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
        wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
                                                                        \
        seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
        return 0;                                                       \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", kqd->async_depth);
        return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        switch (khd->cur_domain) {
        case KYBER_READ:
                seq_puts(m, "READ\n");
                break;
        case KYBER_SYNC_WRITE:
                seq_puts(m, "SYNC_WRITE\n");
                break;
        case KYBER_OTHER:
                seq_puts(m, "OTHER\n");
                break;
        default:
                seq_printf(m, "%u\n", khd->cur_domain);
                break;
        }
        return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        seq_printf(m, "%u\n", khd->batching);
        return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)                                  \
        {#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
        KYBER_QUEUE_DOMAIN_ATTRS(read),
        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
        KYBER_QUEUE_DOMAIN_ATTRS(other),
        {"async_depth", 0400, kyber_async_depth_show},
        {},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
        {#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
        KYBER_HCTX_DOMAIN_ATTRS(read),
        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
        KYBER_HCTX_DOMAIN_ATTRS(other),
        {"cur_domain", 0400, kyber_cur_domain_show},
        {"batching", 0400, kyber_batching_show},
        {},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif

static struct elevator_type kyber_sched = {
        .ops.mq = {
                .init_sched = kyber_init_sched,
                .exit_sched = kyber_exit_sched,
                .init_hctx = kyber_init_hctx,
                .exit_hctx = kyber_exit_hctx,
                .limit_depth = kyber_limit_depth,
                .bio_merge = kyber_bio_merge,
                .prepare_request = kyber_prepare_request,
                .insert_requests = kyber_insert_requests,
                .finish_request = kyber_finish_request,
                .requeue_request = kyber_finish_request,
                .completed_request = kyber_completed_request,
                .dispatch_request = kyber_dispatch_request,
                .has_work = kyber_has_work,
        },
        .uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
        .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
        .elevator_attrs = kyber_sched_attrs,
        .elevator_name = "kyber",
        .elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
        return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
        elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");