blk-mq-sched.c

/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
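
/*
 * Free the scheduler-private data attached to each hardware queue, calling
 * the supplied exit callback first when one is given and data is present.
 */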
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
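
/*
 * Attach the io_cq for the submitting task's io_context to the request so
 * that icq-based schedulers (e.g. BFQ) can find it. Look up an existing icq
 * under the queue lock and create one if none exists; creation may fail
 * under memory pressure, in which case the request simply gets no icq.
 */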
void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. The flag is tested and
 * cleared by blk_mq_sched_restart() once the driver frees up resources.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
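
/*
 * If a restart was marked on this hardware queue, clear the flag and re-run
 * the queue asynchronously so requests held back by the scheduler or the
 * dispatch list get another chance to be issued.
 */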
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	blk_mq_run_hw_queue(hctx, true);
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (e->type->ops.mq.has_work &&
				!e->type->ops.mq.has_work(hctx))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}
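
/*
 * Return the software queue that follows @ctx on this hardware queue,
 * wrapping back to the first one; used for round-robin dispatch.
 */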
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned idx = ctx->index_hw;

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	do {
		struct request *rq;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	WRITE_ONCE(hctx->dispatch_from, ctx);
}
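
/*
 * Main dispatch entry for a hardware queue: drain any leftover requests on
 * hctx->dispatch first, then pull new work from the I/O scheduler, from the
 * software queues one at a time if the device is busy, or from all software
 * queues at once when running without a scheduler.
 */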
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue request one by one from sw queue if queue is busy */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}
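
/*
 * Ask the elevator whether @bio can be merged into an existing request. On
 * a successful back or front merge, also try to merge the grown request
 * with its neighbour; if that succeeds, *merged_request is set so the
 * caller can free the request that was merged away.
 */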
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		return merged;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	lockdep_assert_held(&ctx->lock);

	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
		ctx->rq_merged++;
		return true;
	}

	return false;
}
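
/*
 * Try to merge @bio before allocating a new request: defer to the
 * scheduler's ->bio_merge() hook when one is set, otherwise fall back to a
 * plain software-queue merge if the driver advertises BLK_MQ_F_SHOULD_MERGE.
 */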
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.mq.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
			!list_empty_careful(&ctx->rq_list)) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
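
/*
 * Decide whether a request should skip the scheduler on insert. Requests
 * that are part of a flush sequence go straight onto hctx->dispatch;
 * everything else is handed to the scheduler (and marked RQF_SORTED when
 * one is active).
 */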
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/* dispatch flush rq directly */
	if (rq->rq_flags & RQF_FLUSH_SEQ) {
		spin_lock(&hctx->lock);
		list_add(&rq->queuelist, &hctx->dispatch);
		spin_unlock(&hctx->lock);
		return true;
	}

	if (has_sched)
		rq->rq_flags |= RQF_SORTED;

	return false;
}
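
/*
 * Insert a single request: flush requests are routed through the flush
 * machinery or placed directly on the dispatch list, while ordinary
 * requests go to the scheduler's ->insert_requests() hook or, without a
 * scheduler, into the per-CPU software queue. Optionally kick the hardware
 * queue afterwards.
 */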
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	/* flush rq in flush machinery need to be dispatched directly */
	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
		blk_insert_flush(rq);
		goto run;
	}

	WARN_ON(e && (rq->tag != -1));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}
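
/*
 * Insert a list of requests that all map to one software queue. With a
 * scheduler present the whole list is handed to ->insert_requests(); with
 * "none" the requests may be issued directly to the driver while the
 * hardware queue is not busy, falling back to the software queue otherwise.
 */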
void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else {
		/*
		 * try to issue requests directly if the hw queue isn't
		 * busy in case of 'none' scheduler, and this way may save
		 * us one extra enqueue & dequeue to sw queue.
		 */
		if (!hctx->dispatch_busy && !e && !run_queue_async) {
			blk_mq_try_issue_list_directly(hctx, list);
			if (list_empty(list))
				return;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
}
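
/*
 * Each hardware queue carries a private set of scheduler tags sized by
 * q->nr_requests; the helpers below allocate and free those tag maps and
 * the requests backing them.
 */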
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}
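
/*
 * Attach elevator @e to @q: allocate scheduler tags for every hardware
 * queue, call the elevator's ->init_sched() and per-hctx ->init_hctx()
 * hooks, and register the debugfs attributes. A NULL @e selects "none" and
 * simply sizes nr_requests from the tag set's queue depth.
 */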
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to double of smaller one between hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.mq.init_hctx) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}
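
/*
 * Detach elevator @e from @q: undo blk_mq_init_sched() by unregistering the
 * debugfs entries, running the per-hctx and queue-wide exit hooks, and
 * freeing the scheduler tag maps.
 */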
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
			e->type->ops.mq.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}