blk-mq-tag.c

/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
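
/*
 * Return true if @tags still has free tags in the regular (non-reserved)
 * bitmap, or if there is no tag map at all.
 */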
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
        if (!tags)
                return true;

        return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
}

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even if
 * the first allocation attempt fails, the other shared-tag users still
 * reserve budget for this queue.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
            !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                atomic_inc(&hctx->tags->active_queues);

        return true;
}

/*
 * Wake up all tasks potentially sleeping on tags.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (include_reserve)
                sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->tags;

        if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return;

        atomic_dec(&tags->active_queues);

        blk_mq_tag_wakeup_all(tags, false);
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
                                  struct sbitmap_queue *bt)
{
        unsigned int depth, users;

        if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
                return true;
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return true;

        /*
         * Don't try dividing an ant
         */
        if (bt->sb.depth == 1)
                return true;

        users = atomic_read(&hctx->tags->active_queues);
        if (!users)
                return true;

        /*
         * Allow at least some tags
         */
        depth = max((bt->sb.depth + users - 1) / users, 4U);
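        /*
         * Worked example (illustrative): with a shared bitmap depth of 256
         * and 3 active queues, depth = max((256 + 3 - 1) / 3, 4) = 86, so
         * this hctx may queue as long as it has fewer than 86 requests in
         * flight. The 4U floor only matters for very shallow tag maps or
         * very many active queues.
         */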
        return atomic_read(&hctx->nr_active) < depth;
}
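
/*
 * Try to grab a free bit from @bt. Returns the bit number on success, or -1
 * if the caller has exceeded its fair share of a shared tag map or no bit is
 * currently free.
 */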
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
            !hctx_may_queue(data->hctx, bt))
                return -1;
        if (data->shallow_depth)
                return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
                return __sbitmap_queue_get(bt);
}
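
/*
 * Allocate a tag for @data. Unless BLK_MQ_REQ_NOWAIT is set, this may sleep:
 * the caller is put on the sbitmap wait queue and retried until a tag frees
 * up, re-resolving the hardware queue after each sleep since the task may
 * have migrated to another CPU.
 */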
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_WAIT(wait);
        unsigned int tag_offset;
        bool drop_ctx;
        int tag;

        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_TAG_FAIL;
                }
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }
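
        /*
         * The value returned to the caller is the bitmap index plus
         * tag_offset, so reserved tags occupy [0, nr_reserved_tags) of the
         * tag space and regular tags occupy [nr_reserved_tags, nr_tags).
         */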
        tag = __blk_mq_get_tag(data, bt);
        if (tag != -1)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_TAG_FAIL;

        ws = bt_wait_ptr(bt, data->hctx);
        drop_ctx = data->ctx == NULL;
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                prepare_to_wait_exclusive(&ws->wait, &wait,
                                          TASK_UNINTERRUPTIBLE);

                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                if (data->ctx)
                        blk_mq_put_ctx(data->ctx);

                bt_prev = bt;
                io_schedule();

                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = &tags->breserved_tags;
                else
                        bt = &tags->bitmap_tags;

                finish_wait(&ws->wait, &wait);

                /*
                 * If the destination hw queue changed, issue a fake wake-up
                 * on the previous queue to compensate for the possibly
                 * missed wake-up, so other allocations on the previous
                 * queue won't be starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        if (drop_ctx && data->ctx)
                blk_mq_put_ctx(data->ctx);

        finish_wait(&ws->wait, &wait);

found_tag:
        return tag + tag_offset;
}
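
/*
 * Release a tag previously returned by blk_mq_get_tag(). @tag is the
 * queue-wide value (including the reserved-tag offset), so it is converted
 * back to a bitmap index before the bit is cleared.
 */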
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
                    struct blk_mq_ctx *ctx, unsigned int tag)
{
        if (!blk_mq_tag_is_reserved(tags, tag)) {
                const int real_tag = tag - tags->nr_reserved_tags;

                BUG_ON(real_tag >= tags->nr_tags);
                sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
                sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        }
}

struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
        void *data;
        bool reserved;
};

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct blk_mq_tags *tags = hctx->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;
        rq = tags->rqs[bitnr];

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        if (rq && rq->q == hctx->queue)
                iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
}
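
/*
 * Call @fn for every in-flight request whose tag bit is set in @bt and that
 * belongs to @hctx's queue, passing through @data and whether the tag came
 * from the reserved pool.
 */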
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
{
        struct bt_iter_data iter_data = {
                .hctx = hctx,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
        struct blk_mq_tags *tags;
        busy_tag_iter_fn *fn;
        void *data;
        bool reserved;
};

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *iter_data = data;
        struct blk_mq_tags *tags = iter_data->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        rq = tags->rqs[bitnr];
        if (rq && blk_mq_request_started(rq))
                iter_data->fn(rq, iter_data->data, reserved);
        return true;
}
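
/*
 * Call @fn for every started request whose tag bit is set in @bt. Unlike
 * bt_for_each(), this walks a tag map directly and does not need a hardware
 * context.
 */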
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, bool reserved)
{
        struct bt_tags_iter_data iter_data = {
                .tags = tags,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        if (tags->rqs)
                sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
                busy_tag_iter_fn *fn, void *priv)
{
        if (tags->nr_reserved_tags)
                bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
        bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}

void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
{
        int i;

        for (i = 0; i < tagset->nr_hw_queues; i++) {
                if (tagset->tags && tagset->tags[i])
                        blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
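
/*
 * Example use of blk_mq_tagset_busy_iter() above (illustrative sketch only,
 * not part of this file): a driver doing error recovery can walk all started
 * requests in its tag set and complete or cancel them, e.g.:
 *
 *      static void my_cancel_rq(struct request *rq, void *data, bool reserved)
 *      {
 *              blk_mq_complete_request(rq);
 *      }
 *
 *      blk_mq_tagset_busy_iter(&dev->tag_set, my_cancel_rq, NULL);
 *
 * "my_cancel_rq" and "dev" are hypothetical names; drivers such as nvme use
 * this pattern to cancel in-flight requests during controller teardown.
 */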

void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and
         * queue_hw_ctx after freezing the queue, so we use q_usage_counter
         * to avoid racing with it.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        queue_for_each_hw_ctx(q, hctx, i) {
                struct blk_mq_tags *tags = hctx->tags;

                /*
                 * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check.
                 */
                if (!blk_mq_hw_queue_mapped(hctx))
                        continue;

                if (tags->nr_reserved_tags)
                        bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
                bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
                                       node);
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                                                   int node, int alloc_policy)
{
        unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
        bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

        if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
                goto free_tags;
        if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
                     node))
                goto free_bitmap_tags;

        return tags;
free_bitmap_tags:
        sbitmap_queue_free(&tags->bitmap_tags);
free_tags:
        kfree(tags);
        return NULL;
}
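
/*
 * Allocate a tag map with @total_tags tags in total, @reserved_tags of which
 * are set aside for BLK_MQ_REQ_RESERVED allocations. On failure the partially
 * constructed map is freed and NULL is returned.
 */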
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, int alloc_policy)
{
        struct blk_mq_tags *tags;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        tags->nr_tags = total_tags;
        tags->nr_reserved_tags = reserved_tags;

        return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
        sbitmap_queue_free(&tags->bitmap_tags);
        sbitmap_queue_free(&tags->breserved_tags);
        kfree(tags);
}
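
/*
 * Change the queue depth of @tagsptr. Shrinking (or growing within the
 * originally allocated size) only resizes the regular bitmap; growing beyond
 * the original size requires @can_grow and allocates a brand new rq map and
 * request pool before freeing the old ones.
 */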
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
         */
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                struct blk_mq_tags *new;
                bool ret;

                if (!can_grow)
                        return -EINVAL;

                /*
                 * We need some sort of upper limit, set it high enough that
                 * no valid use cases should require more.
                 */
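                /*
                 * (Note: with BLKDEV_MAX_RQ at its usual value of 128, this
                 * works out to a cap of 16 * 128 = 2048 tags.)
                 */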
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;

                new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
                                tags->nr_reserved_tags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                if (ret) {
                        blk_mq_free_rq_map(new);
                        return -ENOMEM;
                }

                blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
                blk_mq_free_rq_map(*tagsptr);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(&tags->bitmap_tags,
                                tdepth - tags->nr_reserved_tags);
        }

        return 0;
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function that returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct blk_mq_hw_ctx *hctx;
        int hwq = 0;

        if (q->mq_ops) {
                hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
                hwq = hctx->queue_num;
        }

        return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
                (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
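
/*
 * Worked example for blk_mq_unique_tag() (illustrative, assuming the usual
 * BLK_MQ_UNIQUE_TAG_BITS of 16): a request with tag 5 on hardware queue 2
 * encodes to (2 << 16) | 5 = 0x00020005. The blk_mq_unique_tag_to_hwq() and
 * blk_mq_unique_tag_to_tag() helpers recover the two halves.
 */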