/* null_blk.c */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
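
/*
 * Per-command, per-queue and per-device state. A nullb_cmd is carried either
 * as the blk-mq per-request payload, via rq->special, or alongside a bio,
 * depending on the selected queue_mode.
 */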
struct nullb_cmd {
        struct list_head list;
        struct llist_node ll_list;
        struct call_single_data csd;
        struct request *rq;
        struct bio *bio;
        unsigned int tag;
        struct nullb_queue *nq;
};

struct nullb_queue {
        unsigned long *tag_map;
        wait_queue_head_t wait;
        unsigned int queue_depth;

        struct nullb_cmd *cmds;
};

struct nullb {
        struct list_head list;
        unsigned int index;
        struct request_queue *q;
        struct gendisk *disk;
        struct blk_mq_tag_set tag_set;
        struct hrtimer timer;
        unsigned int queue_depth;
        spinlock_t lock;

        struct nullb_queue *queues;
        unsigned int nr_queues;
};

static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static int nullb_indexes;

struct completion_queue {
        struct llist_head list;
        struct hrtimer timer;
};

/*
 * These are per-cpu for now, they will need to be configured by the
 * complete_queues parameter and appropriately mapped.
 */
static DEFINE_PER_CPU(struct completion_queue, completion_queues);

enum {
        NULL_IRQ_NONE    = 0,
        NULL_IRQ_SOFTIRQ = 1,
        NULL_IRQ_TIMER   = 2,
};

enum {
        NULL_Q_BIO = 0,
        NULL_Q_RQ  = 1,
        NULL_Q_MQ  = 2,
};

static int submit_queues;
module_param(submit_queues, int, S_IRUGO);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int home_node = NUMA_NO_NODE;
module_param(home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");

static int queue_mode = NULL_Q_MQ;

static int null_param_store_val(const char *str, int *val, int min, int max)
{
        int ret, new_val;

        ret = kstrtoint(str, 10, &new_val);
        if (ret)
                return -EINVAL;

        if (new_val < min || new_val > max)
                return -EINVAL;

        *val = new_val;
        return 0;
}

static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
        return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}

static const struct kernel_param_ops null_queue_mode_param_ops = {
        .set = null_set_queue_mode,
        .get = param_get_int,
};

device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");

static int gb = 250;
module_param(gb, int, S_IRUGO);
MODULE_PARM_DESC(gb, "Size in GB");

static int bs = 512;
module_param(bs, int, S_IRUGO);
MODULE_PARM_DESC(bs, "Block size (in bytes)");

static int nr_devices = 2;
module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");

static int irqmode = NULL_IRQ_SOFTIRQ;

static int null_set_irqmode(const char *str, const struct kernel_param *kp)
{
        return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
                                    NULL_IRQ_TIMER);
}

static const struct kernel_param_ops null_irqmode_param_ops = {
        .set = null_set_irqmode,
        .get = param_get_int,
};

device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");

static int completion_nsec = 10000;
module_param(completion_nsec, int, S_IRUGO);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");

static int hw_queue_depth = 64;
module_param(hw_queue_depth, int, S_IRUGO);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");

static bool use_per_node_hctx = false;
module_param(use_per_node_hctx, bool, S_IRUGO);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
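
/*
 * Tag management: each nullb_queue tracks free command slots in a bitmap
 * (tag_map). put_tag() releases a slot with clear_bit_unlock() and wakes any
 * task sleeping in alloc_cmd(); get_tag() claims a slot with
 * test_and_set_bit_lock() and returns -1U when the queue is full.
 */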
static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
        clear_bit_unlock(tag, nq->tag_map);

        if (waitqueue_active(&nq->wait))
                wake_up(&nq->wait);
}

static unsigned int get_tag(struct nullb_queue *nq)
{
        unsigned int tag;

        do {
                tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
                if (tag >= nq->queue_depth)
                        return -1U;
        } while (test_and_set_bit_lock(tag, nq->tag_map));

        return tag;
}

static void free_cmd(struct nullb_cmd *cmd)
{
        put_tag(cmd->nq, cmd->tag);
}

static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
{
        struct nullb_cmd *cmd;
        unsigned int tag;

        tag = get_tag(nq);
        if (tag != -1U) {
                cmd = &nq->cmds[tag];
                cmd->tag = tag;
                cmd->nq = nq;
                return cmd;
        }

        return NULL;
}

static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
{
        struct nullb_cmd *cmd;
        DEFINE_WAIT(wait);

        cmd = __alloc_cmd(nq);
        if (cmd || !can_wait)
                return cmd;

        do {
                prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
                cmd = __alloc_cmd(nq);
                if (cmd)
                        break;

                io_schedule();
        } while (1);

        finish_wait(&nq->wait, &wait);
        return cmd;
}
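
/*
 * Completion paths. end_cmd() finishes the I/O according to queue_mode
 * (blk-mq, legacy request or bio) and releases the driver tag on the
 * non-blk-mq paths. With irqmode == NULL_IRQ_TIMER, commands are queued on a
 * per-CPU llist and completed from null_cmd_timer_expired() when the
 * hrtimer fires.
 */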
static void end_cmd(struct nullb_cmd *cmd)
{
        switch (queue_mode) {
        case NULL_Q_MQ:
                blk_mq_end_request(cmd->rq, 0);
                return;
        case NULL_Q_RQ:
                INIT_LIST_HEAD(&cmd->rq->queuelist);
                blk_end_request_all(cmd->rq, 0);
                break;
        case NULL_Q_BIO:
                bio_endio(cmd->bio, 0);
                break;
        }

        free_cmd(cmd);
}

static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
        struct completion_queue *cq;
        struct llist_node *entry;
        struct nullb_cmd *cmd;

        cq = &per_cpu(completion_queues, smp_processor_id());

        while ((entry = llist_del_all(&cq->list)) != NULL) {
                entry = llist_reverse_order(entry);
                do {
                        cmd = container_of(entry, struct nullb_cmd, ll_list);
                        entry = entry->next;
                        end_cmd(cmd);

                        if (cmd->rq) {
                                struct request_queue *q = cmd->rq->q;

                                if (!q->mq_ops && blk_queue_stopped(q)) {
                                        spin_lock(q->queue_lock);
                                        if (blk_queue_stopped(q))
                                                blk_start_queue(q);
                                        spin_unlock(q->queue_lock);
                                }
                        }
                } while (entry);
        }

        return HRTIMER_NORESTART;
}

static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
        struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());

        cmd->ll_list.next = NULL;
        if (llist_add(&cmd->ll_list, &cq->list)) {
                ktime_t kt = ktime_set(0, completion_nsec);

                hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED);
        }

        put_cpu();
}

static void null_softirq_done_fn(struct request *rq)
{
        if (queue_mode == NULL_Q_MQ)
                end_cmd(blk_mq_rq_to_pdu(rq));
        else
                end_cmd(rq->special);
}

static inline void null_handle_cmd(struct nullb_cmd *cmd)
{
        /* Complete IO by inline, softirq or timer */
        switch (irqmode) {
        case NULL_IRQ_SOFTIRQ:
                switch (queue_mode) {
                case NULL_Q_MQ:
                        blk_mq_complete_request(cmd->rq);
                        break;
                case NULL_Q_RQ:
                        blk_complete_request(cmd->rq);
                        break;
                case NULL_Q_BIO:
                        /*
                         * XXX: no proper submitting cpu information available.
                         */
                        end_cmd(cmd);
                        break;
                }
                break;
        case NULL_IRQ_NONE:
                end_cmd(cmd);
                break;
        case NULL_IRQ_TIMER:
                null_cmd_end_timer(cmd);
                break;
        }
}
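
/*
 * I/O submission paths. nullb_to_queue() spreads submitting CPUs evenly
 * across the device's queues; null_queue_bio(), null_rq_prep_fn() with
 * null_request_fn(), and null_queue_rq() implement the bio, legacy request
 * and blk-mq front ends respectively.
 */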
static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
{
        int index = 0;

        if (nullb->nr_queues != 1)
                index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);

        return &nullb->queues[index];
}

static void null_queue_bio(struct request_queue *q, struct bio *bio)
{
        struct nullb *nullb = q->queuedata;
        struct nullb_queue *nq = nullb_to_queue(nullb);
        struct nullb_cmd *cmd;

        cmd = alloc_cmd(nq, 1);
        cmd->bio = bio;

        null_handle_cmd(cmd);
}

static int null_rq_prep_fn(struct request_queue *q, struct request *req)
{
        struct nullb *nullb = q->queuedata;
        struct nullb_queue *nq = nullb_to_queue(nullb);
        struct nullb_cmd *cmd;

        cmd = alloc_cmd(nq, 0);
        if (cmd) {
                cmd->rq = req;
                req->special = cmd;
                return BLKPREP_OK;
        }
        blk_stop_queue(q);

        return BLKPREP_DEFER;
}

static void null_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL) {
                struct nullb_cmd *cmd = rq->special;

                spin_unlock_irq(q->queue_lock);
                null_handle_cmd(cmd);
                spin_lock_irq(q->queue_lock);
        }
}

static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
{
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);

        cmd->rq = bd->rq;
        cmd->nq = hctx->driver_data;

        blk_mq_start_request(bd->rq);

        null_handle_cmd(cmd);
        return BLK_MQ_RQ_QUEUE_OK;
}

static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
        BUG_ON(!nullb);
        BUG_ON(!nq);

        init_waitqueue_head(&nq->wait);
        nq->queue_depth = nullb->queue_depth;
}

static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
                          unsigned int index)
{
        struct nullb *nullb = data;
        struct nullb_queue *nq = &nullb->queues[index];

        hctx->driver_data = nq;
        null_init_queue(nullb, nq);
        nullb->nr_queues++;

        return 0;
}

static struct blk_mq_ops null_mq_ops = {
        .queue_rq  = null_queue_rq,
        .map_queue = blk_mq_map_queue,
        .init_hctx = null_init_hctx,
        .complete  = null_softirq_done_fn,
};

static void null_del_dev(struct nullb *nullb)
{
        list_del_init(&nullb->list);

        del_gendisk(nullb->disk);
        blk_cleanup_queue(nullb->q);
        if (queue_mode == NULL_Q_MQ)
                blk_mq_free_tag_set(&nullb->tag_set);
        put_disk(nullb->disk);
        kfree(nullb);
}

static int null_open(struct block_device *bdev, fmode_t mode)
{
        return 0;
}

static void null_release(struct gendisk *disk, fmode_t mode)
{
}

static const struct block_device_operations null_fops = {
        .owner   = THIS_MODULE,
        .open    = null_open,
        .release = null_release,
};

static int setup_commands(struct nullb_queue *nq)
{
        struct nullb_cmd *cmd;
        int i, tag_size;

        nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL);
        if (!nq->cmds)
                return -ENOMEM;

        tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
        nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL);
        if (!nq->tag_map) {
                kfree(nq->cmds);
                return -ENOMEM;
        }

        for (i = 0; i < nq->queue_depth; i++) {
                cmd = &nq->cmds[i];
                INIT_LIST_HEAD(&cmd->list);
                cmd->ll_list.next = NULL;
                cmd->tag = -1U;
        }

        return 0;
}

static void cleanup_queue(struct nullb_queue *nq)
{
        kfree(nq->tag_map);
        kfree(nq->cmds);
}

static void cleanup_queues(struct nullb *nullb)
{
        int i;

        for (i = 0; i < nullb->nr_queues; i++)
                cleanup_queue(&nullb->queues[i]);

        kfree(nullb->queues);
}

static int setup_queues(struct nullb *nullb)
{
        nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
                                GFP_KERNEL);
        if (!nullb->queues)
                return -ENOMEM;

        nullb->nr_queues = 0;
        nullb->queue_depth = hw_queue_depth;

        return 0;
}

static int init_driver_queues(struct nullb *nullb)
{
        struct nullb_queue *nq;
        int i, ret = 0;

        for (i = 0; i < submit_queues; i++) {
                nq = &nullb->queues[i];

                null_init_queue(nullb, nq);

                ret = setup_commands(nq);
                if (ret)
                        return ret;
                nullb->nr_queues++;
        }
        return 0;
}
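
/*
 * Allocate and register a single null block device. The request queue is
 * created according to queue_mode: blk-mq (NULL_Q_MQ), bio based
 * (NULL_Q_BIO) or the legacy request_fn path (NULL_Q_RQ). The disk capacity
 * is derived from the gb and bs module parameters.
 */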
static int null_add_dev(void)
{
        struct gendisk *disk;
        struct nullb *nullb;
        sector_t size;
        int rv;

        nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
        if (!nullb) {
                rv = -ENOMEM;
                goto out;
        }

        spin_lock_init(&nullb->lock);

        if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
                submit_queues = nr_online_nodes;

        rv = setup_queues(nullb);
        if (rv)
                goto out_free_nullb;

        if (queue_mode == NULL_Q_MQ) {
                nullb->tag_set.ops = &null_mq_ops;
                nullb->tag_set.nr_hw_queues = submit_queues;
                nullb->tag_set.queue_depth = hw_queue_depth;
                nullb->tag_set.numa_node = home_node;
                nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
                nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
                nullb->tag_set.driver_data = nullb;

                rv = blk_mq_alloc_tag_set(&nullb->tag_set);
                if (rv)
                        goto out_cleanup_queues;

                nullb->q = blk_mq_init_queue(&nullb->tag_set);
                if (IS_ERR(nullb->q)) {
                        rv = -ENOMEM;
                        goto out_cleanup_tags;
                }
        } else if (queue_mode == NULL_Q_BIO) {
                nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
                if (!nullb->q) {
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                }
                blk_queue_make_request(nullb->q, null_queue_bio);
                rv = init_driver_queues(nullb);
                if (rv)
                        goto out_cleanup_blk_queue;
        } else {
                nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
                if (!nullb->q) {
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                }
                blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
                blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
                rv = init_driver_queues(nullb);
                if (rv)
                        goto out_cleanup_blk_queue;
        }

        nullb->q->queuedata = nullb;
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
        queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);

        disk = nullb->disk = alloc_disk_node(1, home_node);
        if (!disk) {
                rv = -ENOMEM;
                goto out_cleanup_blk_queue;
        }

        mutex_lock(&lock);
        list_add_tail(&nullb->list, &nullb_list);
        nullb->index = nullb_indexes++;
        mutex_unlock(&lock);

        blk_queue_logical_block_size(nullb->q, bs);
        blk_queue_physical_block_size(nullb->q, bs);

        size = gb * 1024 * 1024 * 1024ULL;
        sector_div(size, bs);
        set_capacity(disk, size);

        disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
        disk->major = null_major;
        disk->first_minor = nullb->index;
        disk->fops = &null_fops;
        disk->private_data = nullb;
        disk->queue = nullb->q;
        sprintf(disk->disk_name, "nullb%d", nullb->index);
        add_disk(disk);
        return 0;

out_cleanup_blk_queue:
        blk_cleanup_queue(nullb->q);
out_cleanup_tags:
        if (queue_mode == NULL_Q_MQ)
                blk_mq_free_tag_set(&nullb->tag_set);
out_cleanup_queues:
        cleanup_queues(nullb);
out_free_nullb:
        kfree(nullb);
out:
        return rv;
}
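
/*
 * Module init/exit: register the block major, set up the per-CPU completion
 * queues (hrtimers are armed only when irqmode == NULL_IRQ_TIMER) and create
 * nr_devices nullb instances; null_exit() tears them all down again.
 */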
static int __init null_init(void)
{
        unsigned int i;

        if (bs > PAGE_SIZE) {
                pr_warn("null_blk: invalid block size\n");
                pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
                bs = PAGE_SIZE;
        }

        if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
                if (submit_queues < nr_online_nodes) {
                        pr_warn("null_blk: submit_queues param is set to %u.",
                                nr_online_nodes);
                        submit_queues = nr_online_nodes;
                }
        } else if (submit_queues > nr_cpu_ids)
                submit_queues = nr_cpu_ids;
        else if (!submit_queues)
                submit_queues = 1;

        mutex_init(&lock);

        /* Initialize a separate list for each CPU for issuing softirqs */
        for_each_possible_cpu(i) {
                struct completion_queue *cq = &per_cpu(completion_queues, i);

                init_llist_head(&cq->list);

                if (irqmode != NULL_IRQ_TIMER)
                        continue;

                hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cq->timer.function = null_cmd_timer_expired;
        }

        null_major = register_blkdev(0, "nullb");
        if (null_major < 0)
                return null_major;

        for (i = 0; i < nr_devices; i++) {
                if (null_add_dev()) {
                        unregister_blkdev(null_major, "nullb");
                        return -EINVAL;
                }
        }

        pr_info("null: module loaded\n");
        return 0;
}

static void __exit null_exit(void)
{
        struct nullb *nullb;

        unregister_blkdev(null_major, "nullb");

        mutex_lock(&lock);
        while (!list_empty(&nullb_list)) {
                nullb = list_entry(nullb_list.next, struct nullb, list);
                null_del_dev(nullb);
        }
        mutex_unlock(&lock);
}

module_init(null_init);
module_exit(null_exit);

MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
MODULE_LICENSE("GPL");