blk-zoned.c

/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>

static inline sector_t blk_zone_start(struct request_queue *q,
                                      sector_t sector)
{
        sector_t zone_mask = blk_queue_zone_sectors(q) - 1;

        return sector & ~zone_mask;
}
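
/*
 * Illustrative note (not part of the original file): zone sizes are expected
 * to be a power of two, so masking with ~zone_mask rounds a sector down to
 * the start of its zone. For example, with a zone size of 524288 sectors
 * (256 MiB with 512-byte sectors), blk_zone_start(q, 600000) == 524288.
 */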
/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
        if (!rq->q->seq_zones_wlock)
                return false;

        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
                return blk_rq_zone_is_seq(rq);
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
void __blk_req_zone_write_lock(struct request *rq)
{
        if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
                                          rq->q->seq_zones_wlock)))
                return;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
        rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
        if (rq->q->seq_zones_wlock)
                WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
                                                 rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
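
/*
 * Illustrative sketch (not part of the original file): a minimal picture of
 * how a dispatch/completion path might serialize writes per zone using the
 * helpers above. Only symbols defined or used in this file are referenced;
 * the surrounding function names are hypothetical.
 *
 *	// before dispatching rq to hardware:
 *	if (blk_req_needs_zone_write_lock(rq))
 *		__blk_req_zone_write_lock(rq);
 *
 *	// when rq completes:
 *	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
 *		__blk_req_zone_write_unlock(rq);
 */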
/*
 * Check that a zone report belongs to the partition.
 * If yes, fix its start sector and write pointer, copy it in the
 * zone information array and return true. Return false otherwise.
 */
static bool blkdev_report_zone(struct block_device *bdev,
                               struct blk_zone *rep,
                               struct blk_zone *zone)
{
        sector_t offset = get_start_sect(bdev);

        if (rep->start < offset)
                return false;

        rep->start -= offset;
        if (rep->start + rep->len > bdev->bd_part->nr_sects)
                return false;

        if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
                rep->wp = rep->start + rep->len;
        else
                rep->wp -= offset;
        memcpy(zone, rep, sizeof(struct blk_zone));
        return true;
}
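
/*
 * Illustrative example (not part of the original file): for a partition
 * starting at sector 524288 of a zoned disk, a reported zone with
 * start == 1048576 and wp == 1048600 is remapped to start == 524288 and
 * wp == 524312, i.e. relative to the partition, while a zone that begins
 * before the partition start is dropped from the report.
 */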
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @zones:	Array of zone structures where to return the zones information
 * @nr_zones:	Number of zone structures in the zone array
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Get zone information starting from the zone containing @sector.
 *    The number of zone information reported may be less than the number
 *    requested by @nr_zones. The number of zones actually reported is
 *    returned in @nr_zones.
 */
int blkdev_report_zones(struct block_device *bdev,
                        sector_t sector,
                        struct blk_zone *zones,
                        unsigned int *nr_zones,
                        gfp_t gfp_mask)
{
        struct request_queue *q = bdev_get_queue(bdev);
        struct blk_zone_report_hdr *hdr;
        unsigned int nrz = *nr_zones;
        struct page *page;
        unsigned int nr_rep;
        size_t rep_bytes;
        unsigned int nr_pages;
        struct bio *bio;
        struct bio_vec *bv;
        unsigned int i, n, nz;
        unsigned int ofst;
        void *addr;
        int ret;

        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -EOPNOTSUPP;

        if (!nrz)
                return 0;

        if (sector > bdev->bd_part->nr_sects) {
                *nr_zones = 0;
                return 0;
        }

        /*
         * The zone report has a header. So make room for it in the
         * payload. Also make sure that the report fits in a single BIO
         * that will not be split down the stack.
         */
        rep_bytes = sizeof(struct blk_zone_report_hdr) +
                sizeof(struct blk_zone) * nrz;
        rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK;
        if (rep_bytes > (queue_max_sectors(q) << 9))
                rep_bytes = queue_max_sectors(q) << 9;

        nr_pages = min_t(unsigned int, BIO_MAX_PAGES,
                         rep_bytes >> PAGE_SHIFT);
        nr_pages = min_t(unsigned int, nr_pages,
                         queue_max_segments(q));

        bio = bio_alloc(gfp_mask, nr_pages);
        if (!bio)
                return -ENOMEM;

        bio_set_dev(bio, bdev);
        bio->bi_iter.bi_sector = blk_zone_start(q, sector);
        bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0);

        for (i = 0; i < nr_pages; i++) {
                page = alloc_page(gfp_mask);
                if (!page) {
                        ret = -ENOMEM;
                        goto out;
                }
                if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
                        __free_page(page);
                        break;
                }
        }

        if (i == 0)
                ret = -ENOMEM;
        else
                ret = submit_bio_wait(bio);
        if (ret)
                goto out;
        /*
         * Process the report result: skip the header and go through the
         * reported zones to fix up the zone information for partitions.
         * At the same time, return the zone information into the zone array.
         */
        n = 0;
        nz = 0;
        nr_rep = 0;
        bio_for_each_segment_all(bv, bio, i) {

                if (!bv->bv_page)
                        break;

                addr = kmap_atomic(bv->bv_page);

                /* Get header in the first page */
                ofst = 0;
                if (!nr_rep) {
                        hdr = addr;
                        nr_rep = hdr->nr_zones;
                        ofst = sizeof(struct blk_zone_report_hdr);
                }

                /* Fixup and report zones */
                while (ofst < bv->bv_len &&
                       n < nr_rep && nz < nrz) {
                        if (blkdev_report_zone(bdev, addr + ofst, &zones[nz]))
                                nz++;
                        ofst += sizeof(struct blk_zone);
                        n++;
                }

                kunmap_atomic(addr);

                if (n >= nr_rep || nz >= nrz)
                        break;

        }

        *nr_zones = nz;
out:
        bio_for_each_segment_all(bv, bio, i)
                __free_page(bv->bv_page);
        bio_put(bio);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
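
/*
 * Illustrative sketch (not part of the original file): an in-kernel caller
 * could retrieve the first few zones of a zoned block device roughly as
 * follows; the variable names are hypothetical.
 *
 *	struct blk_zone zones[16];
 *	unsigned int nr_zones = ARRAY_SIZE(zones);
 *	int ret;
 *
 *	ret = blkdev_report_zones(bdev, 0, zones, &nr_zones, GFP_KERNEL);
 *	if (!ret && nr_zones)
 *		pr_info("got %u zones, first wp at %llu\n",
 *			nr_zones, (unsigned long long)zones[0].wp);
 *
 * On return, nr_zones holds the number of entries actually filled, which
 * may be smaller than requested.
 */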
/**
 * blkdev_reset_zones - Reset zones write pointer
 * @bdev:	Target block device
 * @sector:	Start sector of the first zone to reset
 * @nr_sectors:	Number of sectors, at least the length of one zone
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Reset the write pointer of the zones contained in the range
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 */
int blkdev_reset_zones(struct block_device *bdev,
                       sector_t sector, sector_t nr_sectors,
                       gfp_t gfp_mask)
{
        struct request_queue *q = bdev_get_queue(bdev);
        sector_t zone_sectors;
        sector_t end_sector = sector + nr_sectors;
        struct bio *bio;
        int ret;

        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -EOPNOTSUPP;

        if (end_sector > bdev->bd_part->nr_sects)
                /* Out of range */
                return -EINVAL;

        /* Check alignment (handle eventual smaller last zone) */
        zone_sectors = blk_queue_zone_sectors(q);
        if (sector & (zone_sectors - 1))
                return -EINVAL;

        if ((nr_sectors & (zone_sectors - 1)) &&
            end_sector != bdev->bd_part->nr_sects)
                return -EINVAL;

        while (sector < end_sector) {

                bio = bio_alloc(gfp_mask, 0);
                bio->bi_iter.bi_sector = sector;
                bio_set_dev(bio, bdev);
                bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);

                ret = submit_bio_wait(bio);
                bio_put(bio);

                if (ret)
                        return ret;

                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        return 0;
}
EXPORT_SYMBOL_GPL(blkdev_reset_zones);
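
/*
 * Illustrative sketch (not part of the original file): resetting a single
 * zone from kernel code, assuming its start sector is already known; the
 * variables are hypothetical.
 *
 *	sector_t zone_start = 524288;
 *	sector_t zone_len = blk_queue_zone_sectors(bdev_get_queue(bdev));
 *	int ret;
 *
 *	ret = blkdev_reset_zones(bdev, zone_start, zone_len, GFP_KERNEL);
 *	if (ret)
 *		pr_warn("zone reset failed: %d\n", ret);
 */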
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
                              unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct request_queue *q;
        struct blk_zone_report rep;
        struct blk_zone *zones;
        int ret;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -ENOTTY;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
                return -EFAULT;

        if (!rep.nr_zones)
                return -EINVAL;

        if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
                return -ERANGE;

        zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
                               GFP_KERNEL | __GFP_ZERO);
        if (!zones)
                return -ENOMEM;

        ret = blkdev_report_zones(bdev, rep.sector,
                                  zones, &rep.nr_zones,
                                  GFP_KERNEL);
        if (ret)
                goto out;

        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) {
                ret = -EFAULT;
                goto out;
        }

        if (rep.nr_zones) {
                if (copy_to_user(argp + sizeof(struct blk_zone_report), zones,
                                 sizeof(struct blk_zone) * rep.nr_zones))
                        ret = -EFAULT;
        }

out:
        kvfree(zones);

        return ret;
}
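
/*
 * Illustrative sketch (not part of the original file): from user space, the
 * BLKREPORTZONE ioctl takes a struct blk_zone_report immediately followed by
 * the zone array, both defined in <linux/blkzoned.h>. A hypothetical caller:
 *
 *	struct blk_zone_report *rep;
 *	unsigned int nr = 128;
 *
 *	rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = nr;
 *	if (ioctl(fd, BLKREPORTZONE, rep) == 0)
 *		printf("%u zones reported\n", rep->nr_zones);
 *	free(rep);
 */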
/*
 * BLKRESETZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
                             unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct request_queue *q;
        struct blk_zone_range zrange;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -ENOTTY;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(mode & FMODE_WRITE))
                return -EBADF;

        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
                return -EFAULT;

        return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
                                  GFP_KERNEL);
}
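
/*
 * Illustrative sketch (not part of the original file): from user space, the
 * BLKRESETZONE ioctl takes a struct blk_zone_range from <linux/blkzoned.h>
 * describing the zone-aligned range to reset. A hypothetical caller:
 *
 *	struct blk_zone_range range = {
 *		.sector = zone_start,
 *		.nr_sectors = zone_len,
 *	};
 *
 *	if (ioctl(fd, BLKRESETZONE, &range) < 0)
 *		perror("BLKRESETZONE");
 */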