nbd.c

/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/blk-mq.h>

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>
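
/*
 * Bit numbers in nbd_device.runtime_flags: set from the request timeout
 * handler and from the NBD_DISCONNECT ioctl, and tested when NBD_DO_IT
 * tears the device down to decide what to return to userspace.
 */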
#define NBD_TIMEDOUT			0
#define NBD_DISCONNECT_REQUESTED	1

struct nbd_device {
        u32 flags;
        unsigned long runtime_flags;
        struct socket * sock;	/* If == NULL, device is not ready, yet */
        int magic;

        struct blk_mq_tag_set tag_set;

        struct mutex tx_lock;
        struct gendisk *disk;
        loff_t blksize;
        loff_t bytesize;

        /* protects initialization and shutdown of the socket */
        spinlock_t sock_lock;
        struct task_struct *task_recv;
        struct task_struct *task_send;

#if IS_ENABLED(CONFIG_DEBUG_FS)
        struct dentry *dbg_dir;
#endif
};
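
/*
 * Per-request private data ("pdu") that blk-mq allocates next to every
 * struct request (see tag_set.cmd_size in nbd_init()); blk_mq_rq_to_pdu()
 * and blk_mq_rq_from_pdu() convert between the two.
 */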
struct nbd_cmd {
        struct nbd_device *nbd;
        struct list_head list;
};

#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

#define NBD_MAGIC 0x68797548

static unsigned int nbds_max = 16;
static struct nbd_device *nbd_dev;
static int max_part;

static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
        return disk_to_dev(nbd->disk);
}

static bool nbd_is_connected(struct nbd_device *nbd)
{
        return !!nbd->task_recv;
}

static const char *nbdcmd_to_ascii(int cmd)
{
        switch (cmd) {
        case NBD_CMD_READ: return "read";
        case NBD_CMD_WRITE: return "write";
        case NBD_CMD_DISC: return "disconnect";
        case NBD_CMD_FLUSH: return "flush";
        case NBD_CMD_TRIM: return "trim/discard";
        }
        return "invalid";
}

static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
{
        bdev->bd_inode->i_size = 0;
        set_capacity(nbd->disk, 0);
        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);

        return 0;
}

static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
{
        if (!nbd_is_connected(nbd))
                return;

        bdev->bd_inode->i_size = nbd->bytesize;
        set_capacity(nbd->disk, nbd->bytesize >> 9);
        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}

static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
                        loff_t blocksize, loff_t nr_blocks)
{
        int ret;

        ret = set_blocksize(bdev, blocksize);
        if (ret)
                return ret;

        nbd->blksize = blocksize;
        nbd->bytesize = blocksize * nr_blocks;

        nbd_size_update(nbd, bdev);

        return 0;
}

static void nbd_end_request(struct nbd_cmd *cmd)
{
        struct nbd_device *nbd = cmd->nbd;
        struct request *req = blk_mq_rq_from_pdu(cmd);
        int error = req->errors ? -EIO : 0;

        dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
                error ? "failed" : "done");

        blk_mq_complete_request(req, error);
}

/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
        struct socket *sock;

        spin_lock(&nbd->sock_lock);

        if (!nbd->sock) {
                spin_unlock(&nbd->sock_lock);
                return;
        }

        sock = nbd->sock;
        dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
        nbd->sock = NULL;
        spin_unlock(&nbd->sock_lock);

        kernel_sock_shutdown(sock, SHUT_RDWR);
        sockfd_put(sock);
}
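
/*
 * blk-mq timeout handler: mark the device as timed out and shut the socket
 * down so in-flight requests error out; the receive loop then fails out of
 * nbd_read_stat() and NBD_DO_IT returns -ETIMEDOUT.
 */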
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
                                                 bool reserved)
{
        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
        struct nbd_device *nbd = cmd->nbd;
        struct socket *sock = NULL;

        spin_lock(&nbd->sock_lock);
        set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
        if (nbd->sock) {
                sock = nbd->sock;
                get_file(sock->file);
        }
        spin_unlock(&nbd->sock_lock);

        if (sock) {
                kernel_sock_shutdown(sock, SHUT_RDWR);
                sockfd_put(sock);
        }

        req->errors++;
        dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
        return BLK_EH_HANDLED;
}

/*
 * Send or receive packet. sock_xmit() temporarily sets PF_MEMALLOC and uses
 * GFP_NOIO | __GFP_MEMALLOC socket allocations so that transmission can still
 * make forward progress when memory is tight.
 */
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
                     int msg_flags)
{
        struct socket *sock = nbd->sock;
        int result;
        struct msghdr msg;
        struct kvec iov;
        unsigned long pflags = current->flags;

        if (unlikely(!sock)) {
                dev_err(disk_to_dev(nbd->disk),
                        "Attempted %s on closed socket in sock_xmit\n",
                        (send ? "send" : "recv"));
                return -EINVAL;
        }

        current->flags |= PF_MEMALLOC;
        do {
                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
                iov.iov_base = buf;
                iov.iov_len = size;
                msg.msg_name = NULL;
                msg.msg_namelen = 0;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                msg.msg_flags = msg_flags | MSG_NOSIGNAL;

                if (send)
                        result = kernel_sendmsg(sock, &msg, &iov, 1, size);
                else
                        result = kernel_recvmsg(sock, &msg, &iov, 1, size,
                                                msg.msg_flags);

                if (result <= 0) {
                        if (result == 0)
                                result = -EPIPE; /* short read */
                        break;
                }
                size -= result;
                buf += result;
        } while (size > 0);

        tsk_restore_flags(current, pflags, PF_MEMALLOC);

        return result;
}

static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
                                 int flags)
{
        int result;
        void *kaddr = kmap(bvec->bv_page);

        result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
                           bvec->bv_len, flags);
        kunmap(bvec->bv_page);
        return result;
}
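
/*
 * The on-wire request (struct nbd_request in <linux/nbd.h>) is 28 bytes:
 * 32-bit magic, 32-bit type, an opaque 8-byte handle, a 64-bit byte offset
 * and a 32-bit length, with the multi-byte fields in network byte order
 * (the BUILD_BUG_ON in nbd_init() checks the size).
 */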
/* always call with the tx_lock held */
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
{
        struct request *req = blk_mq_rq_from_pdu(cmd);
        int result, flags;
        struct nbd_request request;
        unsigned long size = blk_rq_bytes(req);
        struct bio *bio;
        u32 type;

        if (req->cmd_type == REQ_TYPE_DRV_PRIV)
                type = NBD_CMD_DISC;
        else if (req_op(req) == REQ_OP_DISCARD)
                type = NBD_CMD_TRIM;
        else if (req_op(req) == REQ_OP_FLUSH)
                type = NBD_CMD_FLUSH;
        else if (rq_data_dir(req) == WRITE)
                type = NBD_CMD_WRITE;
        else
                type = NBD_CMD_READ;

        memset(&request, 0, sizeof(request));
        request.magic = htonl(NBD_REQUEST_MAGIC);
        request.type = htonl(type);

        if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
                request.len = htonl(size);
        }
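
        /*
         * Stash the blk-mq tag in the handle; the server echoes the handle
         * back in its reply and nbd_read_stat() uses it to look up the
         * original request.
         */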
        memcpy(request.handle, &req->tag, sizeof(req->tag));

        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
                cmd, nbdcmd_to_ascii(type),
                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
        result = sock_xmit(nbd, 1, &request, sizeof(request),
                           (type == NBD_CMD_WRITE) ? MSG_MORE : 0);
        if (result <= 0) {
                dev_err(disk_to_dev(nbd->disk),
                        "Send control failed (result %d)\n", result);
                return -EIO;
        }

        if (type != NBD_CMD_WRITE)
                return 0;

        flags = 0;
        bio = req->bio;
        while (bio) {
                struct bio *next = bio->bi_next;
                struct bvec_iter iter;
                struct bio_vec bvec;

                bio_for_each_segment(bvec, bio, iter) {
                        bool is_last = !next && bio_iter_last(bvec, iter);

                        if (is_last)
                                flags = MSG_MORE;
                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
                                cmd, bvec.bv_len);
                        result = sock_send_bvec(nbd, &bvec, flags);
                        if (result <= 0) {
                                dev_err(disk_to_dev(nbd->disk),
                                        "Send data failed (result %d)\n",
                                        result);
                                return -EIO;
                        }
                        /*
                         * The completion might already have come in,
                         * so break for the last one instead of letting
                         * the iterator do it. This prevents use-after-free
                         * of the bio.
                         */
                        if (is_last)
                                break;
                }
                bio = next;
        }
        return 0;
}

static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
{
        int result;
        void *kaddr = kmap(bvec->bv_page);

        result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
                           MSG_WAITALL);
        kunmap(bvec->bv_page);
        return result;
}

/* NULL returned = something went wrong, inform userspace */
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
{
        int result;
        struct nbd_reply reply;
        struct nbd_cmd *cmd;
        struct request *req = NULL;
        u16 hwq;
        int tag;

        reply.magic = 0;
        result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
        if (result <= 0) {
                dev_err(disk_to_dev(nbd->disk),
                        "Receive control failed (result %d)\n", result);
                return ERR_PTR(result);
        }

        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
                        (unsigned long)ntohl(reply.magic));
                return ERR_PTR(-EPROTO);
        }

        memcpy(&tag, reply.handle, sizeof(int));

        hwq = blk_mq_unique_tag_to_hwq(tag);
        if (hwq < nbd->tag_set.nr_hw_queues)
                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
                                       blk_mq_unique_tag_to_tag(tag));
        if (!req || !blk_mq_request_started(req)) {
                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
                        tag, req);
                return ERR_PTR(-ENOENT);
        }
        cmd = blk_mq_rq_to_pdu(req);

        if (ntohl(reply.error)) {
                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
                        ntohl(reply.error));
                req->errors++;
                return cmd;
        }

        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
        if (rq_data_dir(req) != WRITE) {
                struct req_iterator iter;
                struct bio_vec bvec;

                rq_for_each_segment(bvec, req, iter) {
                        result = sock_recv_bvec(nbd, &bvec);
                        if (result <= 0) {
                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
                                req->errors++;
                                return cmd;
                        }
                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
                                cmd, bvec.bv_len);
                }
        }
        return cmd;
}

static ssize_t pid_show(struct device *dev,
                        struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);
        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

static struct device_attribute pid_attr = {
        .attr = { .name = "pid", .mode = S_IRUGO},
        .show = pid_show,
};

static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
{
        struct nbd_cmd *cmd;
        int ret;

        BUG_ON(nbd->magic != NBD_MAGIC);

        sk_set_memalloc(nbd->sock->sk);

        ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (ret) {
                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
                return ret;
        }

        nbd_size_update(nbd, bdev);

        while (1) {
                cmd = nbd_read_stat(nbd);
                if (IS_ERR(cmd)) {
                        ret = PTR_ERR(cmd);
                        break;
                }

                nbd_end_request(cmd);
        }

        nbd_size_clear(nbd, bdev);

        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
        return ret;
}

static void nbd_clear_req(struct request *req, void *data, bool reserved)
{
        struct nbd_cmd *cmd;

        if (!blk_mq_request_started(req))
                return;
        cmd = blk_mq_rq_to_pdu(req);
        req->errors++;
        nbd_end_request(cmd);
}

static void nbd_clear_que(struct nbd_device *nbd)
{
        BUG_ON(nbd->magic != NBD_MAGIC);

        /*
         * Because we have set nbd->sock to NULL under the tx_lock, all
         * modifications to the list must have completed by now.
         */
        BUG_ON(nbd->sock);

        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}

static void nbd_handle_cmd(struct nbd_cmd *cmd)
{
        struct request *req = blk_mq_rq_from_pdu(cmd);
        struct nbd_device *nbd = cmd->nbd;

        if (req->cmd_type != REQ_TYPE_FS)
                goto error_out;

        if (rq_data_dir(req) == WRITE &&
            (nbd->flags & NBD_FLAG_READ_ONLY)) {
                dev_err(disk_to_dev(nbd->disk),
                        "Write on read-only\n");
                goto error_out;
        }

        req->errors = 0;

        mutex_lock(&nbd->tx_lock);
        nbd->task_send = current;
        if (unlikely(!nbd->sock)) {
                mutex_unlock(&nbd->tx_lock);
                dev_err(disk_to_dev(nbd->disk),
                        "Attempted send on closed socket\n");
                goto error_out;
        }

        if (nbd_send_cmd(nbd, cmd) != 0) {
                dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
                req->errors++;
                nbd_end_request(cmd);
        }

        nbd->task_send = NULL;
        mutex_unlock(&nbd->tx_lock);

        return;

error_out:
        req->errors++;
        nbd_end_request(cmd);
}
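
/*
 * queue_rq may sleep (it takes tx_lock and does socket I/O) because the tag
 * set is created with BLK_MQ_F_BLOCKING in nbd_init().
 */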
static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
                        const struct blk_mq_queue_data *bd)
{
        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);

        blk_mq_start_request(bd->rq);
        nbd_handle_cmd(cmd);
        return BLK_MQ_RQ_QUEUE_OK;
}

static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
        int ret = 0;

        spin_lock_irq(&nbd->sock_lock);

        if (nbd->sock) {
                ret = -EBUSY;
                goto out;
        }

        nbd->sock = sock;

out:
        spin_unlock_irq(&nbd->sock_lock);

        return ret;
}

/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
        nbd->runtime_flags = 0;
        nbd->blksize = 1024;
        nbd->bytesize = 0;
        set_capacity(nbd->disk, 0);
        nbd->flags = 0;
        nbd->tag_set.timeout = 0;
        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
}

static void nbd_bdev_reset(struct block_device *bdev)
{
        set_device_ro(bdev, false);
        bdev->bd_inode->i_size = 0;
        if (max_part > 0) {
                blkdev_reread_part(bdev);
                bdev->bd_invalidated = 1;
        }
}

static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
{
        if (nbd->flags & NBD_FLAG_READ_ONLY)
                set_device_ro(bdev, true);
        if (nbd->flags & NBD_FLAG_SEND_TRIM)
                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
        if (nbd->flags & NBD_FLAG_SEND_FLUSH)
                blk_queue_write_cache(nbd->disk->queue, true, false);
        else
                blk_queue_write_cache(nbd->disk->queue, false, false);
}

static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
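
/*
 * ioctl interface, typically driven by the userspace nbd-client tool: the
 * usual sequence is NBD_SET_SOCK, then NBD_SET_BLKSIZE/NBD_SET_SIZE(_BLOCKS)
 * and NBD_SET_FLAGS, and finally NBD_DO_IT, which runs the receive loop and
 * only returns once the connection is torn down or times out.
 */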
/* Must be called with tx_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                       unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case NBD_DISCONNECT: {
                struct request *sreq;

                dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
                if (!nbd->sock)
                        return -EINVAL;

                sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
                if (IS_ERR(sreq))
                        return -ENOMEM;

                mutex_unlock(&nbd->tx_lock);
                fsync_bdev(bdev);
                mutex_lock(&nbd->tx_lock);
                sreq->cmd_type = REQ_TYPE_DRV_PRIV;

                /* Check again after getting mutex back. */
                if (!nbd->sock) {
                        blk_mq_free_request(sreq);
                        return -EINVAL;
                }

                set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);

                nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
                blk_mq_free_request(sreq);
                return 0;
        }

        case NBD_CLEAR_SOCK:
                sock_shutdown(nbd);
                nbd_clear_que(nbd);
                kill_bdev(bdev);
                return 0;

        case NBD_SET_SOCK: {
                int err;
                struct socket *sock = sockfd_lookup(arg, &err);

                if (!sock)
                        return err;

                err = nbd_set_socket(nbd, sock);
                if (!err && max_part)
                        bdev->bd_invalidated = 1;

                return err;
        }

        case NBD_SET_BLKSIZE: {
                loff_t bsize = div_s64(nbd->bytesize, arg);

                return nbd_size_set(nbd, bdev, arg, bsize);
        }

        case NBD_SET_SIZE:
                return nbd_size_set(nbd, bdev, nbd->blksize,
                                    div_s64(arg, nbd->blksize));

        case NBD_SET_SIZE_BLOCKS:
                return nbd_size_set(nbd, bdev, nbd->blksize, arg);

        case NBD_SET_TIMEOUT:
                if (arg) {
                        nbd->tag_set.timeout = arg * HZ;
                        blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
                }
                return 0;

        case NBD_SET_FLAGS:
                nbd->flags = arg;
                return 0;

        case NBD_DO_IT: {
                int error;

                if (nbd->task_recv)
                        return -EBUSY;
                if (!nbd->sock)
                        return -EINVAL;

                /* We have to claim the device under the lock */
                nbd->task_recv = current;
                mutex_unlock(&nbd->tx_lock);

                nbd_parse_flags(nbd, bdev);

                nbd_dev_dbg_init(nbd);
                error = nbd_thread_recv(nbd, bdev);
                nbd_dev_dbg_close(nbd);

                mutex_lock(&nbd->tx_lock);
                nbd->task_recv = NULL;

                sock_shutdown(nbd);
                nbd_clear_que(nbd);
                kill_bdev(bdev);
                nbd_bdev_reset(bdev);

                /* user requested, ignore socket errors */
                if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
                        error = 0;
                if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
                        error = -ETIMEDOUT;

                nbd_reset(nbd);

                return error;
        }

        case NBD_CLEAR_QUE:
                /*
                 * This is for compatibility only. The queue is always cleared
                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
                 */
                return 0;

        case NBD_PRINT_DEBUG:
                /*
                 * For compatibility only, we no longer keep a list of
                 * outstanding requests.
                 */
                return 0;
        }
        return -ENOTTY;
}

static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
                     unsigned int cmd, unsigned long arg)
{
        struct nbd_device *nbd = bdev->bd_disk->private_data;
        int error;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        BUG_ON(nbd->magic != NBD_MAGIC);

        mutex_lock(&nbd->tx_lock);
        error = __nbd_ioctl(bdev, nbd, cmd, arg);
        mutex_unlock(&nbd->tx_lock);

        return error;
}

static const struct block_device_operations nbd_fops =
{
        .owner = THIS_MODULE,
        .ioctl = nbd_ioctl,
        .compat_ioctl = nbd_ioctl,
};

#if IS_ENABLED(CONFIG_DEBUG_FS)

static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
        struct nbd_device *nbd = s->private;

        if (nbd->task_recv)
                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
        if (nbd->task_send)
                seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

        return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
        return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
        .open = nbd_dbg_tasks_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
        struct nbd_device *nbd = s->private;
        u32 flags = nbd->flags;

        seq_printf(s, "Hex: 0x%08x\n\n", flags);

        seq_puts(s, "Known flags:\n");

        if (flags & NBD_FLAG_HAS_FLAGS)
                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
        if (flags & NBD_FLAG_READ_ONLY)
                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
        if (flags & NBD_FLAG_SEND_FLUSH)
                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
        if (flags & NBD_FLAG_SEND_TRIM)
                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

        return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
        return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
        .open = nbd_dbg_flags_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
        struct dentry *dir;

        if (!nbd_dbg_dir)
                return -EIO;

        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
        if (!dir) {
                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
                        nbd_name(nbd));
                return -EIO;
        }
        nbd->dbg_dir = dir;

        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
        debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
        debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);

        return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
        debugfs_remove_recursive(nbd->dbg_dir);
}

static int nbd_dbg_init(void)
{
        struct dentry *dbg_dir;

        dbg_dir = debugfs_create_dir("nbd", NULL);
        if (!dbg_dir)
                return -EIO;

        nbd_dbg_dir = dbg_dir;

        return 0;
}

static void nbd_dbg_close(void)
{
        debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
        return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
        return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

static int nbd_init_request(void *data, struct request *rq,
                            unsigned int hctx_idx, unsigned int request_idx,
                            unsigned int numa_node)
{
        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->nbd = data;
        INIT_LIST_HEAD(&cmd->list);
        return 0;
}

static struct blk_mq_ops nbd_mq_ops = {
        .queue_rq = nbd_queue_rq,
        .init_request = nbd_init_request,
        .timeout = nbd_xmit_timeout,
};

/*
 * And here should be modules and kernel interface
 * (Just smiley confuses emacs :-)
 */

static int __init nbd_init(void)
{
        int err = -ENOMEM;
        int i;
        int part_shift;

        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

        if (max_part < 0) {
                printk(KERN_ERR "nbd: max_part must be >= 0\n");
                return -EINVAL;
        }

        part_shift = 0;
        if (max_part > 0) {
                part_shift = fls(max_part);

                /*
                 * Adjust max_part according to part_shift as it is exported
                 * to user space so that user can know the max number of
                 * partition kernel should be able to manage.
                 *
                 * Note that -1 is required because partition 0 is reserved
                 * for the whole disk.
                 */
                max_part = (1UL << part_shift) - 1;
        }

        if ((1UL << part_shift) > DISK_MAX_PARTS)
                return -EINVAL;

        if (nbds_max > 1UL << (MINORBITS - part_shift))
                return -EINVAL;

        nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
        if (!nbd_dev)
                return -ENOMEM;

        for (i = 0; i < nbds_max; i++) {
                struct request_queue *q;
                struct gendisk *disk = alloc_disk(1 << part_shift);
                if (!disk)
                        goto out;
                nbd_dev[i].disk = disk;

                nbd_dev[i].tag_set.ops = &nbd_mq_ops;
                nbd_dev[i].tag_set.nr_hw_queues = 1;
                nbd_dev[i].tag_set.queue_depth = 128;
                nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
                nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
                nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
                        BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
                nbd_dev[i].tag_set.driver_data = &nbd_dev[i];

                err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
                if (err) {
                        put_disk(disk);
                        goto out;
                }

                /*
                 * The new linux 2.5 block layer implementation requires
                 * every gendisk to have its very own request_queue struct.
                 * These structs are big so we dynamically allocate them.
                 */
                q = blk_mq_init_queue(&nbd_dev[i].tag_set);
                if (IS_ERR(q)) {
                        blk_mq_free_tag_set(&nbd_dev[i].tag_set);
                        put_disk(disk);
                        goto out;
                }
                disk->queue = q;

                /*
                 * Tell the block layer that we are not a rotational device
                 */
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
                queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
                disk->queue->limits.discard_granularity = 512;
                blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
                disk->queue->limits.discard_zeroes_data = 0;
                blk_queue_max_hw_sectors(disk->queue, 65536);
                disk->queue->limits.max_sectors = 256;
        }

        if (register_blkdev(NBD_MAJOR, "nbd")) {
                err = -EIO;
                goto out;
        }

        printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

        nbd_dbg_init();

        for (i = 0; i < nbds_max; i++) {
                struct gendisk *disk = nbd_dev[i].disk;
                nbd_dev[i].magic = NBD_MAGIC;
                spin_lock_init(&nbd_dev[i].sock_lock);
                mutex_init(&nbd_dev[i].tx_lock);
                disk->major = NBD_MAJOR;
                disk->first_minor = i << part_shift;
                disk->fops = &nbd_fops;
                disk->private_data = &nbd_dev[i];
                sprintf(disk->disk_name, "nbd%d", i);
                nbd_reset(&nbd_dev[i]);
                add_disk(disk);
        }

        return 0;
out:
        while (i--) {
                blk_mq_free_tag_set(&nbd_dev[i].tag_set);
                blk_cleanup_queue(nbd_dev[i].disk->queue);
                put_disk(nbd_dev[i].disk);
        }
        kfree(nbd_dev);
        return err;
}

static void __exit nbd_cleanup(void)
{
        int i;

        nbd_dbg_close();

        for (i = 0; i < nbds_max; i++) {
                struct gendisk *disk = nbd_dev[i].disk;
                nbd_dev[i].magic = 0;
                if (disk) {
                        del_gendisk(disk);
                        blk_cleanup_queue(disk->queue);
                        blk_mq_free_tag_set(&nbd_dev[i].tag_set);
                        put_disk(disk);
                }
        }
        unregister_blkdev(NBD_MAJOR, "nbd");
        kfree(nbd_dev);
        printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");