/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
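
/*
 * The extents of a block layout are kept in two per-layout rb-trees
 * (bl_ext_rw and bl_ext_ro), sorted by file offset.  All offsets and
 * lengths are in 512-byte sectors.  The helpers below wrap the generic
 * rbtree API in terms of struct pnfs_block_extent.
 */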
static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
	return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
	struct rb_node *node = rb_first(root);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_prev(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_next(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
	return be->be_f_offset + be->be_length;
}
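
/*
 * Find the first extent relevant to @start: the extent containing it if
 * one exists, otherwise the first extent beyond it, or NULL if @start is
 * past the end of the tree.
 */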
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
	struct rb_node *node = root->rb_node;
	struct pnfs_block_extent *be = NULL;

	while (node) {
		be = ext_node(node);
		if (start < be->be_f_offset)
			node = node->rb_left;
		else if (start >= ext_f_end(be))
			node = node->rb_right;
		else
			return be;
	}

	if (be) {
		if (start < be->be_f_offset)
			return be;

		if (start >= ext_f_end(be))
			return ext_tree_next(be);
	}

	return NULL;
}
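
/*
 * Two extents may be merged when be1 immediately precedes be2 in the
 * file, both are on the same device and in the same state, the mapped
 * device offsets are contiguous as well (unless they describe holes),
 * and, for INVALID_DATA extents, both carry the same tag.
 */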
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
	if (be1->be_state != be2->be_state)
		return false;
	if (be1->be_device != be2->be_device)
		return false;

	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
		return false;

	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
		return false;

	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
	    be1->be_tag != be2->be_tag)
		return false;

	return true;
}
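
/*
 * Try to fold @be into its left (right) neighbour.  The absorbed extent
 * is unlinked and freed, dropping its device reference; the surviving
 * extent is returned.
 */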
static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *left = ext_tree_prev(be);

	if (left && ext_can_merge(left, be)) {
		left->be_length += be->be_length;
		rb_erase(&be->be_node, root);
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
		return left;
	}

	return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *right = ext_tree_next(be);

	if (right && ext_can_merge(be, right)) {
		be->be_length += right->be_length;
		rb_erase(&right->be_node, root);
		nfs4_put_deviceid_node(right->be_device);
		kfree(right);
	}

	return be;
}
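
/*
 * Free extents collected on a dispose list, dropping their device
 * references.  Called after bl_ext_lock has been dropped.
 */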
static void __ext_put_deviceids(struct list_head *head)
{
	struct pnfs_block_extent *be, *tmp;

	list_for_each_entry_safe(be, tmp, head, be_list) {
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
	}
}
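
/*
 * Link @new into @root at the position given by its file offset, or, if
 * @merge_ok is set, merge it into an adjacent extent and free it.  @new
 * must not overlap anything already in the tree (enforced by the BUG()
 * below); callers hold bl_ext_lock.
 */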
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}
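
/*
 * Remove [start, end) from @root.  Extents straddling a boundary are
 * trimmed (an extent covering the whole range is split in two); fully
 * contained extents are unlinked and collected on @tmp so the caller
 * can drop their device references after unlocking.
 */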
static int
__ext_tree_remove(struct rb_root *root,
		sector_t start, sector_t end, struct list_head *tmp)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;
	if (be->be_f_offset >= end)
		return 0;

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			list_add_tail(&be->be_list, tmp);
			be = next;
		}

		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}
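
/*
 * Insert a new extent into the layout.  READWRITE and INVALID extents go
 * into bl_ext_rw, READ and NONE extents into bl_ext_ro.  Where @new
 * overlaps existing extents the existing ones win: the overlapping parts
 * of @new are discarded and only the remainder is inserted, splitting
 * @new as needed.
 */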
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be;
	struct rb_root *root;
	int err = 0;

	switch (new->be_state) {
	case PNFS_BLOCK_READWRITE_DATA:
	case PNFS_BLOCK_INVALID_DATA:
		root = &bl->bl_ext_rw;
		break;
	case PNFS_BLOCK_READ_DATA:
	case PNFS_BLOCK_NONE_DATA:
		root = &bl->bl_ext_ro;
		break;
	default:
		dprintk("invalid extent type\n");
		return -EINVAL;
	}

	spin_lock(&bl->bl_ext_lock);
retry:
	be = __ext_tree_search(root, new->be_f_offset);
	if (!be || be->be_f_offset >= ext_f_end(new)) {
		__ext_tree_insert(root, new, true);
	} else if (new->be_f_offset >= be->be_f_offset) {
		if (ext_f_end(new) <= ext_f_end(be)) {
			nfs4_put_deviceid_node(new->be_device);
			kfree(new);
		} else {
			sector_t new_len = ext_f_end(new) - ext_f_end(be);
			sector_t diff = new->be_length - new_len;

			new->be_f_offset += diff;
			new->be_v_offset += diff;
			new->be_length = new_len;
			goto retry;
		}
	} else if (ext_f_end(new) <= ext_f_end(be)) {
		new->be_length = be->be_f_offset - new->be_f_offset;
		__ext_tree_insert(root, new, true);
	} else {
		struct pnfs_block_extent *split;
		sector_t new_len = ext_f_end(new) - ext_f_end(be);
		sector_t diff = new->be_length - new_len;

		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
		if (!split) {
			err = -ENOMEM;
			goto out;
		}
		split->be_length = be->be_f_offset - split->be_f_offset;
		split->be_device = nfs4_get_deviceid(new->be_device);
		__ext_tree_insert(root, split, true);

		new->be_f_offset += diff;
		new->be_v_offset += diff;
		new->be_length = new_len;
		goto retry;
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}
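
/*
 * Look up the extent covering @isect.  Reads (!rw) prefer a read-only
 * mapping and fall back to the read-write tree; writes only consult the
 * read-write tree.  The extent is copied to @ret under bl_ext_lock.
 */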
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
		struct pnfs_block_extent *ret)
{
	struct rb_node *node;
	struct pnfs_block_extent *be;

	node = root->rb_node;
	while (node) {
		be = ext_node(node);
		if (isect < be->be_f_offset)
			node = node->rb_left;
		else if (isect >= ext_f_end(be))
			node = node->rb_right;
		else {
			*ret = *be;
			return true;
		}
	}

	return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
		struct pnfs_block_extent *ret, bool rw)
{
	bool found = false;

	spin_lock(&bl->bl_ext_lock);
	if (!rw)
		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
	if (!found)
		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
	spin_unlock(&bl->bl_ext_lock);

	return found;
}
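
/*
 * Remove [start, end) from the layout: always from the read-only tree,
 * and also from the read-write tree when @rw is set.  Device references
 * are dropped once the lock has been released.
 */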
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
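
/*
 * Split @be in two at sector @split, which must fall inside the extent.
 * @be keeps the front part; a new extent with its own device reference
 * is inserted for the tail.  Merging is disabled on insert so that the
 * two halves can be retagged independently afterwards.
 */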
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}
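
/*
 * Mark [start, start + len) as written: drop any COW extents or holes
 * in the range from the read-only tree, then trim or split untagged
 * INVALID_DATA extents to the range and tag them EXTENT_WRITTEN so a
 * later layoutcommit picks them up.  @lwb is the last written byte to
 * report in that commit.
 */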
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
		sector_t len, u64 lwb)
{
	struct rb_root *root = &bl->bl_ext_rw;
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents or holes from the written-to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (err)
		goto out;

	/*
	 * Then mark all invalid extents in the range as written to.
	 */
	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
		if (be->be_f_offset >= end)
			break;

		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
			continue;

		if (be->be_f_offset < start) {
			struct pnfs_block_extent *left = ext_tree_prev(be);

			if (left && ext_can_merge(left, be)) {
				sector_t diff = start - be->be_f_offset;

				left->be_length += diff;

				be->be_f_offset += diff;
				be->be_v_offset += diff;
				be->be_length -= diff;
			} else {
				err = ext_tree_split(root, be, start);
				if (err)
					goto out;
			}
		}

		if (ext_f_end(be) > end) {
			struct pnfs_block_extent *right = ext_tree_next(be);

			if (right && ext_can_merge(be, right)) {
				sector_t diff = end - be->be_f_offset;

				be->be_length -= diff;

				right->be_f_offset -= diff;
				right->be_v_offset -= diff;
				right->be_length += diff;
			} else {
				err = ext_tree_split(root, be, end);
				if (err)
					goto out;
			}
		}

		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
			be->be_tag = EXTENT_WRITTEN;
			be = ext_try_to_merge_left(root, be);
			be = ext_try_to_merge_right(root, be);
		}
	}
out:
	if (bl->bl_lwb < lwb)
		bl->bl_lwb = lwb;
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
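
/*
 * Size of the layoutupdate blob: a 4-byte extent count followed by one
 * fixed-size range (SCSI layouts) or extent (block layouts) per entry.
 */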
static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
	if (bl->bl_scsi_layout)
		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
	else
		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
		size_t buffer_size)
{
	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

		for (i = 0; i < nr_pages; i++)
			put_page(arg->layoutupdate_pages[i]);
		vfree(arg->start_p);
		kfree(arg->layoutupdate_pages);
	} else {
		put_page(arg->layoutupdate_page);
	}
}

static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
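
/*
 * Encode all written but not yet committed extents and tag them
 * EXTENT_COMMITTING.  If the buffer is too small, keep walking to
 * compute the final count and return -ENOSPC so the caller can retry
 * with a correctly sized buffer.
 */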
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count, __u64 *lastbyte)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	*lastbyte = bl->bl_lwb - 1;
	bl->bl_lwb = 0;
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}
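
/*
 * Build the layoutupdate payload for a layoutcommit.  Encoding is first
 * attempted into a single page; on -ENOSPC the buffer is reallocated
 * with vmalloc at the exact size and the encoding pass repeated, with
 * the backing pages pinned for the transport.
 */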
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}
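
/*
 * Called when the layoutcommit RPC completes.  On success, committing
 * extents become plain READWRITE_DATA; on failure they are tagged
 * EXTENT_WRITTEN again so that the next layoutcommit retries them.
 */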
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}