flexfilelayoutdev.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. /*
  2. * Device operations for the pnfs nfs4 file layout driver.
  3. *
  4. * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
  5. *
  6. * Tao Peng <bergwolf@primarydata.com>
  7. */
  8. #include <linux/nfs_fs.h>
  9. #include <linux/vmalloc.h>
  10. #include <linux/module.h>
  11. #include <linux/sunrpc/addr.h>
  12. #include "../internal.h"
  13. #include "../nfs4session.h"
  14. #include "flexfilelayout.h"
  15. #define NFSDBG_FACILITY NFSDBG_PNFS_LD
  16. static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
  17. static unsigned int dataserver_retrans;
  18. void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  19. {
  20. if (mirror_ds)
  21. nfs4_put_deviceid_node(&mirror_ds->id_node);
  22. }
  23. void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  24. {
  25. nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
  26. nfs4_pnfs_ds_put(mirror_ds->ds);
  27. kfree(mirror_ds->ds_versions);
  28. kfree_rcu(mirror_ds, id_node.rcu);
  29. }
  30. /* Decode opaque device data and construct new_ds using it */
  31. struct nfs4_ff_layout_ds *
  32. nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
  33. gfp_t gfp_flags)
  34. {
  35. struct xdr_stream stream;
  36. struct xdr_buf buf;
  37. struct page *scratch;
  38. struct list_head dsaddrs;
  39. struct nfs4_pnfs_ds_addr *da;
  40. struct nfs4_ff_layout_ds *new_ds = NULL;
  41. struct nfs4_ff_ds_version *ds_versions = NULL;
  42. u32 mp_count;
  43. u32 version_count;
  44. __be32 *p;
  45. int i, ret = -ENOMEM;
  46. /* set up xdr stream */
  47. scratch = alloc_page(gfp_flags);
  48. if (!scratch)
  49. goto out_err;
  50. new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
  51. if (!new_ds)
  52. goto out_scratch;
  53. nfs4_init_deviceid_node(&new_ds->id_node,
  54. server,
  55. &pdev->dev_id);
  56. INIT_LIST_HEAD(&dsaddrs);
  57. xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
  58. xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
  59. /* multipath count */
  60. p = xdr_inline_decode(&stream, 4);
  61. if (unlikely(!p))
  62. goto out_err_drain_dsaddrs;
  63. mp_count = be32_to_cpup(p);
  64. dprintk("%s: multipath ds count %d\n", __func__, mp_count);
  65. for (i = 0; i < mp_count; i++) {
  66. /* multipath ds */
  67. da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
  68. &stream, gfp_flags);
  69. if (da)
  70. list_add_tail(&da->da_node, &dsaddrs);
  71. }
  72. if (list_empty(&dsaddrs)) {
  73. dprintk("%s: no suitable DS addresses found\n",
  74. __func__);
  75. ret = -ENOMEDIUM;
  76. goto out_err_drain_dsaddrs;
  77. }
  78. /* version count */
  79. p = xdr_inline_decode(&stream, 4);
  80. if (unlikely(!p))
  81. goto out_err_drain_dsaddrs;
  82. version_count = be32_to_cpup(p);
  83. dprintk("%s: version count %d\n", __func__, version_count);
  84. ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
  85. gfp_flags);
  86. if (!ds_versions)
  87. goto out_scratch;
  88. for (i = 0; i < version_count; i++) {
  89. /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
  90. * tightly_coupled(4) */
  91. p = xdr_inline_decode(&stream, 20);
  92. if (unlikely(!p))
  93. goto out_err_drain_dsaddrs;
  94. ds_versions[i].version = be32_to_cpup(p++);
  95. ds_versions[i].minor_version = be32_to_cpup(p++);
  96. ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
  97. ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
  98. ds_versions[i].tightly_coupled = be32_to_cpup(p);
  99. if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
  100. ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
  101. if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
  102. ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
  103. if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
  104. dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
  105. i, ds_versions[i].version,
  106. ds_versions[i].minor_version);
  107. ret = -EPROTONOSUPPORT;
  108. goto out_err_drain_dsaddrs;
  109. }
  110. dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
  111. __func__, i, ds_versions[i].version,
  112. ds_versions[i].minor_version,
  113. ds_versions[i].rsize,
  114. ds_versions[i].wsize,
  115. ds_versions[i].tightly_coupled);
  116. }
  117. new_ds->ds_versions = ds_versions;
  118. new_ds->ds_versions_cnt = version_count;
  119. new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
  120. if (!new_ds->ds)
  121. goto out_err_drain_dsaddrs;
  122. /* If DS was already in cache, free ds addrs */
  123. while (!list_empty(&dsaddrs)) {
  124. da = list_first_entry(&dsaddrs,
  125. struct nfs4_pnfs_ds_addr,
  126. da_node);
  127. list_del_init(&da->da_node);
  128. kfree(da->da_remotestr);
  129. kfree(da);
  130. }
  131. __free_page(scratch);
  132. return new_ds;
  133. out_err_drain_dsaddrs:
  134. while (!list_empty(&dsaddrs)) {
  135. da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
  136. da_node);
  137. list_del_init(&da->da_node);
  138. kfree(da->da_remotestr);
  139. kfree(da);
  140. }
  141. kfree(ds_versions);
  142. out_scratch:
  143. __free_page(scratch);
  144. out_err:
  145. kfree(new_ds);
  146. dprintk("%s ERROR: returning %d\n", __func__, ret);
  147. return NULL;
  148. }
  149. static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
  150. struct nfs4_deviceid_node *devid)
  151. {
  152. nfs4_mark_deviceid_unavailable(devid);
  153. if (!ff_layout_has_available_ds(lseg))
  154. pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
  155. lseg);
  156. }
  157. static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
  158. struct nfs4_ff_layout_mirror *mirror)
  159. {
  160. if (mirror == NULL || mirror->mirror_ds == NULL) {
  161. pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
  162. lseg);
  163. return false;
  164. }
  165. if (mirror->mirror_ds->ds == NULL) {
  166. struct nfs4_deviceid_node *devid;
  167. devid = &mirror->mirror_ds->id_node;
  168. ff_layout_mark_devid_invalid(lseg, devid);
  169. return false;
  170. }
  171. return true;
  172. }
  173. static u64
  174. end_offset(u64 start, u64 len)
  175. {
  176. u64 end;
  177. end = start + len;
  178. return end >= start ? end : NFS4_MAX_UINT64;
  179. }
  180. static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
  181. u64 offset, u64 length)
  182. {
  183. u64 end;
  184. end = max_t(u64, end_offset(err->offset, err->length),
  185. end_offset(offset, length));
  186. err->offset = min_t(u64, err->offset, offset);
  187. err->length = end - err->offset;
  188. }
  189. static int
  190. ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
  191. const struct nfs4_ff_layout_ds_err *e2)
  192. {
  193. int ret;
  194. if (e1->opnum != e2->opnum)
  195. return e1->opnum < e2->opnum ? -1 : 1;
  196. if (e1->status != e2->status)
  197. return e1->status < e2->status ? -1 : 1;
  198. ret = memcmp(e1->stateid.data, e2->stateid.data,
  199. sizeof(e1->stateid.data));
  200. if (ret != 0)
  201. return ret;
  202. ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
  203. if (ret != 0)
  204. return ret;
  205. if (end_offset(e1->offset, e1->length) < e2->offset)
  206. return -1;
  207. if (e1->offset > end_offset(e2->offset, e2->length))
  208. return 1;
  209. /* If ranges overlap or are contiguous, they are the same */
  210. return 0;
  211. }
  212. static void
  213. ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
  214. struct nfs4_ff_layout_ds_err *dserr)
  215. {
  216. struct nfs4_ff_layout_ds_err *err, *tmp;
  217. struct list_head *head = &flo->error_list;
  218. int match;
  219. /* Do insertion sort w/ merges */
  220. list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
  221. match = ff_ds_error_match(err, dserr);
  222. if (match < 0)
  223. continue;
  224. if (match > 0) {
  225. /* Add entry "dserr" _before_ entry "err" */
  226. head = &err->list;
  227. break;
  228. }
  229. /* Entries match, so merge "err" into "dserr" */
  230. extend_ds_error(dserr, err->offset, err->length);
  231. list_del(&err->list);
  232. kfree(err);
  233. }
  234. list_add_tail(&dserr->list, head);
  235. }
  236. int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
  237. struct nfs4_ff_layout_mirror *mirror, u64 offset,
  238. u64 length, int status, enum nfs_opnum4 opnum,
  239. gfp_t gfp_flags)
  240. {
  241. struct nfs4_ff_layout_ds_err *dserr;
  242. if (status == 0)
  243. return 0;
  244. if (mirror->mirror_ds == NULL)
  245. return -EINVAL;
  246. dserr = kmalloc(sizeof(*dserr), gfp_flags);
  247. if (!dserr)
  248. return -ENOMEM;
  249. INIT_LIST_HEAD(&dserr->list);
  250. dserr->offset = offset;
  251. dserr->length = length;
  252. dserr->status = status;
  253. dserr->opnum = opnum;
  254. nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
  255. memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
  256. NFS4_DEVICEID4_SIZE);
  257. spin_lock(&flo->generic_hdr.plh_inode->i_lock);
  258. ff_layout_add_ds_error_locked(flo, dserr);
  259. spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
  260. return 0;
  261. }
  262. static struct rpc_cred *
  263. ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
  264. {
  265. struct rpc_cred *cred, __rcu **pcred;
  266. if (iomode == IOMODE_READ)
  267. pcred = &mirror->ro_cred;
  268. else
  269. pcred = &mirror->rw_cred;
  270. rcu_read_lock();
  271. do {
  272. cred = rcu_dereference(*pcred);
  273. if (!cred)
  274. break;
  275. cred = get_rpccred_rcu(cred);
  276. } while(!cred);
  277. rcu_read_unlock();
  278. return cred;
  279. }
  280. struct nfs_fh *
  281. nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
  282. {
  283. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
  284. struct nfs_fh *fh = NULL;
  285. if (!ff_layout_mirror_valid(lseg, mirror)) {
  286. pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
  287. __func__, mirror_idx);
  288. goto out;
  289. }
  290. /* FIXME: For now assume there is only 1 version available for the DS */
  291. fh = &mirror->fh_versions[0];
  292. out:
  293. return fh;
  294. }
  295. /**
  296. * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
  297. * @lseg: the layout segment we're operating on
  298. * @ds_idx: index of the DS to use
  299. * @fail_return: return layout on connect failure?
  300. *
  301. * Try to prepare a DS connection to accept an RPC call. This involves
  302. * selecting a mirror to use and connecting the client to it if it's not
  303. * already connected.
  304. *
  305. * Since we only need a single functioning mirror to satisfy a read, we don't
  306. * want to return the layout if there is one. For writes though, any down
  307. * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
  308. * between the two cases.
  309. *
  310. * Returns a pointer to a connected DS object on success or NULL on failure.
  311. */
  312. struct nfs4_pnfs_ds *
  313. nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
  314. bool fail_return)
  315. {
  316. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  317. struct nfs4_pnfs_ds *ds = NULL;
  318. struct nfs4_deviceid_node *devid;
  319. struct inode *ino = lseg->pls_layout->plh_inode;
  320. struct nfs_server *s = NFS_SERVER(ino);
  321. unsigned int max_payload;
  322. if (!ff_layout_mirror_valid(lseg, mirror)) {
  323. pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
  324. __func__, ds_idx);
  325. goto out;
  326. }
  327. devid = &mirror->mirror_ds->id_node;
  328. if (ff_layout_test_devid_unavailable(devid))
  329. goto out_fail;
  330. ds = mirror->mirror_ds->ds;
  331. /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
  332. smp_rmb();
  333. if (ds->ds_clp)
  334. goto out;
  335. /* FIXME: For now we assume the server sent only one version of NFS
  336. * to use for the DS.
  337. */
  338. nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
  339. dataserver_retrans,
  340. mirror->mirror_ds->ds_versions[0].version,
  341. mirror->mirror_ds->ds_versions[0].minor_version,
  342. RPC_AUTH_UNIX);
  343. /* connect success, check rsize/wsize limit */
  344. if (ds->ds_clp) {
  345. max_payload =
  346. nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
  347. NULL);
  348. if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
  349. mirror->mirror_ds->ds_versions[0].rsize = max_payload;
  350. if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
  351. mirror->mirror_ds->ds_versions[0].wsize = max_payload;
  352. goto out;
  353. }
  354. ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
  355. mirror, lseg->pls_range.offset,
  356. lseg->pls_range.length, NFS4ERR_NXIO,
  357. OP_ILLEGAL, GFP_NOIO);
  358. out_fail:
  359. if (fail_return || !ff_layout_has_available_ds(lseg))
  360. pnfs_error_mark_layout_for_return(ino, lseg);
  361. ds = NULL;
  362. out:
  363. return ds;
  364. }
  365. struct rpc_cred *
  366. ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
  367. struct rpc_cred *mdscred)
  368. {
  369. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  370. struct rpc_cred *cred;
  371. if (mirror) {
  372. cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
  373. if (!cred)
  374. cred = get_rpccred(mdscred);
  375. } else {
  376. cred = get_rpccred(mdscred);
  377. }
  378. return cred;
  379. }
  380. /**
  381. * Find or create a DS rpc client with th MDS server rpc client auth flavor
  382. * in the nfs_client cl_ds_clients list.
  383. */
  384. struct rpc_clnt *
  385. nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
  386. struct nfs_client *ds_clp, struct inode *inode)
  387. {
  388. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  389. switch (mirror->mirror_ds->ds_versions[0].version) {
  390. case 3:
  391. /* For NFSv3 DS, flavor is set when creating DS connections */
  392. return ds_clp->cl_rpcclient;
  393. case 4:
  394. return nfs4_find_or_create_ds_client(ds_clp, inode);
  395. default:
  396. BUG();
  397. }
  398. }
  399. static bool is_range_intersecting(u64 offset1, u64 length1,
  400. u64 offset2, u64 length2)
  401. {
  402. u64 end1 = end_offset(offset1, length1);
  403. u64 end2 = end_offset(offset2, length2);
  404. return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
  405. (end2 == NFS4_MAX_UINT64 || end2 > offset1);
  406. }
  407. /* called with inode i_lock held */
  408. int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
  409. struct xdr_stream *xdr, int *count,
  410. const struct pnfs_layout_range *range)
  411. {
  412. struct nfs4_ff_layout_ds_err *err, *n;
  413. __be32 *p;
  414. list_for_each_entry_safe(err, n, &flo->error_list, list) {
  415. if (!is_range_intersecting(err->offset, err->length,
  416. range->offset, range->length))
  417. continue;
  418. /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
  419. * + array length + deviceid(NFS4_DEVICEID4_SIZE)
  420. * + status(4) + opnum(4)
  421. */
  422. p = xdr_reserve_space(xdr,
  423. 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
  424. if (unlikely(!p))
  425. return -ENOBUFS;
  426. p = xdr_encode_hyper(p, err->offset);
  427. p = xdr_encode_hyper(p, err->length);
  428. p = xdr_encode_opaque_fixed(p, &err->stateid,
  429. NFS4_STATEID_SIZE);
  430. /* Encode 1 error */
  431. *p++ = cpu_to_be32(1);
  432. p = xdr_encode_opaque_fixed(p, &err->deviceid,
  433. NFS4_DEVICEID4_SIZE);
  434. *p++ = cpu_to_be32(err->status);
  435. *p++ = cpu_to_be32(err->opnum);
  436. *count += 1;
  437. list_del(&err->list);
  438. dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
  439. __func__, err->offset, err->length, err->status,
  440. err->opnum, *count);
  441. kfree(err);
  442. }
  443. return 0;
  444. }
  445. static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
  446. {
  447. struct nfs4_ff_layout_mirror *mirror;
  448. struct nfs4_deviceid_node *devid;
  449. u32 idx;
  450. for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
  451. mirror = FF_LAYOUT_COMP(lseg, idx);
  452. if (mirror && mirror->mirror_ds) {
  453. devid = &mirror->mirror_ds->id_node;
  454. if (!ff_layout_test_devid_unavailable(devid))
  455. return true;
  456. }
  457. }
  458. return false;
  459. }
  460. static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
  461. {
  462. struct nfs4_ff_layout_mirror *mirror;
  463. struct nfs4_deviceid_node *devid;
  464. u32 idx;
  465. for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
  466. mirror = FF_LAYOUT_COMP(lseg, idx);
  467. if (!mirror || !mirror->mirror_ds)
  468. return false;
  469. devid = &mirror->mirror_ds->id_node;
  470. if (ff_layout_test_devid_unavailable(devid))
  471. return false;
  472. }
  473. return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
  474. }
  475. bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
  476. {
  477. if (lseg->pls_range.iomode == IOMODE_READ)
  478. return ff_read_layout_has_available_ds(lseg);
  479. /* Note: RW layout needs all mirrors available */
  480. return ff_rw_layout_has_available_ds(lseg);
  481. }
  482. bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
  483. {
  484. return ff_layout_no_fallback_to_mds(lseg) ||
  485. ff_layout_has_available_ds(lseg);
  486. }
  487. bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
  488. {
  489. return lseg->pls_range.iomode == IOMODE_RW &&
  490. ff_layout_no_read_on_rw(lseg);
  491. }
  492. module_param(dataserver_retrans, uint, 0644);
  493. MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
  494. "retries a request before it attempts further "
  495. " recovery action.");
  496. module_param(dataserver_timeo, uint, 0644);
  497. MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
  498. "NFSv4.1 client waits for a response from a "
  499. " data server before it retries an NFS request.");