flexfilelayoutdev.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. /*
  2. * Device operations for the pnfs nfs4 file layout driver.
  3. *
  4. * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
  5. *
  6. * Tao Peng <bergwolf@primarydata.com>
  7. */
  8. #include <linux/nfs_fs.h>
  9. #include <linux/vmalloc.h>
  10. #include <linux/module.h>
  11. #include <linux/sunrpc/addr.h>
  12. #include "../internal.h"
  13. #include "../nfs4session.h"
  14. #include "flexfilelayout.h"
  15. #define NFSDBG_FACILITY NFSDBG_PNFS_LD
  16. static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
  17. static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
  18. void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  19. {
  20. if (mirror_ds)
  21. nfs4_put_deviceid_node(&mirror_ds->id_node);
  22. }
  23. void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  24. {
  25. nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
  26. nfs4_pnfs_ds_put(mirror_ds->ds);
  27. kfree_rcu(mirror_ds, id_node.rcu);
  28. }
  29. /* Decode opaque device data and construct new_ds using it */
  30. struct nfs4_ff_layout_ds *
  31. nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
  32. gfp_t gfp_flags)
  33. {
  34. struct xdr_stream stream;
  35. struct xdr_buf buf;
  36. struct page *scratch;
  37. struct list_head dsaddrs;
  38. struct nfs4_pnfs_ds_addr *da;
  39. struct nfs4_ff_layout_ds *new_ds = NULL;
  40. struct nfs4_ff_ds_version *ds_versions = NULL;
  41. u32 mp_count;
  42. u32 version_count;
  43. __be32 *p;
  44. int i, ret = -ENOMEM;
  45. /* set up xdr stream */
  46. scratch = alloc_page(gfp_flags);
  47. if (!scratch)
  48. goto out_err;
  49. new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
  50. if (!new_ds)
  51. goto out_scratch;
  52. nfs4_init_deviceid_node(&new_ds->id_node,
  53. server,
  54. &pdev->dev_id);
  55. INIT_LIST_HEAD(&dsaddrs);
  56. xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
  57. xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
  58. /* multipath count */
  59. p = xdr_inline_decode(&stream, 4);
  60. if (unlikely(!p))
  61. goto out_err_drain_dsaddrs;
  62. mp_count = be32_to_cpup(p);
  63. dprintk("%s: multipath ds count %d\n", __func__, mp_count);
  64. for (i = 0; i < mp_count; i++) {
  65. /* multipath ds */
  66. da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
  67. &stream, gfp_flags);
  68. if (da)
  69. list_add_tail(&da->da_node, &dsaddrs);
  70. }
  71. if (list_empty(&dsaddrs)) {
  72. dprintk("%s: no suitable DS addresses found\n",
  73. __func__);
  74. ret = -ENOMEDIUM;
  75. goto out_err_drain_dsaddrs;
  76. }
  77. /* version count */
  78. p = xdr_inline_decode(&stream, 4);
  79. if (unlikely(!p))
  80. goto out_err_drain_dsaddrs;
  81. version_count = be32_to_cpup(p);
  82. dprintk("%s: version count %d\n", __func__, version_count);
  83. ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
  84. gfp_flags);
  85. if (!ds_versions)
  86. goto out_scratch;
  87. for (i = 0; i < version_count; i++) {
  88. /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
  89. * tightly_coupled(4) */
  90. p = xdr_inline_decode(&stream, 20);
  91. if (unlikely(!p))
  92. goto out_err_drain_dsaddrs;
  93. ds_versions[i].version = be32_to_cpup(p++);
  94. ds_versions[i].minor_version = be32_to_cpup(p++);
  95. ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
  96. ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
  97. ds_versions[i].tightly_coupled = be32_to_cpup(p);
  98. if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
  99. ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
  100. if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
  101. ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
  102. if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
  103. dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
  104. i, ds_versions[i].version,
  105. ds_versions[i].minor_version);
  106. ret = -EPROTONOSUPPORT;
  107. goto out_err_drain_dsaddrs;
  108. }
  109. dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
  110. __func__, i, ds_versions[i].version,
  111. ds_versions[i].minor_version,
  112. ds_versions[i].rsize,
  113. ds_versions[i].wsize,
  114. ds_versions[i].tightly_coupled);
  115. }
  116. new_ds->ds_versions = ds_versions;
  117. new_ds->ds_versions_cnt = version_count;
  118. new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
  119. if (!new_ds->ds)
  120. goto out_err_drain_dsaddrs;
  121. /* If DS was already in cache, free ds addrs */
  122. while (!list_empty(&dsaddrs)) {
  123. da = list_first_entry(&dsaddrs,
  124. struct nfs4_pnfs_ds_addr,
  125. da_node);
  126. list_del_init(&da->da_node);
  127. kfree(da->da_remotestr);
  128. kfree(da);
  129. }
  130. __free_page(scratch);
  131. return new_ds;
  132. out_err_drain_dsaddrs:
  133. while (!list_empty(&dsaddrs)) {
  134. da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
  135. da_node);
  136. list_del_init(&da->da_node);
  137. kfree(da->da_remotestr);
  138. kfree(da);
  139. }
  140. kfree(ds_versions);
  141. out_scratch:
  142. __free_page(scratch);
  143. out_err:
  144. kfree(new_ds);
  145. dprintk("%s ERROR: returning %d\n", __func__, ret);
  146. return NULL;
  147. }
  148. static u64
  149. end_offset(u64 start, u64 len)
  150. {
  151. u64 end;
  152. end = start + len;
  153. return end >= start ? end : NFS4_MAX_UINT64;
  154. }
  155. static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
  156. u64 offset, u64 length)
  157. {
  158. u64 end;
  159. end = max_t(u64, end_offset(err->offset, err->length),
  160. end_offset(offset, length));
  161. err->offset = min_t(u64, err->offset, offset);
  162. err->length = end - err->offset;
  163. }
  164. static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
  165. u64 length, int status, enum nfs_opnum4 opnum,
  166. nfs4_stateid *stateid,
  167. struct nfs4_deviceid *deviceid)
  168. {
  169. return err->status == status && err->opnum == opnum &&
  170. nfs4_stateid_match(&err->stateid, stateid) &&
  171. !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
  172. end_offset(err->offset, err->length) >= offset &&
  173. err->offset <= end_offset(offset, length);
  174. }
  175. static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
  176. struct nfs4_ff_layout_ds_err *new)
  177. {
  178. if (!ds_error_can_merge(old, new->offset, new->length, new->status,
  179. new->opnum, &new->stateid, &new->deviceid))
  180. return false;
  181. extend_ds_error(old, new->offset, new->length);
  182. return true;
  183. }
  184. static bool
  185. ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
  186. struct nfs4_ff_layout_ds_err *dserr)
  187. {
  188. struct nfs4_ff_layout_ds_err *err;
  189. list_for_each_entry(err, &flo->error_list, list) {
  190. if (merge_ds_error(err, dserr)) {
  191. return true;
  192. }
  193. }
  194. list_add(&dserr->list, &flo->error_list);
  195. return false;
  196. }
  197. static bool
  198. ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
  199. u64 length, int status, enum nfs_opnum4 opnum,
  200. nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
  201. {
  202. bool found = false;
  203. struct nfs4_ff_layout_ds_err *err;
  204. list_for_each_entry(err, &flo->error_list, list) {
  205. if (ds_error_can_merge(err, offset, length, status, opnum,
  206. stateid, deviceid)) {
  207. found = true;
  208. extend_ds_error(err, offset, length);
  209. break;
  210. }
  211. }
  212. return found;
  213. }
  214. int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
  215. struct nfs4_ff_layout_mirror *mirror, u64 offset,
  216. u64 length, int status, enum nfs_opnum4 opnum,
  217. gfp_t gfp_flags)
  218. {
  219. struct nfs4_ff_layout_ds_err *dserr;
  220. bool needfree;
  221. if (status == 0)
  222. return 0;
  223. if (mirror->mirror_ds == NULL)
  224. return -EINVAL;
  225. spin_lock(&flo->generic_hdr.plh_inode->i_lock);
  226. if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
  227. &mirror->stateid,
  228. &mirror->mirror_ds->id_node.deviceid)) {
  229. spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
  230. return 0;
  231. }
  232. spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
  233. dserr = kmalloc(sizeof(*dserr), gfp_flags);
  234. if (!dserr)
  235. return -ENOMEM;
  236. INIT_LIST_HEAD(&dserr->list);
  237. dserr->offset = offset;
  238. dserr->length = length;
  239. dserr->status = status;
  240. dserr->opnum = opnum;
  241. nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
  242. memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
  243. NFS4_DEVICEID4_SIZE);
  244. spin_lock(&flo->generic_hdr.plh_inode->i_lock);
  245. needfree = ff_layout_add_ds_error_locked(flo, dserr);
  246. spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
  247. if (needfree)
  248. kfree(dserr);
  249. return 0;
  250. }
  251. /* currently we only support AUTH_NONE and AUTH_SYS */
  252. static rpc_authflavor_t
  253. nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
  254. {
  255. if (mirror->uid == (u32)-1)
  256. return RPC_AUTH_NULL;
  257. return RPC_AUTH_UNIX;
  258. }
  259. /* fetch cred for NFSv3 DS */
  260. static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
  261. struct nfs4_pnfs_ds *ds)
  262. {
  263. if (ds->ds_clp && !mirror->cred &&
  264. mirror->mirror_ds->ds_versions[0].version == 3) {
  265. struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
  266. struct rpc_cred *cred;
  267. struct auth_cred acred = {
  268. .uid = make_kuid(&init_user_ns, mirror->uid),
  269. .gid = make_kgid(&init_user_ns, mirror->gid),
  270. };
  271. /* AUTH_NULL ignores acred */
  272. cred = auth->au_ops->lookup_cred(auth, &acred, 0);
  273. if (IS_ERR(cred)) {
  274. dprintk("%s: lookup_cred failed with %ld\n",
  275. __func__, PTR_ERR(cred));
  276. return PTR_ERR(cred);
  277. } else {
  278. if (cmpxchg(&mirror->cred, NULL, cred))
  279. put_rpccred(cred);
  280. }
  281. }
  282. return 0;
  283. }
  284. struct nfs_fh *
  285. nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
  286. {
  287. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
  288. struct nfs_fh *fh = NULL;
  289. struct nfs4_deviceid_node *devid;
  290. if (mirror == NULL || mirror->mirror_ds == NULL ||
  291. mirror->mirror_ds->ds == NULL) {
  292. printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
  293. __func__, mirror_idx);
  294. if (mirror && mirror->mirror_ds) {
  295. devid = &mirror->mirror_ds->id_node;
  296. pnfs_generic_mark_devid_invalid(devid);
  297. }
  298. goto out;
  299. }
  300. /* FIXME: For now assume there is only 1 version available for the DS */
  301. fh = &mirror->fh_versions[0];
  302. out:
  303. return fh;
  304. }
  305. /* Upon return, either ds is connected, or ds is NULL */
  306. struct nfs4_pnfs_ds *
  307. nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
  308. bool fail_return)
  309. {
  310. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  311. struct nfs4_pnfs_ds *ds = NULL;
  312. struct nfs4_deviceid_node *devid;
  313. struct inode *ino = lseg->pls_layout->plh_inode;
  314. struct nfs_server *s = NFS_SERVER(ino);
  315. unsigned int max_payload;
  316. rpc_authflavor_t flavor;
  317. if (mirror == NULL || mirror->mirror_ds == NULL ||
  318. mirror->mirror_ds->ds == NULL) {
  319. printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
  320. __func__, ds_idx);
  321. if (mirror && mirror->mirror_ds) {
  322. devid = &mirror->mirror_ds->id_node;
  323. pnfs_generic_mark_devid_invalid(devid);
  324. }
  325. goto out;
  326. }
  327. devid = &mirror->mirror_ds->id_node;
  328. if (ff_layout_test_devid_unavailable(devid))
  329. goto out;
  330. ds = mirror->mirror_ds->ds;
  331. /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
  332. smp_rmb();
  333. if (ds->ds_clp)
  334. goto out_update_creds;
  335. flavor = nfs4_ff_layout_choose_authflavor(mirror);
  336. /* FIXME: For now we assume the server sent only one version of NFS
  337. * to use for the DS.
  338. */
  339. nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
  340. dataserver_retrans,
  341. mirror->mirror_ds->ds_versions[0].version,
  342. mirror->mirror_ds->ds_versions[0].minor_version,
  343. flavor);
  344. /* connect success, check rsize/wsize limit */
  345. if (ds->ds_clp) {
  346. max_payload =
  347. nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
  348. NULL);
  349. if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
  350. mirror->mirror_ds->ds_versions[0].rsize = max_payload;
  351. if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
  352. mirror->mirror_ds->ds_versions[0].wsize = max_payload;
  353. } else {
  354. ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
  355. mirror, lseg->pls_range.offset,
  356. lseg->pls_range.length, NFS4ERR_NXIO,
  357. OP_ILLEGAL, GFP_NOIO);
  358. if (fail_return) {
  359. pnfs_error_mark_layout_for_return(ino, lseg);
  360. if (ff_layout_has_available_ds(lseg))
  361. pnfs_set_retry_layoutget(lseg->pls_layout);
  362. else
  363. pnfs_clear_retry_layoutget(lseg->pls_layout);
  364. } else {
  365. if (ff_layout_has_available_ds(lseg))
  366. set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
  367. &lseg->pls_layout->plh_flags);
  368. else {
  369. pnfs_error_mark_layout_for_return(ino, lseg);
  370. pnfs_clear_retry_layoutget(lseg->pls_layout);
  371. }
  372. }
  373. }
  374. out_update_creds:
  375. if (ff_layout_update_mirror_cred(mirror, ds))
  376. ds = NULL;
  377. out:
  378. return ds;
  379. }
  380. struct rpc_cred *
  381. ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
  382. struct rpc_cred *mdscred)
  383. {
  384. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  385. struct rpc_cred *cred = ERR_PTR(-EINVAL);
  386. if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
  387. goto out;
  388. if (mirror && mirror->cred)
  389. cred = mirror->cred;
  390. else
  391. cred = mdscred;
  392. out:
  393. return cred;
  394. }
  395. /**
  396. * Find or create a DS rpc client with th MDS server rpc client auth flavor
  397. * in the nfs_client cl_ds_clients list.
  398. */
  399. struct rpc_clnt *
  400. nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
  401. struct nfs_client *ds_clp, struct inode *inode)
  402. {
  403. struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
  404. switch (mirror->mirror_ds->ds_versions[0].version) {
  405. case 3:
  406. /* For NFSv3 DS, flavor is set when creating DS connections */
  407. return ds_clp->cl_rpcclient;
  408. case 4:
  409. return nfs4_find_or_create_ds_client(ds_clp, inode);
  410. default:
  411. BUG();
  412. }
  413. }
  414. static bool is_range_intersecting(u64 offset1, u64 length1,
  415. u64 offset2, u64 length2)
  416. {
  417. u64 end1 = end_offset(offset1, length1);
  418. u64 end2 = end_offset(offset2, length2);
  419. return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
  420. (end2 == NFS4_MAX_UINT64 || end2 > offset1);
  421. }
  422. /* called with inode i_lock held */
  423. int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
  424. struct xdr_stream *xdr, int *count,
  425. const struct pnfs_layout_range *range)
  426. {
  427. struct nfs4_ff_layout_ds_err *err, *n;
  428. __be32 *p;
  429. list_for_each_entry_safe(err, n, &flo->error_list, list) {
  430. if (!is_range_intersecting(err->offset, err->length,
  431. range->offset, range->length))
  432. continue;
  433. /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
  434. * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
  435. */
  436. p = xdr_reserve_space(xdr,
  437. 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
  438. if (unlikely(!p))
  439. return -ENOBUFS;
  440. p = xdr_encode_hyper(p, err->offset);
  441. p = xdr_encode_hyper(p, err->length);
  442. p = xdr_encode_opaque_fixed(p, &err->stateid,
  443. NFS4_STATEID_SIZE);
  444. p = xdr_encode_opaque_fixed(p, &err->deviceid,
  445. NFS4_DEVICEID4_SIZE);
  446. *p++ = cpu_to_be32(err->status);
  447. *p++ = cpu_to_be32(err->opnum);
  448. *count += 1;
  449. list_del(&err->list);
  450. dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
  451. __func__, err->offset, err->length, err->status,
  452. err->opnum, *count);
  453. kfree(err);
  454. }
  455. return 0;
  456. }
  457. bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
  458. {
  459. struct nfs4_ff_layout_mirror *mirror;
  460. struct nfs4_deviceid_node *devid;
  461. int idx;
  462. for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
  463. mirror = FF_LAYOUT_COMP(lseg, idx);
  464. if (mirror && mirror->mirror_ds) {
  465. devid = &mirror->mirror_ds->id_node;
  466. if (!ff_layout_test_devid_unavailable(devid))
  467. return true;
  468. }
  469. }
  470. return false;
  471. }
  472. module_param(dataserver_retrans, uint, 0644);
  473. MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
  474. "retries a request before it attempts further "
  475. " recovery action.");
  476. module_param(dataserver_timeo, uint, 0644);
  477. MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
  478. "NFSv4.1 client waits for a response from a "
  479. " data server before it retries an NFS request.");