/* objio_osd.c */
  1. /*
  2. * pNFS Objects layout implementation over open-osd initiator library
  3. *
  4. * Copyright (C) 2009 Panasas Inc. [year of first publication]
  5. * All rights reserved.
  6. *
  7. * Benny Halevy <bhalevy@panasas.com>
  8. * Boaz Harrosh <bharrosh@panasas.com>
  9. *
  10. * This program is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License version 2
  12. * See the file COPYING included with this distribution for more details.
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *
  18. * 1. Redistributions of source code must retain the above copyright
  19. * notice, this list of conditions and the following disclaimer.
  20. * 2. Redistributions in binary form must reproduce the above copyright
  21. * notice, this list of conditions and the following disclaimer in the
  22. * documentation and/or other materials provided with the distribution.
  23. * 3. Neither the name of the Panasas company nor the names of its
  24. * contributors may be used to endorse or promote products derived
  25. * from this software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
  28. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  29. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  30. * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  31. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  34. * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. */
  39. #include <linux/module.h>
  40. #include <scsi/osd_initiator.h>
  41. #include "objlayout.h"
  42. #define NFSDBG_FACILITY NFSDBG_PNFS_LD
  43. #define _LLU(x) ((unsigned long long)x)
  44. enum { BIO_MAX_PAGES_KMALLOC =
  45. (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
  46. };
/*
 * A cached OSD device: the generic pNFS deviceid cache node plus the open
 * osd_dev handle obtained from the osd_initiator library.
 */
struct objio_dev_ent {
	struct nfs4_deviceid_node id_node; /* embedded deviceid-cache entry */
	struct osd_dev *od;                /* open device, put on free */
};
/*
 * Free an objio_dev_ent: drop the osd_initiator device reference and release
 * the entry.  Installed as the layout driver's .free_deviceid_node hook, so
 * it runs when the last reference on the nfs4_deviceid_node is put.
 */
static void
objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
	struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);

	dprintk("%s: free od=%p\n", __func__, de->od);
	osduld_put_device(de->od);
	kfree(de);
}
  59. static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
  60. const struct nfs4_deviceid *d_id)
  61. {
  62. struct nfs4_deviceid_node *d;
  63. struct objio_dev_ent *de;
  64. d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
  65. if (!d)
  66. return NULL;
  67. de = container_of(d, struct objio_dev_ent, id_node);
  68. return de;
  69. }
/*
 * Allocate an objio_dev_ent for (@d_id, @od) and insert it into the global
 * deviceid cache.  If another thread raced us and inserted an entry for the
 * same deviceid first, free ours and return the winner's entry instead.
 *
 * Returns the cached entry, or NULL on allocation failure.
 */
static struct objio_dev_ent *
_dev_list_add(const struct nfs_server *nfss,
	const struct nfs4_deviceid *d_id, struct osd_dev *od,
	gfp_t gfp_flags)
{
	struct nfs4_deviceid_node *d;
	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
	struct objio_dev_ent *n;

	if (!de) {
		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
		return NULL;
	}

	dprintk("%s: Adding od=%p\n", __func__, od);
	nfs4_init_deviceid_node(&de->id_node,
				nfss->pnfs_curr_ld,
				nfss->nfs_client,
				d_id);
	de->od = od;

	d = nfs4_insert_deviceid_node(&de->id_node);
	n = container_of(d, struct objio_dev_ent, id_node);
	if (n != de) {
		/* Lost the insertion race; drop ours, use the existing one. */
		dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
		objio_free_deviceid_node(&de->id_node);
		de = n;
	}
	return de;
}
/*
 * Per-component backing store for OSD credential blobs.  The decoded
 * pnfs_osd_object_cred pointers are redirected into these buffers by
 * copy_single_comp() so the creds survive after the XDR stream is gone.
 */
struct caps_buffers {
	u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
	u8 creds[OSD_CAP_LEN];
};
/*
 * objio's view of a layout segment: the generic lseg plus the decoded
 * striping geometry and the per-component credentials/devices.
 */
struct objio_segment {
	struct pnfs_layout_segment lseg;

	struct pnfs_osd_object_cred *comps; /* decoded components, num_comps long */

	unsigned mirrors_p1;   /* mirror count + 1 (so always >= 1) */
	unsigned stripe_unit;
	unsigned group_width;	/* Data stripe_units without integrity comps */
	u64 group_depth;
	unsigned group_count;

	unsigned max_io_size;  /* cached largest IO we can map in one go */

	unsigned comps_index;  /* first device index covered by this layout */
	unsigned num_comps;
	/* variable length */
	struct objio_dev_ent *ods[]; /* one resolved device per component */
};
/* Convert a generic layout segment to its enclosing objio_segment. */
static inline struct objio_segment *
OBJIO_LSEG(struct pnfs_layout_segment *lseg)
{
	return container_of(lseg, struct objio_segment, lseg);
}
struct objio_state;
/* Completion callback invoked when the last OSD request of an IO finishes */
typedef ssize_t (*objio_done_fn)(struct objio_state *ios);

/*
 * State of a single in-flight read or write, spanning one request per
 * participating device.  Allocated by objio_alloc_io_state() with the
 * per_dev[] flexible array sized to the segment's num_comps.
 */
struct objio_state {
	/* Generic layer */
	struct objlayout_io_state ol_state;

	struct objio_segment *layout;

	struct kref kref;       /* one ref per outstanding OSD request, +1 for submit */
	objio_done_fn done;     /* called from the last kref put */
	void *private;          /* done-callback cookie (completion in sync mode) */

	unsigned long length;   /* total bytes mapped into this IO */
	unsigned numdevs;	/* Actually used devs in this IO */
	/* A per-device variable array of size numdevs */
	struct _objio_per_comp {
		struct bio *bio;
		struct osd_request *or;
		unsigned long length;
		u64 offset;      /* object-local byte offset */
		unsigned dev;    /* comps_index-based device number */
	} per_dev[];
};
  140. /* Send and wait for a get_device_info of devices in the layout,
  141. then look them up with the osd_initiator library */
  142. static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
  143. struct objio_segment *objio_seg, unsigned comp,
  144. gfp_t gfp_flags)
  145. {
  146. struct pnfs_osd_deviceaddr *deviceaddr;
  147. struct nfs4_deviceid *d_id;
  148. struct objio_dev_ent *ode;
  149. struct osd_dev *od;
  150. struct osd_dev_info odi;
  151. int err;
  152. d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
  153. ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
  154. if (ode)
  155. return ode;
  156. err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
  157. if (unlikely(err)) {
  158. dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
  159. __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
  160. return ERR_PTR(err);
  161. }
  162. odi.systemid_len = deviceaddr->oda_systemid.len;
  163. if (odi.systemid_len > sizeof(odi.systemid)) {
  164. err = -EINVAL;
  165. goto out;
  166. } else if (odi.systemid_len)
  167. memcpy(odi.systemid, deviceaddr->oda_systemid.data,
  168. odi.systemid_len);
  169. odi.osdname_len = deviceaddr->oda_osdname.len;
  170. odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
  171. if (!odi.osdname_len && !odi.systemid_len) {
  172. dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
  173. __func__);
  174. err = -ENODEV;
  175. goto out;
  176. }
  177. od = osduld_info_lookup(&odi);
  178. if (unlikely(IS_ERR(od))) {
  179. err = PTR_ERR(od);
  180. dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
  181. goto out;
  182. }
  183. ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
  184. gfp_flags);
  185. out:
  186. dprintk("%s: return=%d\n", __func__, err);
  187. objlayout_put_deviceinfo(deviceaddr);
  188. return err ? ERR_PTR(err) : ode;
  189. }
/*
 * Resolve every component device of the segment, filling objio_seg->ods[].
 * On error the already-resolved prefix of ods[] keeps its references; they
 * are released by objio_free_lseg() (which stops at the first NULL slot).
 */
static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
	struct objio_segment *objio_seg,
	gfp_t gfp_flags)
{
	unsigned i;
	int err;

	/* lookup all devices */
	for (i = 0; i < objio_seg->num_comps; i++) {
		struct objio_dev_ent *ode;

		ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
		if (unlikely(IS_ERR(ode))) {
			err = PTR_ERR(ode);
			goto out;
		}
		objio_seg->ods[i] = ode;
	}
	err = 0;

out:
	dprintk("%s: return=%d\n", __func__, err);
	return err;
}
  211. static int _verify_data_map(struct pnfs_osd_layout *layout)
  212. {
  213. struct pnfs_osd_data_map *data_map = &layout->olo_map;
  214. u64 stripe_length;
  215. u32 group_width;
  216. /* FIXME: Only raid0 for now. if not go through MDS */
  217. if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
  218. printk(KERN_ERR "Only RAID_0 for now\n");
  219. return -ENOTSUPP;
  220. }
  221. if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
  222. printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
  223. data_map->odm_num_comps, data_map->odm_mirror_cnt);
  224. return -EINVAL;
  225. }
  226. if (data_map->odm_group_width)
  227. group_width = data_map->odm_group_width;
  228. else
  229. group_width = data_map->odm_num_comps /
  230. (data_map->odm_mirror_cnt + 1);
  231. stripe_length = (u64)data_map->odm_stripe_unit * group_width;
  232. if (stripe_length >= (1ULL << 32)) {
  233. printk(KERN_ERR "Total Stripe length(0x%llx)"
  234. " >= 32bit is not supported\n", _LLU(stripe_length));
  235. return -ENOTSUPP;
  236. }
  237. if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
  238. printk(KERN_ERR "Stripe Unit(0x%llx)"
  239. " must be Multples of PAGE_SIZE(0x%lx)\n",
  240. _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
  241. return -ENOTSUPP;
  242. }
  243. return 0;
  244. }
/*
 * Copy one decoded component credential into the segment, re-pointing the
 * cap-key and capability blobs at the segment-owned caps_buffers so they
 * remain valid after the XDR decode buffer is released.
 */
static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
	struct pnfs_osd_object_cred *src_comp,
	struct caps_buffers *caps_p)
{
	/* The on-the-wire creds must fit in our fixed-size buffers */
	WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
	WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));

	*cur_comp = *src_comp;

	memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
	       sizeof(caps_p->caps_key));
	cur_comp->oc_cap_key.cred = caps_p->caps_key;

	memcpy(caps_p->creds, src_comp->oc_cap.cred,
	       sizeof(caps_p->creds));
	cur_comp->oc_cap.cred = caps_p->creds;
}
/*
 * Decode an objects-layout XDR stream into a new objio_segment and resolve
 * its devices.  The segment, its ods[] pointer array, comps[] credential
 * array, and the caps_buffers backing store are carved out of one kzalloc.
 *
 * On success *outp points at the embedded generic lseg; on failure *outp is
 * NULL and a negative errno is returned.
 */
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
	struct pnfs_layout_hdr *pnfslay,
	struct pnfs_layout_range *range,
	struct xdr_stream *xdr,
	gfp_t gfp_flags)
{
	struct objio_segment *objio_seg;
	struct pnfs_osd_xdr_decode_layout_iter iter;
	struct pnfs_osd_layout layout;
	struct pnfs_osd_object_cred *cur_comp, src_comp;
	struct caps_buffers *caps_p;
	int err;

	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
	if (unlikely(err))
		return err;

	err = _verify_data_map(&layout);
	if (unlikely(err))
		return err;

	/* One allocation: header + ods[] + comps[] + cred buffers */
	objio_seg = kzalloc(sizeof(*objio_seg) +
			    sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
			    sizeof(*objio_seg->comps) * layout.olo_num_comps +
			    sizeof(struct caps_buffers) * layout.olo_num_comps,
			    gfp_flags);
	if (!objio_seg)
		return -ENOMEM;

	objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
	cur_comp = objio_seg->comps;
	caps_p = (void *)(cur_comp + layout.olo_num_comps);
	/* Decode each component, deep-copying its creds into caps_p */
	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
		copy_single_comp(cur_comp++, &src_comp, caps_p++);
	if (unlikely(err))
		goto err;

	objio_seg->num_comps = layout.olo_num_comps;
	objio_seg->comps_index = layout.olo_comps_index;
	err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
	if (err)
		goto err;

	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
	if (layout.olo_map.odm_group_width) {
		objio_seg->group_width = layout.olo_map.odm_group_width;
		objio_seg->group_depth = layout.olo_map.odm_group_depth;
		objio_seg->group_count = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1 /
						objio_seg->group_width;
	} else {
		/* Ungrouped: one group spanning all data devices */
		objio_seg->group_width = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1;
		objio_seg->group_depth = -1;
		objio_seg->group_count = 1;
	}

	/* Cache this calculation it will hit for every page */
	objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
				  objio_seg->stripe_unit) *
				 objio_seg->group_width;

	*outp = &objio_seg->lseg;
	return 0;

err:
	kfree(objio_seg);
	dprintk("%s: Error: return %d\n", __func__, err);
	*outp = NULL;
	return err;
}
  322. void objio_free_lseg(struct pnfs_layout_segment *lseg)
  323. {
  324. int i;
  325. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  326. for (i = 0; i < objio_seg->num_comps; i++) {
  327. if (!objio_seg->ods[i])
  328. break;
  329. nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
  330. }
  331. kfree(objio_seg);
  332. }
/*
 * Allocate the per-IO state for @lseg: one objio_state whose per_dev[]
 * flexible array and ol_state.ioerrs[] array are both sized to num_comps
 * and carved from a single zeroed allocation.
 */
int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
			 struct objlayout_io_state **outp,
			 gfp_t gfp_flags)
{
	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
	struct objio_state *ios;
	const unsigned first_size = sizeof(*ios) +
				objio_seg->num_comps * sizeof(ios->per_dev[0]);
	const unsigned sec_size = objio_seg->num_comps *
						sizeof(ios->ol_state.ioerrs[0]);

	ios = kzalloc(first_size + sec_size, gfp_flags);
	if (unlikely(!ios))
		return -ENOMEM;

	ios->layout = objio_seg;
	/* ioerrs[] lives directly after per_dev[] in the same allocation */
	ios->ol_state.ioerrs = ((void *)ios) + first_size;
	ios->ol_state.num_comps = objio_seg->num_comps;

	*outp = &ios->ol_state;
	return 0;
}
  352. void objio_free_io_state(struct objlayout_io_state *ol_state)
  353. {
  354. struct objio_state *ios = container_of(ol_state, struct objio_state,
  355. ol_state);
  356. kfree(ios);
  357. }
/*
 * Translate an osd_initiator error priority into the pNFS-objects error code
 * reported back to the server in LAYOUTRETURN.  CLEAR_PAGES is handled by
 * _io_check() before errors are reported, hence the BUG_ON here; any unknown
 * priority falls through to a generic EIO.
 */
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
{
	switch (oep) {
	case OSD_ERR_PRI_NO_ERROR:
		return (enum pnfs_osd_errno)0;

	case OSD_ERR_PRI_CLEAR_PAGES:
		BUG_ON(1);
		return 0;

	case OSD_ERR_PRI_RESOURCE:
		return PNFS_OSD_ERR_RESOURCE;
	case OSD_ERR_PRI_BAD_CRED:
		return PNFS_OSD_ERR_BAD_CRED;
	case OSD_ERR_PRI_NO_ACCESS:
		return PNFS_OSD_ERR_NO_ACCESS;
	case OSD_ERR_PRI_UNREACHABLE:
		return PNFS_OSD_ERR_UNREACHABLE;
	case OSD_ERR_PRI_NOT_FOUND:
		return PNFS_OSD_ERR_NOT_FOUND;
	case OSD_ERR_PRI_NO_SPACE:
		return PNFS_OSD_ERR_NO_SPACE;
	default:
		WARN_ON(1);
		/* fallthrough */
	case OSD_ERR_PRI_EIO:
		return PNFS_OSD_ERR_EIO;
	}
}
/*
 * Zero every page fragment mapped by @bio.  Used when a read started past
 * end-of-object: the device returned no data, so present zeros instead.
 */
static void _clear_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned i;

	__bio_for_each_segment(bv, bio, i, 0) {
		unsigned this_count = bv->bv_len;

		if (likely(PAGE_SIZE == this_count))
			clear_highpage(bv->bv_page);
		else
			zero_user(bv->bv_page, bv->bv_offset, this_count);
	}
}
/*
 * Decode completion status of every per-device request.  A read that ran
 * past end-of-object (CLEAR_PAGES) is recovered by zero-filling its bio.
 * Real errors are recorded via objlayout_io_set_result() for the eventual
 * layout error report; the linux errno of the highest-priority OSD error
 * is returned (0 if all requests succeeded or were recovered).
 */
static int _io_check(struct objio_state *ios, bool is_write)
{
	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
	int lin_ret = 0;
	int i;

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_sense_info osi;
		struct osd_request *or = ios->per_dev[i].or;
		int ret;

		if (!or)
			continue;

		ret = osd_req_decode_sense(or, &osi);
		if (likely(!ret))
			continue;

		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
			/* start read offset passed endof file */
			BUG_ON(is_write);
			_clear_bio(ios->per_dev[i].bio);
			dprintk("%s: start read offset passed end of file "
				"offset=0x%llx, length=0x%lx\n", __func__,
				_LLU(ios->per_dev[i].offset),
				ios->per_dev[i].length);

			continue; /* we recovered */
		}
		objlayout_io_set_result(&ios->ol_state, i,
					&ios->layout->comps[i].oc_object_id,
					osd_pri_2_pnfs_err(osi.osd_err_pri),
					ios->per_dev[i].offset,
					ios->per_dev[i].length,
					is_write);

		/* Keep the errno of the worst (highest-priority) error */
		if (osi.osd_err_pri >= oep) {
			oep = osi.osd_err_pri;
			lin_ret = ret;
		}
	}

	return lin_ret;
}
  434. /*
  435. * Common IO state helpers.
  436. */
  437. static void _io_free(struct objio_state *ios)
  438. {
  439. unsigned i;
  440. for (i = 0; i < ios->numdevs; i++) {
  441. struct _objio_per_comp *per_dev = &ios->per_dev[i];
  442. if (per_dev->or) {
  443. osd_end_request(per_dev->or);
  444. per_dev->or = NULL;
  445. }
  446. if (per_dev->bio) {
  447. bio_put(per_dev->bio);
  448. per_dev->bio = NULL;
  449. }
  450. }
  451. }
  452. struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
  453. {
  454. unsigned min_dev = ios->layout->comps_index;
  455. unsigned max_dev = min_dev + ios->layout->num_comps;
  456. BUG_ON(dev < min_dev || max_dev <= dev);
  457. return ios->layout->ods[dev - min_dev]->od;
  458. }
/* Result of _calc_stripe_info(): where a file offset lands in the stripe. */
struct _striping_info {
	u64 obj_offset;   /* byte offset inside the component object */
	u64 group_length; /* bytes remaining in this group from obj_offset */
	unsigned dev;     /* first (master) device of the mirror set hit */
	unsigned unit_off; /* offset within the stripe unit */
};
/*
 * Map a file byte offset to RAID-0 striping coordinates per the pNFS objects
 * layout striping equations:
 *   U = bytes per stripe (stripe_unit * group_width)
 *   T = bytes per group  (U * group_depth)
 *   S = bytes per full cycle over all groups (T * group_count)
 *   M = which cycle, G = group within the cycle, H = byte offset in group,
 *   N = stripe number within the group.
 * 64-bit divisions go through div64_u64/div_u64 (no 64-bit '/' in the
 * kernel on 32-bit arches).
 */
static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
			      struct _striping_info *si)
{
	u32 stripe_unit = ios->layout->stripe_unit;
	u32 group_width = ios->layout->group_width;
	u64 group_depth = ios->layout->group_depth;
	u32 U = stripe_unit * group_width;

	u64 T = U * group_depth;
	u64 S = T * ios->layout->group_count;
	u64 M = div64_u64(file_offset, S);

	/*
	  G = (L - (M * S)) / T
	  H = (L - (M * S)) % T
	*/
	u64 LmodU = file_offset - M * S;
	u32 G = div64_u64(LmodU, T);
	u64 H = LmodU - G * T;

	u32 N = div_u64(H, U);

	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
	si->obj_offset = si->unit_off + (N * stripe_unit) +
				  (M * group_depth * stripe_unit);

	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
	/* scale to mirror-set granularity: dev is the master of its mirrors */
	si->dev *= ios->layout->mirrors_p1;

	si->group_length = T - H;
}
  491. static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
  492. unsigned pgbase, struct _objio_per_comp *per_dev, int len,
  493. gfp_t gfp_flags)
  494. {
  495. unsigned pg = *cur_pg;
  496. int cur_len = len;
  497. struct request_queue *q =
  498. osd_request_queue(_io_od(ios, per_dev->dev));
  499. if (per_dev->bio == NULL) {
  500. unsigned pages_in_stripe = ios->layout->group_width *
  501. (ios->layout->stripe_unit / PAGE_SIZE);
  502. unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
  503. ios->layout->group_width;
  504. if (BIO_MAX_PAGES_KMALLOC < bio_size)
  505. bio_size = BIO_MAX_PAGES_KMALLOC;
  506. per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
  507. if (unlikely(!per_dev->bio)) {
  508. dprintk("Faild to allocate BIO size=%u\n", bio_size);
  509. return -ENOMEM;
  510. }
  511. }
  512. while (cur_len > 0) {
  513. unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
  514. unsigned added_len;
  515. BUG_ON(ios->ol_state.nr_pages <= pg);
  516. cur_len -= pglen;
  517. added_len = bio_add_pc_page(q, per_dev->bio,
  518. ios->ol_state.pages[pg], pglen, pgbase);
  519. if (unlikely(pglen != added_len))
  520. return -ENOMEM;
  521. pgbase = 0;
  522. ++pg;
  523. }
  524. BUG_ON(cur_len);
  525. per_dev->length += len;
  526. *cur_pg = pg;
  527. return 0;
  528. }
/*
 * Lay out @length bytes of the IO across the devices of one group, starting
 * at the striping coordinates in @si.  Walks the group's data devices
 * round-robin (stepping by mirrors_p1), filling each device's per_dev slot
 * with its object offset and bio pages via _add_stripe_unit().
 *
 * Devices before/at/after si->dev get slightly different first-unit offsets
 * and lengths because the IO may start mid-stripe-unit.  Updates
 * ios->numdevs to cover the highest component touched plus its mirrors, and
 * *last_pg to the next unconsumed page.
 */
static int _prepare_one_group(struct objio_state *ios, u64 length,
			      struct _striping_info *si, unsigned *last_pg,
			      gfp_t gfp_flags)
{
	unsigned stripe_unit = ios->layout->stripe_unit;
	unsigned mirrors_p1 = ios->layout->mirrors_p1;
	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
	unsigned dev = si->dev;
	unsigned first_dev = dev - (dev % devs_in_group);
	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
	unsigned cur_pg = *last_pg;
	int ret = 0;

	while (length) {
		struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
		unsigned cur_len, page_off = 0;

		if (!per_dev->length) {
			/* First touch of this device in the IO */
			per_dev->dev = dev;
			if (dev < si->dev) {
				/* wrapped around: starts on the next stripe */
				per_dev->offset = si->obj_offset + stripe_unit -
						  si->unit_off;
				cur_len = stripe_unit;
			} else if (dev == si->dev) {
				/* the IO's starting device: may be mid-unit */
				per_dev->offset = si->obj_offset;
				cur_len = stripe_unit - si->unit_off;
				page_off = si->unit_off & ~PAGE_MASK;
				BUG_ON(page_off &&
				      (page_off != ios->ol_state.pgbase));
			} else { /* dev > si->dev */
				per_dev->offset = si->obj_offset - si->unit_off;
				cur_len = stripe_unit;
			}

			if (max_comp < dev - first_dev)
				max_comp = dev - first_dev;
		} else {
			cur_len = stripe_unit;
		}
		if (cur_len >= length)
			cur_len = length;

		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
				       cur_len, gfp_flags);
		if (unlikely(ret))
			goto out;

		/* advance to the next mirror set, wrapping within the group */
		dev += mirrors_p1;
		dev = (dev % devs_in_group) + first_dev;

		length -= cur_len;
		ios->length += cur_len;
	}
out:
	ios->numdevs = max_comp + mirrors_p1;
	*last_pg = cur_pg;
	return ret;
}
/*
 * Map the whole page list of the IO onto devices, group by group.  On error
 * after some data was already mapped (ios->length != 0) the error is
 * returned and the caller frees via _io_free(); an error before any data was
 * mapped is also returned through the same path (ios->length == 0 implies
 * ret carries the failure — note the success path always has length != 0).
 */
static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
	u64 length = ios->ol_state.count;
	u64 offset = ios->ol_state.offset;
	struct _striping_info si;
	unsigned last_pg = 0;
	int ret = 0;

	while (length) {
		_calc_stripe_info(ios, offset, &si);

		/* clamp to what is actually left to map */
		if (length < si.group_length)
			si.group_length = length;

		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
		if (unlikely(ret))
			goto out;

		offset += si.group_length;
		length -= si.group_length;
	}

out:
	if (!ios->length)
		return ret;

	return 0;
}
  603. static ssize_t _sync_done(struct objio_state *ios)
  604. {
  605. struct completion *waiting = ios->private;
  606. complete(waiting);
  607. return 0;
  608. }
/* kref release: the last outstanding request finished — run the done hook. */
static void _last_io(struct kref *kref)
{
	struct objio_state *ios = container_of(kref, struct objio_state, kref);

	ios->done(ios);
}
/* Per-request async completion from the osd_initiator library. */
static void _done_io(struct osd_request *or, void *p)
{
	struct objio_state *ios = p;

	kref_put(&ios->kref, _last_io);
}
/*
 * Submit all prepared per-device requests asynchronously.  Lifetime is
 * tracked with a kref: kref_init() gives the submitter one reference, each
 * dispatched request takes one, and the final put (submitter's or last
 * completion's) fires ios->done via _last_io().  In sync mode ios->done is
 * temporarily swapped for _sync_done, which signals the on-stack completion;
 * we then wait and run the caller's saved done hook to get the IO status.
 */
static ssize_t _io_exec(struct objio_state *ios)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t status = 0; /* sync status */
	unsigned i;
	objio_done_fn saved_done_fn = ios->done;
	bool sync = ios->ol_state.sync;

	if (sync) {
		ios->done = _sync_done;
		ios->private = &wait;
	}

	kref_init(&ios->kref);

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;

		if (!or)
			continue;

		kref_get(&ios->kref);
		osd_execute_request_async(or, _done_io, ios);
	}

	/* drop the submitter's reference */
	kref_put(&ios->kref, _last_io);

	if (sync) {
		wait_for_completion(&wait);
		status = saved_done_fn(ios);
	}

	return status;
}
  645. /*
  646. * read
  647. */
  648. static ssize_t _read_done(struct objio_state *ios)
  649. {
  650. ssize_t status;
  651. int ret = _io_check(ios, false);
  652. _io_free(ios);
  653. if (likely(!ret))
  654. status = ios->length;
  655. else
  656. status = ret;
  657. objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
  658. return status;
  659. }
/*
 * Build and finalize the OSD READ request for component @cur_comp.  Reads
 * only need one mirror (the master of the mirror set), unlike writes which
 * must hit all mirrors.  The request is stored in per_dev->or; submission
 * happens later in _io_exec().
 */
static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct osd_request *or = NULL;
	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
	unsigned dev = per_dev->dev;
	struct pnfs_osd_object_cred *cred =
			&ios->layout->comps[cur_comp];
	struct osd_obj_id obj = {
		.partition = cred->oc_object_id.oid_partition_id,
		.id = cred->oc_object_id.oid_object_id,
	};
	int ret;

	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
	if (unlikely(!or)) {
		ret = -ENOMEM;
		goto err;
	}
	per_dev->or = or;

	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
	if (ret) {
		dprintk("%s: Faild to osd_finalize_request() => %d\n",
			__func__, ret);
		goto err;
	}

	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
		per_dev->length);

err:
	/* note: falls through here on success too, with ret == 0 */
	return ret;
}
/*
 * Prepare a READ on the master device of each mirror set that has data
 * mapped, then submit everything via _io_exec().  On preparation failure all
 * partially-built requests/bios are torn down.
 */
static ssize_t _read_exec(struct objio_state *ios)
{
	unsigned i;
	int ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		if (!ios->per_dev[i].length)
			continue;
		ret = _read_mirrors(ios, i);
		if (unlikely(ret))
			goto err;
	}

	ios->done = _read_done;
	return _io_exec(ios); /* In sync mode exec returns the io status */

err:
	_io_free(ios);
	return ret;
}
  708. ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
  709. {
  710. struct objio_state *ios = container_of(ol_state, struct objio_state,
  711. ol_state);
  712. int ret;
  713. ret = _io_rw_pagelist(ios, GFP_KERNEL);
  714. if (unlikely(ret))
  715. return ret;
  716. return _read_exec(ios);
  717. }
  718. /*
  719. * write
  720. */
/*
 * Write completion: check per-device results, free resources, and report
 * to the generic layer.  On success the IO is marked NFS_FILE_SYNC (data
 * assumed stable on the OSDs — see the FIXME below).
 */
static ssize_t _write_done(struct objio_state *ios)
{
	ssize_t status;
	int ret = _io_check(ios, true);

	_io_free(ios);

	if (likely(!ret)) {
		/* FIXME: should be based on the OSD's persistence model
		 * See OSD2r05 Section 4.13 Data persistence model */
		ios->ol_state.committed = NFS_FILE_SYNC;
		status = ios->length;
	} else {
		status = ret;
	}

	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
	return status;
}
/*
 * Build and finalize WRITE requests for the whole mirror set starting at
 * component @cur_comp (mirrors_p1 consecutive components).  The master's bio
 * carries the actual pages; each secondary mirror gets a lightweight clone
 * (__bio_clone shares the page vectors) with the master's offset/length.
 * Submission happens later in _io_exec().
 */
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
	int ret;

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct osd_request *or = NULL;
		struct pnfs_osd_object_cred *cred =
					&ios->layout->comps[cur_comp];
		struct osd_obj_id obj = {
			.partition = cred->oc_object_id.oid_partition_id,
			.id = cred->oc_object_id.oid_object_id,
		};
		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
		struct bio *bio;

		or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
		if (unlikely(!or)) {
			ret = -ENOMEM;
			goto err;
		}
		per_dev->or = or;

		if (per_dev != master_dev) {
			/* mirror: clone the master's bio rather than re-adding
			 * all pages */
			bio = bio_kmalloc(GFP_NOFS,
					  master_dev->bio->bi_max_vecs);
			if (unlikely(!bio)) {
				dprintk("Faild to allocate BIO size=%u\n",
					master_dev->bio->bi_max_vecs);
				ret = -ENOMEM;
				goto err;
			}

			__bio_clone(bio, master_dev->bio);
			bio->bi_bdev = NULL;
			bio->bi_next = NULL;
			per_dev->bio = bio;
			per_dev->dev = dev;
			per_dev->length = master_dev->length;
			per_dev->offset = master_dev->offset;
		} else {
			bio = master_dev->bio;
			bio->bi_rw |= REQ_WRITE;
		}

		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
		if (ret) {
			dprintk("%s: Faild to osd_finalize_request() => %d\n",
				__func__, ret);
			goto err;
		}

		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
			per_dev->length);
	}

err:
	/* note: success also returns through here, with ret == 0 */
	return ret;
}
/*
 * Prepare WRITEs for every mirror set that has data mapped, then submit
 * everything via _io_exec().  On preparation failure all partially-built
 * requests/bios are torn down.
 */
static ssize_t _write_exec(struct objio_state *ios)
{
	unsigned i;
	int ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		if (!ios->per_dev[i].length)
			continue;
		ret = _write_mirrors(ios, i);
		if (unlikely(ret))
			goto err;
	}

	ios->done = _write_done;
	return _io_exec(ios); /* In sync mode exec returns the io->status */

err:
	_io_free(ios);
	return ret;
}
  810. ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
  811. {
  812. struct objio_state *ios = container_of(ol_state, struct objio_state,
  813. ol_state);
  814. int ret;
  815. /* TODO: ios->stable = stable; */
  816. ret = _io_rw_pagelist(ios, GFP_NOFS);
  817. if (unlikely(ret))
  818. return ret;
  819. return _write_exec(ios);
  820. }
  821. static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
  822. struct nfs_page *prev, struct nfs_page *req)
  823. {
  824. if (!pnfs_generic_pg_test(pgio, prev, req))
  825. return false;
  826. if (pgio->pg_lseg == NULL)
  827. return true;
  828. return pgio->pg_count + req->wb_bytes <=
  829. OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
  830. }
/*
 * Layout driver registration.  Most hooks point at the generic objlayout
 * wrappers, which presumably call back into the objio_* functions defined
 * here (see objlayout.h) — verify against objlayout.c.
 */
static struct pnfs_layoutdriver_type objlayout_type = {
	.id = LAYOUT_OSD2_OBJECTS,
	.name = "LAYOUT_OSD2_OBJECTS",
	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
				   PNFS_LAYOUTRET_ON_ERROR,

	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
	.free_layout_hdr         = objlayout_free_layout_hdr,

	.alloc_lseg              = objlayout_alloc_lseg,
	.free_lseg               = objlayout_free_lseg,

	.read_pagelist           = objlayout_read_pagelist,
	.write_pagelist          = objlayout_write_pagelist,
	.pg_test                 = objio_pg_test,

	.free_deviceid_node	 = objio_free_deviceid_node,

	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
	.encode_layoutreturn     = objlayout_encode_layoutreturn,
};
/* Module metadata for the objects-layout pNFS driver */
MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
MODULE_LICENSE("GPL");
  850. static int __init
  851. objlayout_init(void)
  852. {
  853. int ret = pnfs_register_layoutdriver(&objlayout_type);
  854. if (ret)
  855. printk(KERN_INFO
  856. "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
  857. __func__, ret);
  858. else
  859. printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
  860. __func__);
  861. return ret;
  862. }
/* Module exit: unregister the layout driver from the pNFS core. */
static void __exit
objlayout_exit(void)
{
	pnfs_unregister_layoutdriver(&objlayout_type);
	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
	       __func__);
}
/* Wire the init/exit handlers into the module loader */
module_init(objlayout_init);
module_exit(objlayout_exit);