softraid_raid5.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967
  1. /* $OpenBSD: softraid_raid5.c,v 1.23 2015/07/21 03:30:51 krw Exp $ */
  2. /*
  3. * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
  4. * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
  5. * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
  6. *
  7. * Permission to use, copy, modify, and distribute this software for any
  8. * purpose with or without fee is hereby granted, provided that the above
  9. * copyright notice and this permission notice appear in all copies.
  10. *
  11. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  12. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  13. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  14. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  15. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  16. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  17. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  18. */
  19. #include "bio.h"
  20. #include <sys/param.h>
  21. #include <sys/systm.h>
  22. #include <sys/buf.h>
  23. #include <sys/device.h>
  24. #include <sys/ioctl.h>
  25. #include <sys/malloc.h>
  26. #include <sys/kernel.h>
  27. #include <sys/disk.h>
  28. #include <sys/rwlock.h>
  29. #include <sys/queue.h>
  30. #include <sys/fcntl.h>
  31. #include <sys/mount.h>
  32. #include <sys/sensors.h>
  33. #include <sys/stat.h>
  34. #include <sys/task.h>
  35. #include <sys/pool.h>
  36. #include <sys/conf.h>
  37. #include <sys/uio.h>
  38. #include <scsi/scsi_all.h>
  39. #include <scsi/scsiconf.h>
  40. #include <scsi/scsi_disk.h>
  41. #include <dev/softraidvar.h>
/* RAID 5 functions. */
int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int, void *);
int	sr_raid5_init(struct sr_discipline *);
int	sr_raid5_rw(struct sr_workunit *);
int	sr_raid5_openings(struct sr_discipline *);
void	sr_raid5_intr(struct buf *);
int	sr_raid5_wu_done(struct sr_workunit *);
void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid5_set_vol_state(struct sr_discipline *);

int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
	    void *, int, int, void *);
int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
	    void *);
int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
	    daddr_t, long, void *, int, int);
void	sr_raid5_xor(void *, void *, int);

/* NOTE: sr_raid5_scrub() is currently compiled out (#if 0) below. */
void	sr_raid5_rebuild(struct sr_discipline *);
void	sr_raid5_scrub(struct sr_discipline *);
/*
 * Discipline initialisation: fill out the generic softraid discipline
 * structure with the RAID 5 name, capabilities, workunit limits and
 * function pointers.  Called once when the discipline is attached.
 */
void
sr_raid5_discipline_init(struct sr_discipline *sd)
{
	/* Fill out discipline members. */
	sd->sd_type = SR_MD_RAID5;
	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
	sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */
	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */

	/* Setup discipline specific function pointers. */
	sd->sd_assemble = sr_raid5_assemble;
	sd->sd_create = sr_raid5_create;
	sd->sd_openings = sr_raid5_openings;
	sd->sd_rebuild = sr_raid5_rebuild;
	sd->sd_scsi_rw = sr_raid5_rw;
	sd->sd_scsi_intr = sr_raid5_intr;
	sd->sd_scsi_wu_done = sr_raid5_wu_done;
	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
	sd->sd_set_vol_state = sr_raid5_set_vol_state;
}
/*
 * Create a new RAID 5 volume spanning no_chunk chunks.
 *
 * Validates the chunk count, fixes the strip size and derives the
 * usable volume size, then performs runtime initialisation via
 * sr_raid5_init().  Returns 0 on success or EINVAL on bad config.
 */
int
sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	/* RAID 5 needs at least two data chunks plus one for parity. */
	if (no_chunk < 3) {
		sr_error(sd->sd_sc, "%s requires three or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;

	/*
	 * Usable size: round the per-chunk size (in DEV_BSIZE sectors)
	 * down to a whole number of strips, then multiply by the number
	 * of data chunks - one chunk's worth of space holds parity.
	 */
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 1);

	return sr_raid5_init(sd);
}
/*
 * Assemble an existing RAID 5 volume.  The metadata has already been
 * read and validated by the generic code; only the runtime state
 * needs (re)initialising here.
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid5_init(sd);
}
/*
 * Initialise RAID 5 runtime state from the on-disk metadata.
 * Returns 0 on success or EINVAL if the strip size is invalid.
 */
int
sr_raid5_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	/* Cache log2(strip size) for the strip arithmetic in the I/O path. */
	sd->mds.mdd_raid5.sr5_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	return 0;
}
  122. int
  123. sr_raid5_openings(struct sr_discipline *sd)
  124. {
  125. /* Two work units per I/O, two for rebuild/scrub. */
  126. return ((sd->sd_max_wu - 2) >> 1);
  127. }
/*
 * Transition chunk c of the volume to new_state.
 *
 * The transition is validated against the allowed chunk state machine;
 * an invalid transition panics.  On an actual state change the volume
 * state is re-derived and a metadata save is scheduled via the system
 * task queue.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/* Validate old_state -> new_state; any other pair is fatal. */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the new chunk state to the on-disk metadata. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
/*
 * Re-derive the volume state from the states of its chunks and apply
 * it, validating the old -> new volume state transition.  A RAID 5
 * volume survives the loss of exactly one chunk (degraded); losing
 * more takes it offline.  Invalid chunk states or transitions panic.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Tally chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/*
	 * Map the chunk state tally to a volume state: all online is
	 * online, more than one chunk missing is offline, otherwise a
	 * scrub/rebuild in progress or a single missing chunk (degraded).
	 */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate old_state -> new_state; any other pair is fatal. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
  302. static inline int
  303. sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
  304. {
  305. switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
  306. case BIOC_SDONLINE:
  307. case BIOC_SDSCRUB:
  308. return 1;
  309. default:
  310. return 0;
  311. }
  312. }
  313. static inline int
  314. sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
  315. {
  316. switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
  317. case BIOC_SDREBUILD:
  318. return 1;
  319. default:
  320. return 0;
  321. }
  322. }
/*
 * Main I/O entry point: map a SCSI read or write onto the RAID 5
 * layout and issue the component I/Os.
 *
 * The layout is left asymmetric: data strips are laid out across the
 * chunks in order and the parity chunk rotates downwards one chunk
 * per row.  Reads go straight to the data chunk (or are regenerated
 * from the surviving chunks when it is unavailable); writes are split
 * into a read work unit (wu_r, to fetch the data needed for the new
 * parity) and the caller's write work unit, which is deferred until
 * the reads complete ("collided").
 *
 * Returns 0 on success, 1 on failure (the caller unwinds wu).
 */
int
sr_raid5_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	daddr_t			blkno, lba;
	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
	int64_t			strip_bits, strip_no, strip_size;
	int64_t			chunk, no_chunk;
	int64_t			parity, row_size;
	long			length, datalen;
	void			*data;
	int			s;

	/* blkno and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
		goto bad;

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
	    (long long)blkno, xs->datalen);

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	/* no_chunk counts data chunks only (one chunk per row is parity). */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs = blkno << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* Writes need a second work unit to read old data/parity. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: %s failed to get read work unit",
			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
			goto bad;
		}

		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		offset = chunk_offs + strip_offs;

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/*
		 * Map disk offset to data and parity chunks, using a left
		 * asymmetric algorithm for the parity assignment.
		 */
		chunk = strip_no % no_chunk;
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
		/* Skip over the parity chunk in the data mapping. */
		if (chunk >= parity)
			chunk++;

		lba = offset >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
		    (row_size - 1);

		scp = sd->sd_vol.sv_chunks[chunk];
		if (xs->flags & SCSI_DATA_IN) {
			switch (scp->src_meta.scm_status) {
			case BIOC_SDONLINE:
			case BIOC_SDSCRUB:
				/*
				 * Chunk is online, issue a single read
				 * request.
				 */
				if (sr_raid5_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL))
					goto bad;
				break;
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				/* Data chunk unavailable - reconstruct. */
				if (sr_raid5_regenerate(wu, chunk, lba,
				    length, data))
					goto bad;
				break;
			default:
				printf("%s: is offline, can't read\n",
				    DEVNAME(sd->sd_sc));
				goto bad;
			}
		} else {
			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
			    length, data, xs->flags, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		if (wu_r->swu_io_count > 0) {
			/* collide write request with reads */
			wu_r->swu_blk_start = wu->swu_blk_start;
			wu_r->swu_blk_end = wu->swu_blk_end;
			wu->swu_state = SR_WU_DEFERRED;
			wu_r->swu_collider = wu;
			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
			wu = wu_r;
		} else {
			/* No reads were needed; release the spare wu. */
			sr_scsi_wu_put(sd, wu_r);
		}
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);

bad:
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
/*
 * Queue the reads needed to reconstruct `len' bytes of an unavailable
 * chunk at block `blkno', XORing each completed read into `data' (via
 * the ccb_opaque xor buffer handled in sr_raid5_intr()).
 *
 * Returns 0 on success, 1 if any required chunk is also unavailable
 * or an I/O could not be queued.
 */
int
sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
    long len, void *data)
{
	struct sr_discipline	*sd = wu->swu_dis;
	int			i;

	/*
	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
	 * from all of the remaining online chunks. This requires the parity
	 * to already be correct.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
	    "regenerating block %llu\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);

	/* Start from zero; each read XORs into this buffer. */
	memset(data, 0, len);

	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk)
			continue;
		/* Every other chunk must be readable or we cannot recover. */
		if (!sr_raid5_chunk_online(sd, i))
			goto bad;
		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
		    0, data))
			goto bad;
	}

	return (0);

bad:
	return (1);
}
/*
 * Queue the I/Os for a single-strip RAID 5 write: the new data goes
 * on work unit `wu', any reads needed to compute the new parity go
 * on the read work unit `wu_r' (which sr_raid5_rw() collides with wu).
 *
 * Returns 0 on success, 1 if a buffer allocation or I/O setup failed.
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, long len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	/* Classify the chunks so we can pick one of the cases above. */
	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): no usable parity chunk - just write the data. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* Seed the parity buffer with the new data. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}
/*
 * Per-buffer I/O completion handler, called at interrupt time.
 * Finishes the ccb, XORs the completed read into its xor target (if
 * ccb_opaque is set), releases any temporary buffer, and completes
 * the work unit bookkeeping.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();

	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	/* Only XOR successful I/O - a failed read must not corrupt parity. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
/*
 * Decide the final disposition of a completed work unit: OK, restart
 * (failed reads are retried, which may regenerate from parity after a
 * chunk went offline), or failed.  Discipline-internal and rebuild
 * work units report their status through other channels.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		/* Re-issue through the discipline; it may regenerate now. */
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;
	return SR_WU_FAILED;
}
/*
 * Build a ccb for a single chunk I/O and enqueue it on the work unit.
 *
 * If `data' is NULL a temporary buffer is allocated and flagged for
 * release on completion.  `xorbuf', when non-NULL, is stashed in
 * ccb_opaque so sr_raid5_intr() XORs the completed read into it.
 * Returns 0 on success, -1 on allocation failure.
 */
int
sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
    long len, void *data, int xsflags, int ccbflags, void *xorbuf)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;

	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
	    "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
	    chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");

	/* Allocate temporary buffer. */
	if (data == NULL) {
		data = sr_block_get(sd, len);
		if (data == NULL)
			return (-1);
		ccbflags |= SR_CCBF_FREEBUF;
	}

	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
	if (ccb == NULL) {
		/* Do not leak the buffer we just allocated. */
		if (ccbflags & SR_CCBF_FREEBUF)
			sr_block_put(sd, data, len);
		return (-1);
	}
	ccb->ccb_opaque = xorbuf;
	sr_wu_enqueue_ccb(wu, ccb);

	return (0);
}
  650. void
  651. sr_raid5_xor(void *a, void *b, int len)
  652. {
  653. uint32_t *xa = a, *xb = b;
  654. len >>= 2;
  655. while (len--)
  656. *xa++ ^= *xb++;
  657. }
/*
 * Rebuild the contents of the chunk marked BIOC_SDREBUILD, one strip
 * at a time: regenerate each strip from the other chunks into a
 * buffer (read work unit), then write it to the rebuild chunk (write
 * work unit deferred behind the reads).  Progress is checkpointed in
 * ssd_rebuild so an interrupted rebuild can resume.  Runs in process
 * context (sleeps waiting for each strip to complete).
 */
void
sr_raid5_rebuild(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, strip_bits, i, psz, rb, restart;
	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept, percent = 0, old_percent = -1;
	int rebuild_chunk = -1;
	void *xorbuf;

	/* Find the rebuild chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sr_raid5_chunk_rebuild(sd, i)) {
			rebuild_chunk = i;
			break;
		}
	}
	if (rebuild_chunk == -1)
		goto bad;

	/* chunk_count excludes the per-row parity chunk. */
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;

	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
	    row_size);

	/* Resume from the checkpointed offset, if it is sane. */
	restart = sd->sd_meta->ssd_rebuild / row_size;
	if (restart > chunk_strips) {
		printf("%s: bogus rebuild restart offset, starting from 0\n",
		    DEVNAME(sd->sd_sc));
		restart = 0;
	}
	if (restart != 0) {
		psz = sd->sd_meta->ssdi.ssd_size;
		rb = sd->sd_meta->ssd_rebuild;
		if (rb > 0)
			percent = 100 - ((psz * 100 - rb * 100) / psz) - 1;
		else
			percent = 0;
		printf("%s: resuming rebuild on %s at %d%%\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
	}

	for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);

		/*
		 * NOTE(review): the return values of sr_scsi_wu_get() and
		 * sr_block_get() are not checked here - presumably these
		 * sleep until resources are available in this context;
		 * verify against their definitions.
		 */
		wu_w = sr_scsi_wu_get(sd, 0);
		wu_r = sr_scsi_wu_get(sd, 0);

		xorbuf = sr_block_get(sd, strip_size);
		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
		    strip_size, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
			goto bad;

		/* Collide write work unit with read work unit. */
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		/* Block I/O to this strip while we rebuild it. */
		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
		wu_w->swu_blk_start = wu_r->swu_blk_start;
		wu_w->swu_blk_end = wu_r->swu_blk_end;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    wu_r->swu_blk_start, wu_r->swu_blk_end);

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		sr_schedule_wu(wu_r);

		/* Wait for the deferred write to finish. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
			slept = 1;
		}
		/* Yield so regular I/O can make progress between strips. */
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/* Checkpoint progress and save metadata on percent changes. */
		sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;

		psz = sd->sd_meta->ssdi.ssd_size;
		rb = sd->sd_meta->ssd_rebuild;
		if (rb > 0)
			percent = 100 - ((psz * 100 - rb * 100) / psz) - 1;
		else
			percent = 0;
		if (percent != old_percent && strip_no != chunk_strips - 1) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sd->sd_sc),
				    sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
	    sd->sd_meta->ssd_devname);

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
			break;
		}
	}

	return;

abort:
	/* Save the checkpoint so the rebuild can resume later. */
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
bad:
	return;
}
#if 0
/*
 * Scrub the volume by regenerating and rewriting the parity of every
 * strip.  Currently compiled out: the chunk block numbers are still a
 * placeholder (0xBADCAFE) and the work units are acquired once outside
 * the loop, so this is not yet functional.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Left asymmetric parity rotation, as in sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		/* Read all data strips, XOR into xorbuf, rewrite parity. */
		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif