md-cluster.c

/*
 * Copyright (C) 2015, SUSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 */

#include <linux/module.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	struct completion completion; /* completion for synchronized locking */
	void (*bast)(void *arg, int mode); /* blocking AST function pointer */
	struct mddev *mddev; /* pointing back to mddev. */
};

struct suspend_info {
	int slot;
	sector_t lo;
	sector_t hi;
	struct list_head list;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1

struct md_cluster_info {
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct dlm_lock_resource *sb_lock;
	struct mutex sb_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct list_head suspend_list;
	spinlock_t suspend_lock;
	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	unsigned long state;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
};

struct cluster_msg {
	int type;
	int slot;
	/* TODO: Unionize this for smaller footprint */
	sector_t low;
	sector_t high;
	char uuid[16];
	int raid_slot;
};

static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = (struct dlm_lock_resource *) arg;
	complete(&res->completion);
}
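
/*
 * Issue a DLM lock request in the given mode and wait for it to
 * complete; sync_ast() fires when the request finishes, and the final
 * status is taken from the lock status block.
 */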
static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	init_completion(&res->completion);
	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_for_completion(&res->completion);
	return res->lksb.sb_status;
}
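
/*
 * "Unlocking" is a down-convert to NL: the lock resource stays attached
 * so it can be converted to a stronger mode again later.
 */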
static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}
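
/*
 * Allocate a new lock resource in the mddev's lockspace, optionally
 * with an LVB and a blocking AST, and acquire it in NL mode so that
 * later requests can use DLM_LKF_CONVERT.
 */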
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
{
	struct dlm_lock_resource *res = NULL;
	int ret, namelen;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	namelen = strlen(name);
	res->name = kzalloc(namelen + 1, GFP_KERNEL);
	if (!res->name) {
		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
		goto out_err;
	}
	strlcpy(res->name, name, namelen + 1);
	if (with_lvb) {
		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
		if (!res->lksb.sb_lvbptr) {
			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
			goto out_err;
		}
		res->flags = DLM_LKF_VALBLK;
	}

	if (bastfn)
		res->bast = bastfn;
	res->flags |= DLM_LKF_EXPEDITE;
	ret = dlm_lock_sync(res, DLM_LOCK_NL);
	if (ret) {
		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
		goto out_err;
	}
	res->flags &= ~DLM_LKF_EXPEDITE;
	res->flags |= DLM_LKF_CONVERT;

	return res;
out_err:
	kfree(res->lksb.sb_lvbptr);
	kfree(res->name);
	kfree(res);
	return NULL;
}

static void lockres_free(struct dlm_lock_resource *res)
{
	if (!res)
		return;

	init_completion(&res->completion);
	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
	wait_for_completion(&res->completion);

	kfree(res->name);
	kfree(res->lksb.sb_lvbptr);
	kfree(res);
}
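
/* Format a 16-byte binary UUID as the usual 8-4-4-4-12 hex string. */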
static char *pretty_uuid(char *dest, char *src)
{
	int i, len = 0;

	for (i = 0; i < 16; i++) {
		if (i == 4 || i == 6 || i == 8 || i == 10)
			len += sprintf(dest + len, "-");
		len += sprintf(dest + len, "%02x", (__u8)src[i]);
	}
	return dest;
}

static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
		sector_t lo, sector_t hi)
{
	struct resync_info *ri;

	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
	ri->lo = cpu_to_le64(lo);
	ri->hi = cpu_to_le64(hi);
}
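
/*
 * Read the resync range another node published in its bitmap lock's
 * LVB: take the lock in CR to refresh the LVB, and return a
 * suspend_info if a resync is in progress (hi != 0).
 */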
static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
{
	struct resync_info ri;
	struct suspend_info *s = NULL;
	sector_t hi = 0;

	dlm_lock_sync(lockres, DLM_LOCK_CR);
	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
	hi = le64_to_cpu(ri.hi);
	if (hi > 0) {
		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
		if (s) {
			s->hi = hi;
			s->lo = le64_to_cpu(ri.lo);
		}
	}
	dlm_unlock_sync(lockres);
	return s;
}
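
/*
 * Body of the recovery thread: for each failed node recorded in
 * recovery_map, take over that node's bitmap, merge its dirty bits into
 * our own and kick off recovery for the affected range.
 */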
static void recover_bitmaps(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	int slot, ret;
	struct suspend_info *s, *tmp;
	sector_t lo, hi;

	while (cinfo->recovery_map) {
		slot = fls64((u64)cinfo->recovery_map) - 1;

		/* Clear suspend_area associated with the bitmap */
		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
			if (slot == s->slot) {
				list_del(&s->list);
				kfree(s);
			}
		spin_unlock_irq(&cinfo->suspend_lock);

		snprintf(str, 64, "bitmap%04d", slot);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize bitmaps\n");
			goto clear_bit;
		}

		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret) {
			pr_err("md-cluster: Could not DLM lock %s: %d\n",
					str, ret);
			goto clear_bit;
		}
		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto dlm_unlock;
		}
		if (hi > 0) {
			/* TODO:Wait for current resync to get over */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			if (lo < mddev->recovery_cp)
				mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}
dlm_unlock:
		dlm_unlock_sync(bm_lockres);
clear_bit:
		lockres_free(bm_lockres);
		clear_bit(slot, &cinfo->recovery_map);
	}
}

static void recover_prep(void *arg)
{
}
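
/*
 * DLM slot recovery callback: another node went down, so mark its slot
 * for bitmap recovery and wake the recovery thread.
 */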
static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	set_bit(slot->slot - 1, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
		if (!cinfo->recovery_thread) {
			pr_warn("md-cluster: Could not create recovery thread\n");
			return;
		}
	}
	md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	complete(&cinfo->completion);
}

static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};

/*
 * The BAST function for the ack lock resource
 * This function wakes up the receive thread in
 * order to receive and process the message.
 */
static void ack_bast(void *arg, int mode)
{
	struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
	struct md_cluster_info *cinfo = res->mddev->cluster_info;

	if (mode == DLM_LOCK_EX)
		md_wakeup_thread(cinfo->recv_thread);
}

static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	struct suspend_info *s, *tmp;

	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
		if (slot == s->slot) {
			pr_info("%s:%d Deleting suspend_info: %d\n",
					__func__, __LINE__, slot);
			list_del(&s->list);
			kfree(s);
			break;
		}
}

static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	spin_lock_irq(&cinfo->suspend_lock);
	__remove_suspend_info(cinfo, slot);
	spin_unlock_irq(&cinfo->suspend_lock);
}
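
/*
 * Record the range another node is resyncing so that our writes to it
 * are suspended; hi == 0 means the remote resync is finished and the
 * entry can be dropped.
 */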
static void process_suspend_info(struct md_cluster_info *cinfo,
		int slot, sector_t lo, sector_t hi)
{
	struct suspend_info *s;

	if (!hi) {
		remove_suspend_info(cinfo, slot);
		return;
	}
	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
	if (!s)
		return;
	s->slot = slot;
	s->lo = lo;
	s->hi = hi;
	spin_lock_irq(&cinfo->suspend_lock);
	/* Remove existing entry (if exists) before adding */
	__remove_suspend_info(cinfo, slot);
	list_add(&s->list, &cinfo->suspend_list);
	spin_unlock_irq(&cinfo->suspend_lock);
}
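
/*
 * Handle a NEWDISK message: hand the device UUID and raid slot to
 * userspace via a udev change event, then wait (with a timeout) for
 * userspace to confirm through new_disk_ack().
 */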
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	char disk_uuid[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char raid_slot[16];
	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	pretty_uuid(disk_uuid + len, cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
			NEW_DEV_TIMEOUT);
	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
}

static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	md_reload_sb(mddev);
	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}

static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);

	if (rdev)
		md_kick_rdev_from_array(rdev);
	else
		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
}

static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);

	if (rdev && test_bit(Faulty, &rdev->flags))
		clear_bit(Faulty, &rdev->flags);
	else
		pr_warn("%s: %d Could not find disk(%d) which is faulty\n", __func__, __LINE__, msg->raid_slot);
}

static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	switch (msg->type) {
	case METADATA_UPDATED:
		pr_info("%s: %d Received message: METADATA_UPDATED from %d\n",
				__func__, __LINE__, msg->slot);
		process_metadata_update(mddev, msg);
		break;
	case RESYNCING:
		pr_info("%s: %d Received message: RESYNCING from %d\n",
				__func__, __LINE__, msg->slot);
		process_suspend_info(mddev->cluster_info, msg->slot,
				msg->low, msg->high);
		break;
	case NEWDISK:
		pr_info("%s: %d Received message: NEWDISK from %d\n",
				__func__, __LINE__, msg->slot);
		process_add_new_disk(mddev, msg);
		break;
	case REMOVE:
		pr_info("%s: %d Received REMOVE from %d\n",
				__func__, __LINE__, msg->slot);
		process_remove_disk(mddev, msg);
		break;
	case RE_ADD:
		pr_info("%s: %d Received RE_ADD from %d\n",
				__func__, __LINE__, msg->slot);
		process_readd_disk(mddev, msg);
		break;
	default:
		pr_warn("%s:%d Received unknown message from %d\n",
				__func__, __LINE__, msg->slot);
	}
}

/*
 * thread for receiving message
 */
static void recv_daemon(struct md_thread *thread)
{
	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;

	/* get CR on Message */
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
		pr_err("md-cluster: failed to get CR on MESSAGE\n");
		return;
	}

	/* read lvb and wake up thread to process this message_lockres */
	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
	process_recvd_msg(thread->mddev, &msg);

	/* release CR on ack_lockres */
	dlm_unlock_sync(ack_lockres);
	/* up-convert to EX on message_lockres */
	dlm_lock_sync(message_lockres, DLM_LOCK_EX);
	/* get CR on ack_lockres again */
	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	/* release CR on message_lockres */
	dlm_unlock_sync(message_lockres);
}

/* lock_comm()
 * Takes the lock on the TOKEN lock resource so no other
 * node can communicate while the operation is underway.
 */
static int lock_comm(struct md_cluster_info *cinfo)
{
	int error;

	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (error)
		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
				__func__, __LINE__, error);
	return error;
}

static void unlock_comm(struct md_cluster_info *cinfo)
{
	dlm_unlock_sync(cinfo->token_lockres);
}

/* __sendmsg()
 * This function performs the actual sending of the message. This function is
 * usually called after performing the encompassing operation
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CR
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
 * 5. Downconvert ack lockresource to CR
 */
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int error;
	int slot = cinfo->slot_number - 1;

	cmsg->slot = cpu_to_le32(slot);
	/* get EX on Message */
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
		goto failed_message;
	}

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/* down-convert EX to CR on Message */
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
				error);
		goto failed_message;
	}

	/* up-convert CR to EX on Ack */
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
				error);
		goto failed_ack;
	}

	/* down-convert EX to CR on Ack */
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
				error);
		goto failed_ack;
	}

failed_ack:
	dlm_unlock_sync(cinfo->message_lockres);
failed_message:
	return error;
}

static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int ret;

	lock_comm(cinfo);
	ret = __sendmsg(cinfo, cmsg);
	unlock_comm(cinfo);
	return ret;
}
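
/*
 * Called at join time: probe every other slot's bitmap lock with
 * DLM_LKF_NOQUEUE. A contended lock (-EAGAIN) means that node is
 * currently resyncing, so read its resync range from the LVB and add
 * it to the suspend list.
 */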
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i, ret = 0;
	struct dlm_lock_resource *bm_lockres;
	struct suspend_info *s;
	char str[64];

	for (i = 0; i < total_slots; i++) {
		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres)
			return -ENOMEM;
		if (i == (cinfo->slot_number - 1)) {
			lockres_free(bm_lockres);
			continue;
		}

		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret == -EAGAIN) {
			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
			s = read_resync_info(mddev, bm_lockres);
			if (s) {
				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
						__func__, __LINE__,
						(unsigned long long) s->lo,
						(unsigned long long) s->hi, i);
				spin_lock_irq(&cinfo->suspend_lock);
				s->slot = i;
				list_add(&s->list, &cinfo->suspend_list);
				spin_unlock_irq(&cinfo->suspend_lock);
			}
			ret = 0;
			lockres_free(bm_lockres);
			continue;
		}
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		}
		/* TODO: Read the disk bitmap sb and check if it needs recovery */
		dlm_unlock_sync(bm_lockres);
		lockres_free(bm_lockres);
	}
out:
	return ret;
}
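
/*
 * join() - called when the array is assembled on this node: create a
 * DLM lockspace named after the array UUID, wait for our slot number,
 * set up the communication lock resources and receive thread, take PW
 * on our own bitmap lock, and gather resync state from the other nodes.
 */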
static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	if (!try_module_get(THIS_MODULE))
		return -ENOENT;

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo) {
		module_put(THIS_MODULE);
		return -ENOMEM;
	}

	init_completion(&cinfo->completion);
	mutex_init(&cinfo->sb_mutex);
	mddev->cluster_info = cinfo;

	memset(str, 0, 64);
	pretty_uuid(str, mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
			DLM_LSFL_FS, LVB_SIZE,
			&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).\n",
				cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
			NULL, 0);
	if (!cinfo->sb_lock) {
		ret = -ENOMEM;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!\n");
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!\n");

	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres)
		goto err;
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);

	ret = gather_all_resync_info(mddev, nodes);
	if (ret)
		goto err;

	return 0;
err:
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	lockres_free(cinfo->sb_lock);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	module_put(THIS_MODULE);
	return ret;
}

static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->sb_lock);
	lockres_free(cinfo->bitmap_lockres);
	dlm_release_lockspace(cinfo->lockspace, 2);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we deduct one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}

static int metadata_update_start(struct mddev *mddev)
{
	return lock_comm(mddev->cluster_info);
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

static int metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return dlm_unlock_sync(cinfo->token_lockres);
}
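
/*
 * Publish our resync range to the other nodes: write it to our bitmap
 * lock's LVB and broadcast a RESYNCING message. lo == hi == 0 signals
 * that the resync has finished.
 */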
static int resync_send(struct mddev *mddev, enum msg_type type,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int slot = cinfo->slot_number - 1;

	pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
			(unsigned long long)lo,
			(unsigned long long)hi);
	resync_info_update(mddev, lo, hi);
	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(type);
	cmsg.slot = cpu_to_le32(slot);
	cmsg.low = cpu_to_le64(lo);
	cmsg.high = cpu_to_le64(hi);
	return sendmsg(cinfo, &cmsg);
}

static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
{
	pr_info("%s:%d\n", __func__, __LINE__);
	return resync_send(mddev, RESYNCING, lo, hi);
}

static void resync_finish(struct mddev *mddev)
{
	pr_info("%s:%d\n", __func__, __LINE__);
	resync_send(mddev, RESYNCING, 0, 0);
}

static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;
	struct suspend_info *s;

	spin_lock_irq(&cinfo->suspend_lock);
	if (list_empty(&cinfo->suspend_list))
		goto out;
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			ret = 1;
			break;
		}
out:
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}
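
/*
 * add_new_disk_start() - broadcast a NEWDISK message, then try to take
 * EX on no-new-dev with NOQUEUE. Every other node normally holds CR
 * and only drops it once it has acknowledged the device, so -EAGAIN
 * here means some node could not "see" the disk. The TOKEN lock is
 * held across the operation and released in add_new_disk_finish().
 */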
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = rdev->desc_nr;
	lock_comm(cinfo);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret) {
		unlock_comm(cinfo);
		return ret;
	}
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	else
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	return ret;
}

static int add_new_disk_finish(struct mddev *mddev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	/* Write sb and inform others */
	md_update_sb(mddev, 1);
	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

static int new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
		return -EINVAL;
	}
	if (ack)
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
	complete(&cinfo->newdisk_completion);
	return 0;
}

static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(REMOVE);
	cmsg.raid_slot = rdev->desc_nr;
	return __sendmsg(cinfo, &cmsg);
}
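
/*
 * gather_bitmaps() - when a disk is re-added, broadcast RE_ADD and fold
 * every other node's bitmap bits into our own so the disk is recovered
 * against writes from all nodes.
 */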
static int gather_bitmaps(struct md_rdev *rdev)
{
	int sn, err;
	sector_t lo, hi;
	struct cluster_msg cmsg;
	struct mddev *mddev = rdev->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(RE_ADD);
	cmsg.raid_slot = rdev->desc_nr;
	err = sendmsg(cinfo, &cmsg);
	if (err)
		goto out;

	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", sn);
			goto out;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp))
			mddev->recovery_cp = lo;
	}
out:
	return err;
}

static struct md_cluster_operations cluster_ops = {
	.join = join,
	.leave = leave,
	.slot_number = slot_number,
	.resync_info_update = resync_info_update,
	.resync_start = resync_start,
	.resync_finish = resync_finish,
	.metadata_update_start = metadata_update_start,
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk_start = add_new_disk_start,
	.add_new_disk_finish = add_new_disk_finish,
	.new_disk_ack = new_disk_ack,
	.remove_disk = remove_disk,
	.gather_bitmaps = gather_bitmaps,
};

static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");
	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
	return 0;
}

static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}

module_init(cluster_init);
module_exit(cluster_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");