/*
   drbd_bitmap.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/drbd.h>
#include <linux/slab.h>
#include <linux/highmem.h>

#include "drbd_int.h"

/* OPAQUE outside this file!
 * interface defined in drbd_int.h
 *
 * convention:
 * function name drbd_bm_... => used elsewhere, "public".
 * function name      bm_... => internal to implementation, "private".
 */
/*
 * LIMITATIONS:
 * We want to support >= peta byte of backend storage, while for now still using
 * a granularity of one bit per 4KiB of storage.
 * 1 << 50              bytes backend storage (1 PiB)
 * 1 << (50 - 12)       bits needed
 *      38 --> we need u64 to index and count bits
 * 1 << (38 - 3)        bitmap bytes needed
 *      35 --> we still need u64 to index and count bytes
 *             (that's 32 GiB of bitmap for 1 PiB storage)
 * 1 << (35 - 2)        32bit longs needed
 *      33 --> we'd even need u64 to index and count 32bit long words.
 * 1 << (35 - 3)        64bit longs needed
 *      32 --> we could get away with a 32bit unsigned int to index and count
 *             64bit long words, but I rather stay with unsigned long for now.
 * We probably should neither count nor point to bytes or long words
 * directly, but either by bitnumber, or by page index and offset.
 * 1 << (35 - 12)
 *      22 --> we need that many 4KiB pages of bitmap.
 *      1 << (22 + 3) --> on a 64bit arch,
 *      we need 32 MiB to store the array of page pointers.
 *
 * Because I'm lazy, and because the resulting patch was too large, too ugly
 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
 * (1 << 32) bits * 4k storage.
 *
 * bitmap storage and IO:
 *      Bitmap is stored little endian on disk, and is kept little endian in
 *      core memory. Currently we still hold the full bitmap in core as long
 *      as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
 *      seems excessive.
 *
 *      We plan to reduce the amount of in-core bitmap pages by paging them in
 *      and out against their on-disk location as necessary, but need to make
 *      sure we don't cause too much meta data IO, and must not deadlock in
 *      tight memory situations. This needs some more work.
 */
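
/*
 * Worked instance of the arithmetic above, for a (hypothetical) 1 TiB
 * backing device, assuming 4KiB pages and 64bit longs:
 * 1 << 40              bytes backend storage (1 TiB)
 * 1 << (40 - 12)  =  1 << 28 bits
 * 1 << (28 - 3)   =  1 << 25 bytes = 32 MiB of in-core bitmap
 * 1 << (25 - 12)  =  1 << 13 = 8192 bitmap pages
 */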
/*
 * NOTE
 *  Access to the *bm_pages is protected by bm_lock.
 *  It is safe to read the other members within the lock.
 *
 *  drbd_bm_set_bits is called from bio_endio callbacks;
 *  we may be called with irq already disabled,
 *  so we need spin_lock_irqsave().
 *  And we need the kmap_atomic.
 */
struct drbd_bitmap {
        struct page **bm_pages;
        spinlock_t bm_lock;

        /* exclusively to be used by __al_write_transaction(),
         * drbd_bm_mark_for_writeout() and
         * drbd_bm_write_hinted() -> bm_rw() called from there.
         */
        unsigned int n_bitmap_hints;
        unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];

        /* see LIMITATIONS: above */

        unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
        unsigned long bm_bits;
        size_t   bm_words;
        size_t   bm_number_of_pages;
        sector_t bm_dev_capacity;
        struct mutex bm_change; /* serializes resize operations */
        wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */

        enum bm_flag bm_flags;

        /* debugging aid, in case we are still racy somewhere */
        char          *bm_why;
        struct task_struct *bm_task;
};
#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
static void __bm_print_lock_info(struct drbd_device *device, const char *func)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!__ratelimit(&drbd_ratelimit_state))
                return;
        drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
                 current->comm, task_pid_nr(current),
                 func, b->bm_why ?: "?",
                 b->bm_task->comm, task_pid_nr(b->bm_task));
}

void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
{
        struct drbd_bitmap *b = device->bitmap;
        int trylock_failed;

        if (!b) {
                drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
                return;
        }

        trylock_failed = !mutex_trylock(&b->bm_change);

        if (trylock_failed) {
                drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
                          current->comm, task_pid_nr(current),
                          why, b->bm_why ?: "?",
                          b->bm_task->comm, task_pid_nr(b->bm_task));
                mutex_lock(&b->bm_change);
        }
        if (BM_LOCKED_MASK & b->bm_flags)
                drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
        b->bm_flags |= flags & BM_LOCKED_MASK;

        b->bm_why  = why;
        b->bm_task = current;
}

void drbd_bm_unlock(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!b) {
                drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
                return;
        }

        if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
                drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");

        b->bm_flags &= ~BM_LOCKED_MASK;
        b->bm_why  = NULL;
        b->bm_task = NULL;
        mutex_unlock(&b->bm_change);
}
/* we store some "meta" info about our pages in page->private */
/* at a granularity of 4k storage per bitmap bit:
 * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
 *  1<<38 bits,
 *  1<<23 4k bitmap pages.
 * Use 24 bits as page index, covers 2 peta byte storage
 * at a granularity of 4k per bit.
 * Used to report the failed page idx on io error from the endio handlers.
 */
#define BM_PAGE_IDX_MASK        ((1UL<<24)-1)
/* this page is currently read in, or written back */
#define BM_PAGE_IO_LOCK         31
/* if there has been an IO error for this page */
#define BM_PAGE_IO_ERROR        30
/* this is to be able to intelligently skip disk IO,
 * set if bits have been set since last IO. */
#define BM_PAGE_NEED_WRITEOUT   29
/* to mark for lazy writeout once syncer cleared all clearable bits,
 * set if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT   28
/* pages marked with this "HINT" will be considered for writeout
 * on activity log transactions */
#define BM_PAGE_HINT_WRITEOUT   27

/* store_page_idx uses non-atomic assignment. It is only used directly after
 * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
 * requires it all to be atomic as well. */
static void bm_store_page_idx(struct page *page, unsigned long idx)
{
        BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
        set_page_private(page, idx);
}

static unsigned long bm_page_to_idx(struct page *page)
{
        return page_private(page) & BM_PAGE_IDX_MASK;
}
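
/*
 * Resulting layout of page->private, a sketch derived from the defines
 * above (not part of the original source):
 *
 *   bit 31       BM_PAGE_IO_LOCK
 *   bit 30       BM_PAGE_IO_ERROR
 *   bit 29       BM_PAGE_NEED_WRITEOUT
 *   bit 28       BM_PAGE_LAZY_WRITEOUT
 *   bit 27       BM_PAGE_HINT_WRITEOUT
 *   bits 0..23   page index (BM_PAGE_IDX_MASK)
 */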
/* As it is very unlikely that the same page is under IO from more than one
 * context, we can get away with a bit per page and one wait queue per bitmap.
 */
static void bm_page_lock_io(struct drbd_device *device, int page_nr)
{
        struct drbd_bitmap *b = device->bitmap;
        void *addr = &page_private(b->bm_pages[page_nr]);
        wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
}

static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
{
        struct drbd_bitmap *b = device->bitmap;
        void *addr = &page_private(b->bm_pages[page_nr]);
        clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
        wake_up(&device->bitmap->bm_io_wait);
}
/* set _before_ submit_io, so it may be reset due to being changed
 * while this page is in flight... will get submitted later again */
static void bm_set_page_unchanged(struct page *page)
{
        /* use cmpxchg? */
        clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
        clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}

static void bm_set_page_need_writeout(struct page *page)
{
        set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}

void drbd_bm_reset_al_hints(struct drbd_device *device)
{
        device->bitmap->n_bitmap_hints = 0;
}

/**
 * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
 * @device:     DRBD device.
 * @page_nr:    the bitmap page to mark with the "hint" flag
 *
 * From within an activity log transaction, we mark a few pages with these
 * hints, then call drbd_bm_write_hinted(), which will only write out changed
 * pages which are flagged with this mark.
 */
void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
{
        struct drbd_bitmap *b = device->bitmap;
        struct page *page;
        if (page_nr >= device->bitmap->bm_number_of_pages) {
                drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
                          page_nr, (int)device->bitmap->bm_number_of_pages);
                return;
        }
        page = device->bitmap->bm_pages[page_nr];
        BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
        if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
                b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
}

static int bm_test_page_unchanged(struct page *page)
{
        volatile const unsigned long *addr = &page_private(page);
        return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
}

static void bm_set_page_io_err(struct page *page)
{
        set_bit(BM_PAGE_IO_ERROR, &page_private(page));
}

static void bm_clear_page_io_err(struct page *page)
{
        clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
}

static void bm_set_page_lazy_writeout(struct page *page)
{
        set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}

static int bm_test_page_lazy_writeout(struct page *page)
{
        return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}
/* on a 32bit box, this would allow for exactly (2<<38) bits. */
static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
{
        /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
        unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
        BUG_ON(page_nr >= b->bm_number_of_pages);
        return page_nr;
}

static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
{
        /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
        unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
        BUG_ON(page_nr >= b->bm_number_of_pages);
        return page_nr;
}

static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
        struct page *page = b->bm_pages[idx];
        return (unsigned long *) kmap_atomic(page);
}

static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
        return __bm_map_pidx(b, idx);
}

static void __bm_unmap(unsigned long *p_addr)
{
        kunmap_atomic(p_addr);
}

static void bm_unmap(unsigned long *p_addr)
{
        __bm_unmap(p_addr);
}
/* long word offset of _bitmap_ sector */
#define S2W(s)  ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))

/* word offset from start of bitmap to word number _in_page_
 * modulo longs per page
 *      #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
 * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
 * so do it explicitly:
 */
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))

/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))
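
/*
 * For illustration (assuming 4KiB pages and 64bit longs, neither of which
 * the macros above require): LWPP = 4096/8 = 512, so MLPP(X) = X & 511,
 * which equals X % 512 for the non-negative offsets used here.
 */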
/*
 * actually most functions herein should take a struct drbd_bitmap*, not a
 * struct drbd_device*, but for the debug macros I like to have the device around
 * to be able to report device-specific messages.
 */

static void bm_free_pages(struct page **pages, unsigned long number)
{
        unsigned long i;
        if (!pages)
                return;

        for (i = 0; i < number; i++) {
                if (!pages[i]) {
                        pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
                                 i, number);
                        continue;
                }
                __free_page(pages[i]);
                pages[i] = NULL;
        }
}
static inline void bm_vk_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * "have" and "want" are NUMBER OF PAGES.
 */
static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
        struct page **old_pages = b->bm_pages;
        struct page **new_pages, *page;
        unsigned int i, bytes;
        unsigned long have = b->bm_number_of_pages;

        BUG_ON(have == 0 && old_pages != NULL);
        BUG_ON(have != 0 && old_pages == NULL);

        if (have == want)
                return old_pages;

        /* Trying kmalloc first, falling back to vmalloc.
         * GFP_NOIO, as this is called while drbd IO is "suspended",
         * and during resize or attach on diskless Primary,
         * we must not block on IO to ourselves.
         * Context is receiver thread or dmsetup. */
        bytes = sizeof(struct page *)*want;
        new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
        if (!new_pages) {
                new_pages = __vmalloc(bytes,
                                GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
                                PAGE_KERNEL);
                if (!new_pages)
                        return NULL;
        }

        if (want >= have) {
                for (i = 0; i < have; i++)
                        new_pages[i] = old_pages[i];
                for (; i < want; i++) {
                        page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                        if (!page) {
                                bm_free_pages(new_pages + have, i - have);
                                bm_vk_free(new_pages);
                                return NULL;
                        }
                        /* we want to know which page it is
                         * from the endio handlers */
                        bm_store_page_idx(page, i);
                        new_pages[i] = page;
                }
        } else {
                for (i = 0; i < want; i++)
                        new_pages[i] = old_pages[i];
                /* NOT HERE, we are outside the spinlock!
                bm_free_pages(old_pages + want, have - want);
                */
        }

        return new_pages;
}
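
/*
 * For example (illustrative numbers, not from the original source):
 * growing from have = 2 to want = 4 copies the two existing page pointers
 * and allocates two fresh pages; shrinking from 4 to 2 only copies the
 * first two pointers, and the surplus pages are freed later by the
 * caller, outside the spinlock.
 */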
/*
 * allocates the drbd_bitmap and stores it in device->bitmap.
 */
int drbd_bm_init(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        WARN_ON(b != NULL);
        b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
        if (!b)
                return -ENOMEM;
        spin_lock_init(&b->bm_lock);
        mutex_init(&b->bm_change);
        init_waitqueue_head(&b->bm_io_wait);

        device->bitmap = b;

        return 0;
}

sector_t drbd_bm_capacity(struct drbd_device *device)
{
        if (!expect(device->bitmap))
                return 0;
        return device->bitmap->bm_dev_capacity;
}

/* called on driver unload. TODO: call when a device is destroyed.
 */
void drbd_bm_cleanup(struct drbd_device *device)
{
        if (!expect(device->bitmap))
                return;
        bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
        bm_vk_free(device->bitmap->bm_pages);
        kfree(device->bitmap);
        device->bitmap = NULL;
}
/*
 * since (b->bm_bits % BITS_PER_LONG) != 0,
 * this masks out the remaining bits.
 * Returns the number of bits cleared.
 */
#ifndef BITS_PER_PAGE
#define BITS_PER_PAGE           (1UL << (PAGE_SHIFT + 3))
#define BITS_PER_PAGE_MASK      (BITS_PER_PAGE - 1)
#else
# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
#  error "ambiguous BITS_PER_PAGE"
# endif
#endif
#define BITS_PER_LONG_MASK      (BITS_PER_LONG - 1)
static int bm_clear_surplus(struct drbd_bitmap *b)
{
        unsigned long mask;
        unsigned long *p_addr, *bm;
        int tmp;
        int cleared = 0;

        /* number of bits modulo bits per page */
        tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
        /* mask the used bits of the word containing the last bit */
        mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
        /* bitmap is always stored little endian,
         * on disk and in core memory alike */
        mask = cpu_to_lel(mask);

        p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
        bm = p_addr + (tmp/BITS_PER_LONG);
        if (mask) {
                /* If mask != 0, we are not exactly aligned, so bm now points
                 * to the long containing the last bit.
                 * If mask == 0, bm already points to the word immediately
                 * after the last (long word aligned) bit. */
                cleared = hweight_long(*bm & ~mask);
                *bm &= mask;
                bm++;
        }

        if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
                /* on a 32bit arch, we may need to zero out
                 * a padding long to align with a 64bit remote */
                cleared += hweight_long(*bm);
                *bm = 0;
        }
        bm_unmap(p_addr);
        return cleared;
}
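
/*
 * Worked instance of the masking above (illustrative numbers, 64bit longs
 * assumed): bm_bits = 100 gives tmp = 100, so
 * mask = (1UL << (100 & 63)) - 1 = (1UL << 36) - 1; bits 0..35 of the last
 * used long stay valid, bits 36..63 are the surplus and get cleared.
 */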
static void bm_set_surplus(struct drbd_bitmap *b)
{
        unsigned long mask;
        unsigned long *p_addr, *bm;
        int tmp;

        /* number of bits modulo bits per page */
        tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
        /* mask the used bits of the word containing the last bit */
        mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
        /* bitmap is always stored little endian,
         * on disk and in core memory alike */
        mask = cpu_to_lel(mask);

        p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
        bm = p_addr + (tmp/BITS_PER_LONG);
        if (mask) {
                /* If mask != 0, we are not exactly aligned, so bm now points
                 * to the long containing the last bit.
                 * If mask == 0, bm already points to the word immediately
                 * after the last (long word aligned) bit. */
                *bm |= ~mask;
                bm++;
        }

        if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
                /* on a 32bit arch, we may need to fill (not zero)
                 * a padding long to align with a 64bit remote */
                *bm = ~0UL;
        }
        bm_unmap(p_addr);
}
/* you better not modify the bitmap while this is running,
 * or its results will be stale */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
        unsigned long *p_addr;
        unsigned long bits = 0;
        unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) - 1;
        int idx, last_word;

        /* all but last page */
        for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
                p_addr = __bm_map_pidx(b, idx);
                bits += bitmap_weight(p_addr, BITS_PER_PAGE);
                __bm_unmap(p_addr);
                cond_resched();
        }
        /* last (or only) page */
        last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
        p_addr = __bm_map_pidx(b, idx);
        bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
        p_addr[last_word] &= cpu_to_lel(mask);
        bits += hweight_long(p_addr[last_word]);
        /* 32bit arch, may have an unused padding long */
        if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
                p_addr[last_word+1] = 0;
        __bm_unmap(p_addr);
        return bits;
}
/* offset and len in long words. */
static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
{
        unsigned long *p_addr, *bm;
        unsigned int idx;
        size_t do_now, end;

        end = offset + len;

        if (end > b->bm_words) {
                pr_alert("bm_memset end > bm_words\n");
                return;
        }

        while (offset < end) {
                do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
                idx = bm_word_to_page_idx(b, offset);
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                if (bm+do_now > p_addr + LWPP) {
                        pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
                                 p_addr, bm, (int)do_now);
                } else
                        memset(bm, c, do_now * sizeof(long));
                bm_unmap(p_addr);
                bm_set_page_need_writeout(b->bm_pages[idx]);
                offset += do_now;
        }
}
/* For the layout, see comment above drbd_md_set_sector_offsets(). */
static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
{
        u64 bitmap_sectors;
        if (ldev->md.al_offset == 8)
                bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
        else
                bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
        return bitmap_sectors << (9 + 3);
}
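
/*
 * The shift above: << 9 converts 512-byte sectors to bytes, the further
 * << 3 converts bytes to bits. E.g. 8 bitmap sectors can hold
 * 8 << 12 = 32768 on-disk bitmap bits (an illustrative number).
 */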
/*
 * make sure the bitmap has enough room for the attached storage,
 * if necessary, resize.
 * called whenever we may have changed the device size.
 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 * In case this is actually a resize, we copy the old bitmap into the new one.
 * Otherwise, the bitmap is initialized to all bits set.
 */
int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long bits, words, owords, obits;
        unsigned long want, have, onpages; /* number of pages */
        struct page **npages, **opages = NULL;
        int err = 0;
        bool growing;

        if (!expect(b))
                return -ENOMEM;

        drbd_bm_lock(device, "resize", BM_LOCKED_MASK);

        drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
                        (unsigned long long)capacity);

        if (capacity == b->bm_dev_capacity)
                goto out;

        if (capacity == 0) {
                spin_lock_irq(&b->bm_lock);
                opages = b->bm_pages;
                onpages = b->bm_number_of_pages;
                owords = b->bm_words;
                b->bm_pages = NULL;
                b->bm_number_of_pages =
                b->bm_set =
                b->bm_bits =
                b->bm_words =
                b->bm_dev_capacity = 0;
                spin_unlock_irq(&b->bm_lock);
                bm_free_pages(opages, onpages);
                bm_vk_free(opages);
                goto out;
        }
        bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));

        /* if we would use
           words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
           a 32bit host could present the wrong number of words
           to a 64bit host.
        */
        words = ALIGN(bits, 64) >> LN2_BPL;
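        /* e.g. bits = 100 is padded to 128 bits: two 64bit longs or four
         * 32bit longs, so both word sizes describe the same byte count
         * (an illustrative number, not from the original source) */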
        if (get_ldev(device)) {
                u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
                put_ldev(device);
                if (bits > bits_on_disk) {
                        drbd_info(device, "bits = %lu\n", bits);
                        drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
                        err = -ENOSPC;
                        goto out;
                }
        }

        want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
        have = b->bm_number_of_pages;
        if (want == have) {
                D_ASSERT(device, b->bm_pages != NULL);
                npages = b->bm_pages;
        } else {
                if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
                        npages = NULL;
                else
                        npages = bm_realloc_pages(b, want);
        }

        if (!npages) {
                err = -ENOMEM;
                goto out;
        }

        spin_lock_irq(&b->bm_lock);
        opages = b->bm_pages;
        owords = b->bm_words;
        obits = b->bm_bits;

        growing = bits > obits;
        if (opages && growing && set_new_bits)
                bm_set_surplus(b);

        b->bm_pages = npages;
        b->bm_number_of_pages = want;
        b->bm_bits = bits;
        b->bm_words = words;
        b->bm_dev_capacity = capacity;

        if (growing) {
                if (set_new_bits) {
                        bm_memset(b, owords, 0xff, words-owords);
                        b->bm_set += bits - obits;
                } else
                        bm_memset(b, owords, 0x00, words-owords);
        }

        if (want < have) {
                /* implicit: (opages != NULL) && (opages != npages) */
                bm_free_pages(opages + want, have - want);
        }

        (void)bm_clear_surplus(b);

        spin_unlock_irq(&b->bm_lock);
        if (opages != npages)
                bm_vk_free(opages);
        if (!growing)
                b->bm_set = bm_count_bits(b);
        drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);

 out:
        drbd_bm_unlock(device);
        return err;
}
/* inherently racy:
 * if not protected by other means, return value may be out of date when
 * leaving this function...
 * we still need to lock it, since it is important that this returns
 * bm_set == 0 precisely.
 *
 * maybe bm_set should be atomic_t ?
 */
unsigned long _drbd_bm_total_weight(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long s;
        unsigned long flags;

        if (!expect(b))
                return 0;
        if (!expect(b->bm_pages))
                return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        s = b->bm_set;
        spin_unlock_irqrestore(&b->bm_lock, flags);

        return s;
}

unsigned long drbd_bm_total_weight(struct drbd_device *device)
{
        unsigned long s;
        /* if I don't have a disk, I don't know about out-of-sync status */
        if (!get_ldev_if_state(device, D_NEGOTIATING))
                return 0;
        s = _drbd_bm_total_weight(device);
        put_ldev(device);
        return s;
}

size_t drbd_bm_words(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!expect(b))
                return 0;
        if (!expect(b->bm_pages))
                return 0;

        return b->bm_words;
}

unsigned long drbd_bm_bits(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!expect(b))
                return 0;

        return b->bm_bits;
}
/* merge number words from buffer into the bitmap starting at offset.
 * buffer[i] is expected to be little endian unsigned long.
 * bitmap must be locked by drbd_bm_lock.
 * currently only used from receive_bitmap.
 */
void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
                        unsigned long *buffer)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr, *bm;
        unsigned long word, bits;
        unsigned int idx;
        size_t end, do_now;

        end = offset + number;

        if (!expect(b))
                return;
        if (!expect(b->bm_pages))
                return;
        if (number == 0)
                return;
        WARN_ON(offset >= b->bm_words);
        WARN_ON(end    >  b->bm_words);

        spin_lock_irq(&b->bm_lock);
        while (offset < end) {
                do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
                idx = bm_word_to_page_idx(b, offset);
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                offset += do_now;
                while (do_now--) {
                        bits = hweight_long(*bm);
                        word = *bm | *buffer++;
                        *bm++ = word;
                        b->bm_set += hweight_long(word) - bits;
                }
                bm_unmap(p_addr);
                bm_set_page_need_writeout(b->bm_pages[idx]);
        }
        /* with 32bit <-> 64bit cross-platform connect
         * this is only correct for current usage,
         * where we _know_ that we are 64 bit aligned,
         * and know that this function is used in this way, too...
         */
        if (end == b->bm_words)
                b->bm_set -= bm_clear_surplus(b);

        spin_unlock_irq(&b->bm_lock);
}
/* copy number words from the bitmap starting at offset into the buffer.
 * buffer[i] will be little endian unsigned long.
 */
void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
                     unsigned long *buffer)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr, *bm;
        size_t end, do_now;

        end = offset + number;

        if (!expect(b))
                return;
        if (!expect(b->bm_pages))
                return;

        spin_lock_irq(&b->bm_lock);
        if ((offset >= b->bm_words) ||
            (end    >  b->bm_words) ||
            (number <= 0))
                drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
                        (unsigned long) offset,
                        (unsigned long) number,
                        (unsigned long) b->bm_words);
        else {
                while (offset < end) {
                        do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
                        p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
                        bm = p_addr + MLPP(offset);
                        offset += do_now;
                        while (do_now--)
                                *buffer++ = *bm++;
                        bm_unmap(p_addr);
                }
        }
        spin_unlock_irq(&b->bm_lock);
}
/* set all bits in the bitmap */
void drbd_bm_set_all(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!expect(b))
                return;
        if (!expect(b->bm_pages))
                return;

        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0xff, b->bm_words);
        (void)bm_clear_surplus(b);
        b->bm_set = b->bm_bits;
        spin_unlock_irq(&b->bm_lock);
}

/* clear all bits in the bitmap */
void drbd_bm_clear_all(struct drbd_device *device)
{
        struct drbd_bitmap *b = device->bitmap;
        if (!expect(b))
                return;
        if (!expect(b->bm_pages))
                return;

        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0, b->bm_words);
        b->bm_set = 0;
        spin_unlock_irq(&b->bm_lock);
}
static void drbd_bm_aio_ctx_destroy(struct kref *kref)
{
        struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
        unsigned long flags;

        spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
        list_del(&ctx->list);
        spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
        put_ldev(ctx->device);
        kfree(ctx);
}

/* bv_page may be a copy, or may be the original */
static void drbd_bm_endio(struct bio *bio)
{
        struct drbd_bm_aio_ctx *ctx = bio->bi_private;
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);

        if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
            !bm_test_page_unchanged(b->bm_pages[idx]))
                drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);

        if (bio->bi_error) {
                /* ctx error will hold the completed-last non-zero error code,
                 * in case error codes differ. */
                ctx->error = bio->bi_error;
                bm_set_page_io_err(b->bm_pages[idx]);
                /* Not identical to on disk version of it.
                 * Is BM_PAGE_IO_ERROR enough? */
                if (__ratelimit(&drbd_ratelimit_state))
                        drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
                                 bio->bi_error, idx);
        } else {
                bm_clear_page_io_err(b->bm_pages[idx]);
                dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
        }

        bm_page_unlock_io(device, idx);

        if (ctx->flags & BM_AIO_COPY_PAGES)
                mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);

        bio_put(bio);

        if (atomic_dec_and_test(&ctx->in_flight)) {
                ctx->done = 1;
                wake_up(&device->misc_wait);
                kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        }
}
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
        struct bio *bio = bio_alloc_drbd(GFP_NOIO);
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        struct page *page;
        unsigned int len;
        unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;

        sector_t on_disk_sector =
                device->ldev->md.md_offset + device->ldev->md.bm_offset;
        on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
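        /* e.g. with 4KiB pages, PAGE_SHIFT-9 = 3, so bitmap page N starts
         * 8*N sectors into the on-disk bitmap area (illustrative; the code
         * itself only assumes PAGE_SHIFT >= 9) */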
        /* this might happen with very small
         * flexible external meta data device,
         * or with PAGE_SIZE > 4k */
        len = min_t(unsigned int, PAGE_SIZE,
                (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);

        /* serialize IO on this page */
        bm_page_lock_io(device, page_nr);
        /* before memcpy and submit,
         * so it can be redirtied any time */
        bm_set_page_unchanged(b->bm_pages[page_nr]);

        if (ctx->flags & BM_AIO_COPY_PAGES) {
                page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
                copy_highpage(page, b->bm_pages[page_nr]);
                bm_store_page_idx(page, page_nr);
        } else
                page = b->bm_pages[page_nr];
        bio->bi_bdev = device->ldev->md_bdev;
        bio->bi_iter.bi_sector = on_disk_sector;
        /* bio_add_page of a single page to an empty bio will always succeed,
         * according to api.  Do we want to assert that? */
        bio_add_page(bio, page, len, 0);
        bio->bi_private = ctx;
        bio->bi_end_io = drbd_bm_endio;
        bio_set_op_attrs(bio, op, 0);

        if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
                bio_io_error(bio);
        } else {
                submit_bio(bio);
                /* this should not count as user activity and cause the
                 * resync to throttle -- see drbd_rs_should_slow_down(). */
                atomic_add(len >> 9, &device->rs_sect_ev);
        }
}
/*
 * bm_rw: read/write the whole bitmap from/to its on disk location.
 */
static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
        struct drbd_bm_aio_ctx *ctx;
        struct drbd_bitmap *b = device->bitmap;
        unsigned int num_pages, i, count = 0;
        unsigned long now;
        char ppb[10];
        int err = 0;

        /*
         * We are protected against bitmap disappearing/resizing by holding an
         * ldev reference (caller must have called get_ldev()).
         * For read/write, we are protected against changes to the bitmap by
         * the bitmap lock (see drbd_bitmap_io).
         * For lazy writeout, we don't care for ongoing changes to the bitmap,
         * as we submit copies of pages anyways.
         */

        ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
        if (!ctx)
                return -ENOMEM;

        *ctx = (struct drbd_bm_aio_ctx) {
                .device = device,
                .start_jif = jiffies,
                .in_flight = ATOMIC_INIT(1),
                .done = 0,
                .flags = flags,
                .error = 0,
                .kref = { ATOMIC_INIT(2) },
        };

        if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
                drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
                kfree(ctx);
                return -ENODEV;
        }
        /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
           drbd_adm_attach(), after device->ldev was assigned. */

        if (0 == (ctx->flags & ~BM_AIO_READ))
                WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));

        spin_lock_irq(&device->resource->req_lock);
        list_add_tail(&ctx->list, &device->pending_bitmap_io);
        spin_unlock_irq(&device->resource->req_lock);

        num_pages = b->bm_number_of_pages;

        now = jiffies;

        /* let the layers below us try to merge these bios... */
        if (flags & BM_AIO_READ) {
                for (i = 0; i < num_pages; i++) {
                        atomic_inc(&ctx->in_flight);
                        bm_page_io_async(ctx, i);
                        ++count;
                        cond_resched();
                }
        } else if (flags & BM_AIO_WRITE_HINTED) {
                /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
                unsigned int hint;
                for (hint = 0; hint < b->n_bitmap_hints; hint++) {
                        i = b->al_bitmap_hints[hint];
                        if (i >= num_pages) /* == -1U: no hint here. */
                                continue;
                        /* Several AL-extents may point to the same page. */
                        if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
                            &page_private(b->bm_pages[i])))
                                continue;
                        /* Has it even changed? */
                        if (bm_test_page_unchanged(b->bm_pages[i]))
                                continue;
                        atomic_inc(&ctx->in_flight);
                        bm_page_io_async(ctx, i);
                        ++count;
                }
        } else {
                for (i = 0; i < num_pages; i++) {
                        /* ignore completely unchanged pages */
                        if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                                break;
                        if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
                            bm_test_page_unchanged(b->bm_pages[i])) {
                                dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
                                continue;
                        }
                        /* during lazy writeout,
                         * ignore those pages not marked for lazy writeout. */
                        if (lazy_writeout_upper_idx &&
                            !bm_test_page_lazy_writeout(b->bm_pages[i])) {
                                dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
                                continue;
                        }
                        atomic_inc(&ctx->in_flight);
                        bm_page_io_async(ctx, i);
                        ++count;
                        cond_resched();
                }
        }

        /*
         * We initialize ctx->in_flight to one to make sure drbd_bm_endio
         * will not set ctx->done early, and decrement / test it here.  If there
         * are still some bios in flight, we need to wait for them here.
         * If all IO is done already (or nothing had been submitted), there is
         * no need to wait.  Still, we need to put the kref associated with the
         * "in_flight reached zero, all done" event.
         */
        if (!atomic_dec_and_test(&ctx->in_flight))
                wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
        else
                kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);

        /* summary for global bitmap IO */
        if (flags == 0) {
                unsigned int ms = jiffies_to_msecs(jiffies - now);
                if (ms > 5) {
                        drbd_info(device, "bitmap %s of %u pages took %u ms\n",
                                 (flags & BM_AIO_READ) ? "READ" : "WRITE",
                                 count, ms);
                }
        }

        if (ctx->error) {
                drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
                drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
                err = -EIO; /* ctx->error ? */
        }

        if (atomic_read(&ctx->in_flight))
                err = -EIO; /* Disk timeout/force-detach during IO... */

        now = jiffies;
        if (flags & BM_AIO_READ) {
                b->bm_set = bm_count_bits(b);
                drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
                          jiffies - now);
        }
        now = b->bm_set;

        if ((flags & ~BM_AIO_READ) == 0)
                drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
                          ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

        kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        return err;
}
/**
 * drbd_bm_read() - Read the whole bitmap from its on disk location.
 * @device:     DRBD device.
 */
int drbd_bm_read(struct drbd_device *device) __must_hold(local)
{
        return bm_rw(device, BM_AIO_READ, 0);
}

/**
 * drbd_bm_write() - Write the whole bitmap to its on disk location.
 * @device:     DRBD device.
 *
 * Will only write pages that have changed since last IO.
 */
int drbd_bm_write(struct drbd_device *device) __must_hold(local)
{
        return bm_rw(device, 0, 0);
}

/**
 * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
 * @device:     DRBD device.
 *
 * Will write all pages.
 */
int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
{
        return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
}

/**
 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
 * @device:     DRBD device.
 * @upper_idx:  0: write all changed pages; +ve: page index to stop scanning for changed pages
 */
int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
{
        return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
}

/**
 * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
 * @device:     DRBD device.
 *
 * Will only write pages that have changed since last IO.
 * In contrast to drbd_bm_write(), this will copy the bitmap pages
 * to temporary writeout pages. It is intended to trigger a full write-out
 * while still allowing the bitmap to change, for example if a resync or online
 * verify is aborted due to a failed peer disk, while local IO continues, or
 * pending resync acks are still being processed.
 */
int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
{
        return bm_rw(device, BM_AIO_COPY_PAGES, 0);
}

/**
 * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
 * @device:     DRBD device.
 */
int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
{
        return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
}
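
/*
 * Summary of the bm_rw() entry points above (derived from the calls, not
 * part of the original source):
 *   drbd_bm_read()              BM_AIO_READ
 *   drbd_bm_write()             0 (changed pages only, in place)
 *   drbd_bm_write_all()         BM_AIO_WRITE_ALL_PAGES
 *   drbd_bm_write_lazy()        BM_AIO_COPY_PAGES, limited by upper_idx
 *   drbd_bm_write_copy_pages()  BM_AIO_COPY_PAGES
 *   drbd_bm_write_hinted()      BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES
 */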
/* NOTE
 * find_first_bit returns int, we return unsigned long.
 * For this to work on 32bit arch with bitnumbers > (1<<32),
 * we'd need to return u64, and get a whole lot of other places
 * fixed where we still use unsigned long.
 *
 * this returns a bit number, NOT a sector!
 */
static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
        const int find_zero_bit)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr;
        unsigned long bit_offset;
        unsigned i;

        if (bm_fo > b->bm_bits) {
                drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
                bm_fo = DRBD_END_OF_BITMAP;
        } else {
                while (bm_fo < b->bm_bits) {
                        /* bit offset of the first bit in the page */
                        bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
                        p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));

                        if (find_zero_bit)
                                i = find_next_zero_bit_le(p_addr,
                                                PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
                        else
                                i = find_next_bit_le(p_addr,
                                                PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);

                        __bm_unmap(p_addr);
                        if (i < PAGE_SIZE*8) {
                                bm_fo = bit_offset + i;
                                if (bm_fo >= b->bm_bits)
                                        break;
                                goto found;
                        }
                        bm_fo = bit_offset + PAGE_SIZE*8;
                }
                bm_fo = DRBD_END_OF_BITMAP;
        }
 found:
        return bm_fo;
}
static unsigned long bm_find_next(struct drbd_device *device,
        unsigned long bm_fo, const int find_zero_bit)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long i = DRBD_END_OF_BITMAP;

        if (!expect(b))
                return i;
        if (!expect(b->bm_pages))
                return i;

        spin_lock_irq(&b->bm_lock);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(device);

        i = __bm_find_next(device, bm_fo, find_zero_bit);

        spin_unlock_irq(&b->bm_lock);
        return i;
}

unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
{
        return bm_find_next(device, bm_fo, 0);
}

#if 0
/* not yet needed for anything. */
unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
{
        return bm_find_next(device, bm_fo, 1);
}
#endif

/* does not spin_lock_irqsave.
 * you must take drbd_bm_lock() first */
unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
{
        /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
        return __bm_find_next(device, bm_fo, 0);
}

unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
{
        /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
        return __bm_find_next(device, bm_fo, 1);
}
/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector.
 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
 * Must hold bitmap lock already. */
static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
        unsigned long e, int val)
{
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr = NULL;
        unsigned long bitnr;
        unsigned int last_page_nr = -1U;
        int c = 0;
        int changed_total = 0;

        if (e >= b->bm_bits) {
                drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
                                s, e, b->bm_bits);
                e = b->bm_bits ? b->bm_bits - 1 : 0;
        }
        for (bitnr = s; bitnr <= e; bitnr++) {
                unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
                if (page_nr != last_page_nr) {
                        if (p_addr)
                                __bm_unmap(p_addr);
                        if (c < 0)
                                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
                        else if (c > 0)
                                bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
                        changed_total += c;
                        c = 0;
                        p_addr = __bm_map_pidx(b, page_nr);
                        last_page_nr = page_nr;
                }
                if (val)
                        c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
                else
                        c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
        }
        if (p_addr)
                __bm_unmap(p_addr);
        if (c < 0)
                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
        else if (c > 0)
                bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
        changed_total += c;
        b->bm_set += changed_total;
        return changed_total;
}

/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector */
static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
        const unsigned long e, int val)
{
        unsigned long flags;
        struct drbd_bitmap *b = device->bitmap;
        int c = 0;

        if (!expect(b))
                return 1;
        if (!expect(b->bm_pages))
                return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
                bm_print_lock_info(device);

        c = __bm_change_bits_to(device, s, e, val);

        spin_unlock_irqrestore(&b->bm_lock, flags);
        return c;
}

/* returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
        return bm_change_bits_to(device, s, e, 1);
}

/* returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
        return -bm_change_bits_to(device, s, e, 0);
}
/* sets all bits in full words,
 * from first_word up to, but not including, last_word */
static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
                int page_nr, int first_word, int last_word)
{
        int i;
        int bits;
        int changed = 0;
        unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);

        /* I think it is more cache line friendly to hweight_long() each word,
         * then set it to ~0UL, than to first bitmap_weight() all words,
         * then bitmap_fill() all words */
        for (i = first_word; i < last_word; i++) {
                bits = hweight_long(paddr[i]);
                paddr[i] = ~0UL;
                changed += BITS_PER_LONG - bits;
        }
        kunmap_atomic(paddr);
        if (changed) {
                /* We only need lazy writeout, the information is still in the
                 * remote bitmap as well, and is reconstructed during the next
                 * bitmap exchange, if lost locally due to a crash. */
                bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
                b->bm_set += changed;
        }
}

/* Same thing as drbd_bm_set_bits,
 * but more efficient for a large bit range.
 * You must first drbd_bm_lock().
 * Can be called to set the whole bitmap in one go.
 * Sets bits from s to e _inclusive_. */
void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
        /* First set_bit from the first bit (s)
         * up to the next long boundary (sl),
         * then assign full words up to the last long boundary (el),
         * then set_bit up to and including the last bit (e).
         *
         * Do not use memset, because we must account for changes,
         * so we need to loop over the words with hweight() anyways.
         */
        struct drbd_bitmap *b = device->bitmap;
        unsigned long sl = ALIGN(s,BITS_PER_LONG);
        unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
        int first_page;
        int last_page;
        int page_nr;
        int first_word;
        int last_word;

        if (e - s <= 3*BITS_PER_LONG) {
                /* don't bother; el and sl may even be wrong. */
                spin_lock_irq(&b->bm_lock);
                __bm_change_bits_to(device, s, e, 1);
                spin_unlock_irq(&b->bm_lock);
                return;
        }

        /* difference is large enough that we can trust sl and el */

        spin_lock_irq(&b->bm_lock);

        /* bits filling the current long */
        if (sl)
                __bm_change_bits_to(device, s, sl-1, 1);

        first_page = sl >> (3 + PAGE_SHIFT);
        last_page = el >> (3 + PAGE_SHIFT);

        /* MLPP: modulo longs per page */
        /* LWPP: long words per page */
        first_word = MLPP(sl >> LN2_BPL);
        last_word = LWPP;

        /* first and full pages, unless first page == last page */
        for (page_nr = first_page; page_nr < last_page; page_nr++) {
                bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
                spin_unlock_irq(&b->bm_lock);
                cond_resched();
                first_word = 0;
                spin_lock_irq(&b->bm_lock);
        }
        /* last page (respectively only page, for first page == last page) */
        last_word = MLPP(el >> LN2_BPL);

        /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1 (or multiples).
         * ==> e = 32767, el = 32768, last_page = 1 (one past the last allocated page),
         * and now last_word = 0.
         * We do not want to touch last_page in this case,
         * as we did not allocate it, it is not present in bitmap->bm_pages.
         */
        if (last_word)
                bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);

        /* possibly trailing bits.
         * example: (e & 63) == 63, el will be e+1.
         * if that even was the very last bit,
         * it would trigger an assert in __bm_change_bits_to()
         */
        if (el <= e)
                __bm_change_bits_to(device, el, e, 1);
        spin_unlock_irq(&b->bm_lock);
}
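
/*
 * Worked example of the head/body/tail split above (illustrative numbers,
 * 64bit longs assumed): s = 70, e = 300 gives sl = ALIGN(70, 64) = 128 and
 * el = (300+1) & ~63 = 256, so bits 70..127 and 256..300 go through
 * __bm_change_bits_to(), while bits 128..255 are set word-wise by
 * bm_set_full_words_within_one_page().
 */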
/* returns bit state
 * wants bitnr, NOT sector.
 * inherently racy... area needs to be locked by means of {al,rs}_lru
 *  1 ... bit set
 *  0 ... bit not set
 * -1 ... first out of bounds access, stop testing for bits!
 */
int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
{
        unsigned long flags;
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr;
        int i;

        if (!expect(b))
                return 0;
        if (!expect(b->bm_pages))
                return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(device);
        if (bitnr < b->bm_bits) {
                p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
                i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
                bm_unmap(p_addr);
        } else if (bitnr == b->bm_bits) {
                i = -1;
        } else { /* (bitnr > b->bm_bits) */
                drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
                i = 0;
        }

        spin_unlock_irqrestore(&b->bm_lock, flags);
        return i;
}
/* returns number of bits set in the range [s, e] */
int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
        unsigned long flags;
        struct drbd_bitmap *b = device->bitmap;
        unsigned long *p_addr = NULL;
        unsigned long bitnr;
        unsigned int page_nr = -1U;
        int c = 0;

        /* If this is called without a bitmap, that is a bug.  But just to be
         * robust in case we screwed up elsewhere, in that case pretend there
         * was one dirty bit in the requested area, so we won't try to do a
         * local read there (no bitmap probably implies no disk) */
        if (!expect(b))
                return 1;
        if (!expect(b->bm_pages))
                return 1;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(device);
        for (bitnr = s; bitnr <= e; bitnr++) {
                unsigned int idx = bm_bit_to_page_idx(b, bitnr);
                if (page_nr != idx) {
                        page_nr = idx;
                        if (p_addr)
                                bm_unmap(p_addr);
                        p_addr = bm_map_pidx(b, idx);
                }
                if (expect(bitnr < b->bm_bits))
                        c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
                else
                        drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
        }
        if (p_addr)
                bm_unmap(p_addr);
        spin_unlock_irqrestore(&b->bm_lock, flags);
        return c;
}
/* inherently racy...
 * return value may be already out-of-date when this function returns.
 * but the general usage is that this is only used during a cstate when bits are
 * only cleared, not set, and we typically only care about the case when the
 * return value is zero, or we already "locked" this "bitmap extent" by other means.
 *
 * enr is bm-extent number, since we chose to name one sector (512 bytes)
 * worth of the bitmap a "bitmap extent".
 *
 * TODO
 * I think since we use it like a reference count, we should use the real
 * reference count of some bitmap extent element from some lru instead...
 *
 */
int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
{
        struct drbd_bitmap *b = device->bitmap;
        int count, s, e;
        unsigned long flags;
        unsigned long *p_addr, *bm;

        if (!expect(b))
                return 0;
        if (!expect(b->bm_pages))
                return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(device);

        s = S2W(enr);
        e = min((size_t)S2W(enr+1), b->bm_words);
        count = 0;
        if (s < b->bm_words) {
                int n = e-s;
                p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
                bm = p_addr + MLPP(s);
                count += bitmap_weight(bm, n * BITS_PER_LONG);
                bm_unmap(p_addr);
        } else {
                drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
        }
        spin_unlock_irqrestore(&b->bm_lock, flags);
        return count;
}