dm-writecache.c
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2018 Red Hat. All rights reserved.
  4. *
  5. * This file is released under the GPL.
  6. */
  7. #include <linux/device-mapper.h>
  8. #include <linux/module.h>
  9. #include <linux/init.h>
  10. #include <linux/vmalloc.h>
  11. #include <linux/kthread.h>
  12. #include <linux/dm-io.h>
  13. #include <linux/dm-kcopyd.h>
  14. #include <linux/dax.h>
  15. #include <linux/pfn_t.h>
  16. #include <linux/libnvdimm.h>
  17. #define DM_MSG_PREFIX "writecache"
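/*
* Compile-time defaults. HIGH_WATERMARK/LOW_WATERMARK are the default
* free-space watermarks (in percent) that govern background writeback;
* the conversion to block counts (freelist_high_watermark /
* freelist_low_watermark) presumably happens in the constructor, which
* is outside this excerpt. MAX_WRITEBACK_JOBS == 0 means the number of
* in-flight writeback jobs is not limited. AUTOCOMMIT_BLOCKS_* and
* AUTOCOMMIT_MSEC are the default autocommit thresholds, and
* BITMAP_GRANULARITY is the tracking granularity of the dirty-metadata
* bitmap used in SSD mode.
*/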
  18. #define HIGH_WATERMARK 50
  19. #define LOW_WATERMARK 45
  20. #define MAX_WRITEBACK_JOBS 0
  21. #define ENDIO_LATENCY 16
  22. #define WRITEBACK_LATENCY 64
  23. #define AUTOCOMMIT_BLOCKS_SSD 65536
  24. #define AUTOCOMMIT_BLOCKS_PMEM 64
  25. #define AUTOCOMMIT_MSEC 1000
  26. #define BITMAP_GRANULARITY 65536
  27. #if BITMAP_GRANULARITY < PAGE_SIZE
  28. #undef BITMAP_GRANULARITY
  29. #define BITMAP_GRANULARITY PAGE_SIZE
  30. #endif
  31. #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
  32. #define DM_WRITECACHE_HAS_PMEM
  33. #endif
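/*
* pmem_assign() stores a value into the persistent metadata. In pmem
* mode the store goes through memcpy_flushcache() so it leaves the CPU
* cache; without pmem support it is a plain assignment into the
* in-memory metadata copy that ssd_commit_flushed() later writes back.
*/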
  34. #ifdef DM_WRITECACHE_HAS_PMEM
  35. #define pmem_assign(dest, src) \
  36. do { \
  37. typeof(dest) uniq = (src); \
  38. memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
  39. } while (0)
  40. #else
  41. #define pmem_assign(dest, src) ((dest) = (src))
  42. #endif
  43. #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
  44. #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  45. #endif
  46. #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
  47. #define MEMORY_SUPERBLOCK_VERSION 1
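/*
* On-media metadata layout: a struct wc_memory_superblock at the start
* of the cache, followed by one struct wc_memory_entry per cache block.
* Each entry records the origin-device sector the block caches and the
* sequence count at which it was written; a seq_count of -1 marks a
* free entry.
*/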
  48. struct wc_memory_entry {
  49. __le64 original_sector;
  50. __le64 seq_count;
  51. };
  52. struct wc_memory_superblock {
  53. union {
  54. struct {
  55. __le32 magic;
  56. __le32 version;
  57. __le32 block_size;
  58. __le32 pad;
  59. __le64 n_blocks;
  60. __le64 seq_count;
  61. };
  62. __le64 padding[8];
  63. };
  64. struct wc_memory_entry entries[0];
  65. };
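/*
* struct wc_entry is the in-core descriptor of one cache block: rb_node
* keys the block by original sector in wc->tree, lru links it into the
* LRU (or the freelist in SSD mode). On 64-bit the flag and the block
* index are packed into bitfields. When hardware-error handling is
* enabled, original_sector/seq_count shadow the persistent entry and
* are filled in with memcpy_mcsafe() at resume time.
*/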
  66. struct wc_entry {
  67. struct rb_node rb_node;
  68. struct list_head lru;
  69. unsigned short wc_list_contiguous;
  70. bool write_in_progress
  71. #if BITS_PER_LONG == 64
  72. :1
  73. #endif
  74. ;
  75. unsigned long index
  76. #if BITS_PER_LONG == 64
  77. :47
  78. #endif
  79. ;
  80. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  81. uint64_t original_sector;
  82. uint64_t seq_count;
  83. #endif
  84. };
  85. #ifdef DM_WRITECACHE_HAS_PMEM
  86. #define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
  87. #define WC_MODE_FUA(wc) ((wc)->writeback_fua)
  88. #else
  89. #define WC_MODE_PMEM(wc) false
  90. #define WC_MODE_FUA(wc) false
  91. #endif
  92. #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
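/*
* Mode helpers: WC_MODE_PMEM() selects the persistent-memory backend and
* WC_MODE_FUA() makes writeback bios use REQ_FUA instead of a separate
* flush. WC_MODE_SORT_FREELIST() is true only in SSD mode, where free
* entries are kept in an rbtree ordered by address so allocations come
* out in ascending cache-block order (presumably to keep cache-device
* writes roughly sequential); pmem mode uses a plain list.
*/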
  93. struct dm_writecache {
  94. struct mutex lock;
  95. struct list_head lru;
  96. union {
  97. struct list_head freelist;
  98. struct {
  99. struct rb_root freetree;
  100. struct wc_entry *current_free;
  101. };
  102. };
  103. struct rb_root tree;
  104. size_t freelist_size;
  105. size_t writeback_size;
  106. size_t freelist_high_watermark;
  107. size_t freelist_low_watermark;
  108. unsigned uncommitted_blocks;
  109. unsigned autocommit_blocks;
  110. unsigned max_writeback_jobs;
  111. int error;
  112. unsigned long autocommit_jiffies;
  113. struct timer_list autocommit_timer;
  114. struct wait_queue_head freelist_wait;
  115. atomic_t bio_in_progress[2];
  116. struct wait_queue_head bio_in_progress_wait[2];
  117. struct dm_target *ti;
  118. struct dm_dev *dev;
  119. struct dm_dev *ssd_dev;
  120. sector_t start_sector;
  121. void *memory_map;
  122. uint64_t memory_map_size;
  123. size_t metadata_sectors;
  124. size_t n_blocks;
  125. uint64_t seq_count;
  126. void *block_start;
  127. struct wc_entry *entries;
  128. unsigned block_size;
  129. unsigned char block_size_bits;
  130. bool pmem_mode:1;
  131. bool writeback_fua:1;
  132. bool overwrote_committed:1;
  133. bool memory_vmapped:1;
  134. bool high_wm_percent_set:1;
  135. bool low_wm_percent_set:1;
  136. bool max_writeback_jobs_set:1;
  137. bool autocommit_blocks_set:1;
  138. bool autocommit_time_set:1;
  139. bool writeback_fua_set:1;
  140. bool flush_on_suspend:1;
  141. unsigned writeback_all;
  142. struct workqueue_struct *writeback_wq;
  143. struct work_struct writeback_work;
  144. struct work_struct flush_work;
  145. struct dm_io_client *dm_io;
  146. raw_spinlock_t endio_list_lock;
  147. struct list_head endio_list;
  148. struct task_struct *endio_thread;
  149. struct task_struct *flush_thread;
  150. struct bio_list flush_list;
  151. struct dm_kcopyd_client *dm_kcopyd;
  152. unsigned long *dirty_bitmap;
  153. unsigned dirty_bitmap_size;
  154. struct bio_set bio_set;
  155. mempool_t copy_pool;
  156. };
  157. #define WB_LIST_INLINE 16
  158. struct writeback_struct {
  159. struct list_head endio_entry;
  160. struct dm_writecache *wc;
  161. struct wc_entry **wc_list;
  162. unsigned wc_list_n;
  163. unsigned page_offset;
  164. struct page *page;
  165. struct wc_entry *wc_list_inline[WB_LIST_INLINE];
  166. struct bio bio;
  167. };
  168. struct copy_struct {
  169. struct list_head endio_entry;
  170. struct dm_writecache *wc;
  171. struct wc_entry *e;
  172. unsigned n_entries;
  173. int error;
  174. };
  175. DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
  176. "A percentage of time allocated for data copying");
  177. static void wc_lock(struct dm_writecache *wc)
  178. {
  179. mutex_lock(&wc->lock);
  180. }
  181. static void wc_unlock(struct dm_writecache *wc)
  182. {
  183. mutex_unlock(&wc->lock);
  184. }
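/*
* persistent_memory_claim() maps the DAX-capable cache device into the
* kernel address space. It first asks dax_direct_access() for the whole
* range; if the range does not come back in one piece, it collects the
* backing struct pages chunk by chunk and vmap()s them into a single
* virtually contiguous mapping (memory_vmapped). The mapping is then
* advanced past start_sector so memory_map points at the start of the
* writecache metadata.
*/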
  185. #ifdef DM_WRITECACHE_HAS_PMEM
  186. static int persistent_memory_claim(struct dm_writecache *wc)
  187. {
  188. int r;
  189. loff_t s;
  190. long p, da;
  191. pfn_t pfn;
  192. int id;
  193. struct page **pages;
  194. wc->memory_vmapped = false;
  195. if (!wc->ssd_dev->dax_dev) {
  196. r = -EOPNOTSUPP;
  197. goto err1;
  198. }
  199. s = wc->memory_map_size;
  200. p = s >> PAGE_SHIFT;
  201. if (!p) {
  202. r = -EINVAL;
  203. goto err1;
  204. }
  205. if (p != s >> PAGE_SHIFT) {
  206. r = -EOVERFLOW;
  207. goto err1;
  208. }
  209. id = dax_read_lock();
  210. da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
  211. if (da < 0) {
  212. wc->memory_map = NULL;
  213. r = da;
  214. goto err2;
  215. }
  216. if (!pfn_t_has_page(pfn)) {
  217. wc->memory_map = NULL;
  218. r = -EOPNOTSUPP;
  219. goto err2;
  220. }
  221. if (da != p) {
  222. long i;
  223. wc->memory_map = NULL;
  224. pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
  225. if (!pages) {
  226. r = -ENOMEM;
  227. goto err2;
  228. }
  229. i = 0;
  230. do {
  231. long daa;
  232. daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
  233. NULL, &pfn);
  234. if (daa <= 0) {
  235. r = daa ? daa : -EINVAL;
  236. goto err3;
  237. }
  238. if (!pfn_t_has_page(pfn)) {
  239. r = -EOPNOTSUPP;
  240. goto err3;
  241. }
  242. while (daa-- && i < p) {
  243. pages[i++] = pfn_t_to_page(pfn);
  244. pfn.val++;
  245. }
  246. } while (i < p);
  247. wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
  248. if (!wc->memory_map) {
  249. r = -ENOMEM;
  250. goto err3;
  251. }
  252. kvfree(pages);
  253. wc->memory_vmapped = true;
  254. }
  255. dax_read_unlock(id);
  256. wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
  257. wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
  258. return 0;
  259. err3:
  260. kvfree(pages);
  261. err2:
  262. dax_read_unlock(id);
  263. err1:
  264. return r;
  265. }
  266. #else
  267. static int persistent_memory_claim(struct dm_writecache *wc)
  268. {
  269. BUG();
  270. }
  271. #endif
  272. static void persistent_memory_release(struct dm_writecache *wc)
  273. {
  274. if (wc->memory_vmapped)
  275. vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
  276. }
  277. static struct page *persistent_memory_page(void *addr)
  278. {
  279. if (is_vmalloc_addr(addr))
  280. return vmalloc_to_page(addr);
  281. else
  282. return virt_to_page(addr);
  283. }
  284. static unsigned persistent_memory_page_offset(void *addr)
  285. {
  286. return (unsigned long)addr & (PAGE_SIZE - 1);
  287. }
  288. static void persistent_memory_flush_cache(void *ptr, size_t size)
  289. {
  290. if (is_vmalloc_addr(ptr))
  291. flush_kernel_vmap_range(ptr, size);
  292. }
  293. static void persistent_memory_invalidate_cache(void *ptr, size_t size)
  294. {
  295. if (is_vmalloc_addr(ptr))
  296. invalidate_kernel_vmap_range(ptr, size);
  297. }
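/*
* Accessors for the mapped cache: sb() is the superblock at the start of
* memory_map, memory_entry() is the persistent metadata slot backing an
* in-core entry, memory_data() is the block's data payload (pmem mode),
* and cache_sector() is the sector of the block on the cache device,
* past start_sector and the metadata area.
*/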
  298. static struct wc_memory_superblock *sb(struct dm_writecache *wc)
  299. {
  300. return wc->memory_map;
  301. }
  302. static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
  303. {
  304. if (is_power_of_2(sizeof(struct wc_entry)) && 0)
  305. return &sb(wc)->entries[e - wc->entries];
  306. else
  307. return &sb(wc)->entries[e->index];
  308. }
  309. static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
  310. {
  311. return (char *)wc->block_start + (e->index << wc->block_size_bits);
  312. }
  313. static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
  314. {
  315. return wc->start_sector + wc->metadata_sectors +
  316. ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
  317. }
  318. static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
  319. {
  320. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  321. return e->original_sector;
  322. #else
  323. return le64_to_cpu(memory_entry(wc, e)->original_sector);
  324. #endif
  325. }
  326. static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  327. {
  328. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  329. return e->seq_count;
  330. #else
  331. return le64_to_cpu(memory_entry(wc, e)->seq_count);
  332. #endif
  333. }
  334. static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  335. {
  336. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  337. e->seq_count = -1;
  338. #endif
  339. pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
  340. }
  341. static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
  342. uint64_t original_sector, uint64_t seq_count)
  343. {
  344. struct wc_memory_entry me;
  345. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  346. e->original_sector = original_sector;
  347. e->seq_count = seq_count;
  348. #endif
  349. me.original_sector = cpu_to_le64(original_sector);
  350. me.seq_count = cpu_to_le64(seq_count);
  351. pmem_assign(*memory_entry(wc, e), me);
  352. }
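/*
* writecache_error() records only the first error (cmpxchg on wc->error)
* and wakes anybody sleeping on the freelist so stalled writers notice
* the failure; writecache_has_error() is checked on every I/O path.
*/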
  353. #define writecache_error(wc, err, msg, arg...) \
  354. do { \
  355. if (!cmpxchg(&(wc)->error, 0, err)) \
  356. DMERR(msg, ##arg); \
  357. wake_up(&(wc)->freelist_wait); \
  358. } while (0)
  359. #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
  360. static void writecache_flush_all_metadata(struct dm_writecache *wc)
  361. {
  362. if (!WC_MODE_PMEM(wc))
  363. memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
  364. }
  365. static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
  366. {
  367. if (!WC_MODE_PMEM(wc))
  368. __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
  369. wc->dirty_bitmap);
  370. }
  371. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
  372. struct io_notify {
  373. struct dm_writecache *wc;
  374. struct completion c;
  375. atomic_t count;
  376. };
  377. static void writecache_notify_io(unsigned long error, void *context)
  378. {
  379. struct io_notify *endio = context;
  380. if (unlikely(error != 0))
  381. writecache_error(endio->wc, -EIO, "error writing metadata");
  382. BUG_ON(atomic_read(&endio->count) <= 0);
  383. if (atomic_dec_and_test(&endio->count))
  384. complete(&endio->c);
  385. }
  386. static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
  387. {
  388. wait_event(wc->bio_in_progress_wait[direction],
  389. !atomic_read(&wc->bio_in_progress[direction]));
  390. }
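/*
* ssd_commit_flushed() implements "commit" for SSD mode: it walks the
* dirty bitmap, writes every dirty BITMAP_GRANULARITY-sized region of
* the in-memory metadata to the cache device with dm-io, waits for the
* writes (and optionally for in-flight data bios), then issues a flush
* and clears the bitmap. In pmem mode writecache_commit_flushed() only
* needs a write memory barrier.
*/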
  391. static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  392. {
  393. struct dm_io_region region;
  394. struct dm_io_request req;
  395. struct io_notify endio = {
  396. wc,
  397. COMPLETION_INITIALIZER_ONSTACK(endio.c),
  398. ATOMIC_INIT(1),
  399. };
  400. unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
  401. unsigned i = 0;
  402. while (1) {
  403. unsigned j;
  404. i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
  405. if (unlikely(i == bitmap_bits))
  406. break;
  407. j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
  408. region.bdev = wc->ssd_dev->bdev;
  409. region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  410. region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  411. if (unlikely(region.sector >= wc->metadata_sectors))
  412. break;
  413. if (unlikely(region.sector + region.count > wc->metadata_sectors))
  414. region.count = wc->metadata_sectors - region.sector;
  415. region.sector += wc->start_sector;
  416. atomic_inc(&endio.count);
  417. req.bi_op = REQ_OP_WRITE;
  418. req.bi_op_flags = REQ_SYNC;
  419. req.mem.type = DM_IO_VMA;
  420. req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
  421. req.client = wc->dm_io;
  422. req.notify.fn = writecache_notify_io;
  423. req.notify.context = &endio;
  424. /* writing via async dm-io (implied by notify.fn above) won't return an error */
  425. (void) dm_io(&req, 1, &region, NULL);
  426. i = j;
  427. }
  428. writecache_notify_io(0, &endio);
  429. wait_for_completion_io(&endio.c);
  430. if (wait_for_ios)
  431. writecache_wait_for_ios(wc, WRITE);
  432. writecache_disk_flush(wc, wc->ssd_dev);
  433. memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
  434. }
  435. static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  436. {
  437. if (WC_MODE_PMEM(wc))
  438. wmb();
  439. else
  440. ssd_commit_flushed(wc, wait_for_ios);
  441. }
  442. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
  443. {
  444. int r;
  445. struct dm_io_region region;
  446. struct dm_io_request req;
  447. region.bdev = dev->bdev;
  448. region.sector = 0;
  449. region.count = 0;
  450. req.bi_op = REQ_OP_WRITE;
  451. req.bi_op_flags = REQ_PREFLUSH;
  452. req.mem.type = DM_IO_KMEM;
  453. req.mem.ptr.addr = NULL;
  454. req.client = wc->dm_io;
  455. req.notify.fn = NULL;
  456. r = dm_io(&req, 1, &region, NULL);
  457. if (unlikely(r))
  458. writecache_error(wc, r, "error flushing metadata: %d", r);
  459. }
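/*
* writecache_find_entry() looks an original sector up in the rbtree.
* With WFE_RETURN_FOLLOWING a miss returns the entry with the next
* higher original sector instead of NULL. Because the same sector may be
* cached more than once (with different sequence counts), the second
* loop walks sideways among duplicates: WFE_LOWEST_SEQ selects the one
* with the lowest sequence count, otherwise the newest one is returned.
*/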
  460. #define WFE_RETURN_FOLLOWING 1
  461. #define WFE_LOWEST_SEQ 2
  462. static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
  463. uint64_t block, int flags)
  464. {
  465. struct wc_entry *e;
  466. struct rb_node *node = wc->tree.rb_node;
  467. if (unlikely(!node))
  468. return NULL;
  469. while (1) {
  470. e = container_of(node, struct wc_entry, rb_node);
  471. if (read_original_sector(wc, e) == block)
  472. break;
  473. node = (read_original_sector(wc, e) >= block ?
  474. e->rb_node.rb_left : e->rb_node.rb_right);
  475. if (unlikely(!node)) {
  476. if (!(flags & WFE_RETURN_FOLLOWING)) {
  477. return NULL;
  478. }
  479. if (read_original_sector(wc, e) >= block) {
  480. break;
  481. } else {
  482. node = rb_next(&e->rb_node);
  483. if (unlikely(!node)) {
  484. return NULL;
  485. }
  486. e = container_of(node, struct wc_entry, rb_node);
  487. break;
  488. }
  489. }
  490. }
  491. while (1) {
  492. struct wc_entry *e2;
  493. if (flags & WFE_LOWEST_SEQ)
  494. node = rb_prev(&e->rb_node);
  495. else
  496. node = rb_next(&e->rb_node);
  497. if (!node)
  498. return e;
  499. e2 = container_of(node, struct wc_entry, rb_node);
  500. if (read_original_sector(wc, e2) != block)
  501. return e;
  502. e = e2;
  503. }
  504. }
  505. static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
  506. {
  507. struct wc_entry *e;
  508. struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
  509. while (*node) {
  510. e = container_of(*node, struct wc_entry, rb_node);
  511. parent = &e->rb_node;
  512. if (read_original_sector(wc, e) > read_original_sector(wc, ins))
  513. node = &parent->rb_left;
  514. else
  515. node = &parent->rb_right;
  516. }
  517. rb_link_node(&ins->rb_node, parent, node);
  518. rb_insert_color(&ins->rb_node, &wc->tree);
  519. list_add(&ins->lru, &wc->lru);
  520. }
  521. static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
  522. {
  523. list_del(&e->lru);
  524. rb_erase(&e->rb_node, &wc->tree);
  525. }
  526. static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
  527. {
  528. if (WC_MODE_SORT_FREELIST(wc)) {
  529. struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
  530. if (unlikely(!*node))
  531. wc->current_free = e;
  532. while (*node) {
  533. parent = *node;
  534. if (&e->rb_node < *node)
  535. node = &parent->rb_left;
  536. else
  537. node = &parent->rb_right;
  538. }
  539. rb_link_node(&e->rb_node, parent, node);
  540. rb_insert_color(&e->rb_node, &wc->freetree);
  541. } else {
  542. list_add_tail(&e->lru, &wc->freelist);
  543. }
  544. wc->freelist_size++;
  545. }
  546. static inline void writecache_verify_watermark(struct dm_writecache *wc)
  547. {
  548. if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
  549. queue_work(wc->writeback_wq, &wc->writeback_work);
  550. }
  551. static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
  552. {
  553. struct wc_entry *e;
  554. if (WC_MODE_SORT_FREELIST(wc)) {
  555. struct rb_node *next;
  556. if (unlikely(!wc->current_free))
  557. return NULL;
  558. e = wc->current_free;
  559. next = rb_next(&e->rb_node);
  560. rb_erase(&e->rb_node, &wc->freetree);
  561. if (unlikely(!next))
  562. next = rb_first(&wc->freetree);
  563. wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
  564. } else {
  565. if (unlikely(list_empty(&wc->freelist)))
  566. return NULL;
  567. e = container_of(wc->freelist.next, struct wc_entry, lru);
  568. list_del(&e->lru);
  569. }
  570. wc->freelist_size--;
  571. writecache_verify_watermark(wc);
  572. return e;
  573. }
  574. static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
  575. {
  576. writecache_unlink(wc, e);
  577. writecache_add_to_freelist(wc, e);
  578. clear_seq_count(wc, e);
  579. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  580. if (unlikely(waitqueue_active(&wc->freelist_wait)))
  581. wake_up(&wc->freelist_wait);
  582. }
  583. static void writecache_wait_on_freelist(struct dm_writecache *wc)
  584. {
  585. DEFINE_WAIT(wait);
  586. prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
  587. wc_unlock(wc);
  588. io_schedule();
  589. finish_wait(&wc->freelist_wait, &wait);
  590. wc_lock(wc);
  591. }
  592. static void writecache_poison_lists(struct dm_writecache *wc)
  593. {
  594. /*
  595. * Catch incorrect access to these values while the device is suspended.
  596. */
  597. memset(&wc->tree, -1, sizeof wc->tree);
  598. wc->lru.next = LIST_POISON1;
  599. wc->lru.prev = LIST_POISON2;
  600. wc->freelist.next = LIST_POISON1;
  601. wc->freelist.prev = LIST_POISON2;
  602. }
  603. static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
  604. {
  605. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  606. if (WC_MODE_PMEM(wc))
  607. writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
  608. }
  609. static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
  610. {
  611. return read_seq_count(wc, e) < wc->seq_count;
  612. }
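/*
* writecache_flush() commits everything written since the last commit:
* it flushes the metadata (and, in pmem mode, the data) of all
* uncommitted entries, commits them, bumps the superblock seq_count so
* those entries now compare as committed, and finally frees any older
* committed copies of the same blocks that have been superseded.
*/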
  613. static void writecache_flush(struct dm_writecache *wc)
  614. {
  615. struct wc_entry *e, *e2;
  616. bool need_flush_after_free;
  617. wc->uncommitted_blocks = 0;
  618. del_timer(&wc->autocommit_timer);
  619. if (list_empty(&wc->lru))
  620. return;
  621. e = container_of(wc->lru.next, struct wc_entry, lru);
  622. if (writecache_entry_is_committed(wc, e)) {
  623. if (wc->overwrote_committed) {
  624. writecache_wait_for_ios(wc, WRITE);
  625. writecache_disk_flush(wc, wc->ssd_dev);
  626. wc->overwrote_committed = false;
  627. }
  628. return;
  629. }
  630. while (1) {
  631. writecache_flush_entry(wc, e);
  632. if (unlikely(e->lru.next == &wc->lru))
  633. break;
  634. e2 = container_of(e->lru.next, struct wc_entry, lru);
  635. if (writecache_entry_is_committed(wc, e2))
  636. break;
  637. e = e2;
  638. cond_resched();
  639. }
  640. writecache_commit_flushed(wc, true);
  641. wc->seq_count++;
  642. pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
  643. writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
  644. writecache_commit_flushed(wc, false);
  645. wc->overwrote_committed = false;
  646. need_flush_after_free = false;
  647. while (1) {
  648. /* Free another committed entry with lower seq-count */
  649. struct rb_node *rb_node = rb_prev(&e->rb_node);
  650. if (rb_node) {
  651. e2 = container_of(rb_node, struct wc_entry, rb_node);
  652. if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
  653. likely(!e2->write_in_progress)) {
  654. writecache_free_entry(wc, e2);
  655. need_flush_after_free = true;
  656. }
  657. }
  658. if (unlikely(e->lru.prev == &wc->lru))
  659. break;
  660. e = container_of(e->lru.prev, struct wc_entry, lru);
  661. cond_resched();
  662. }
  663. if (need_flush_after_free)
  664. writecache_commit_flushed(wc, false);
  665. }
  666. static void writecache_flush_work(struct work_struct *work)
  667. {
  668. struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
  669. wc_lock(wc);
  670. writecache_flush(wc);
  671. wc_unlock(wc);
  672. }
  673. static void writecache_autocommit_timer(struct timer_list *t)
  674. {
  675. struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
  676. if (!writecache_has_error(wc))
  677. queue_work(wc->writeback_wq, &wc->flush_work);
  678. }
  679. static void writecache_schedule_autocommit(struct dm_writecache *wc)
  680. {
  681. if (!timer_pending(&wc->autocommit_timer))
  682. mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
  683. }
  684. static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
  685. {
  686. struct wc_entry *e;
  687. bool discarded_something = false;
  688. e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
  689. if (unlikely(!e))
  690. return;
  691. while (read_original_sector(wc, e) < end) {
  692. struct rb_node *node = rb_next(&e->rb_node);
  693. if (likely(!e->write_in_progress)) {
  694. if (!discarded_something) {
  695. writecache_wait_for_ios(wc, READ);
  696. writecache_wait_for_ios(wc, WRITE);
  697. discarded_something = true;
  698. }
  699. writecache_free_entry(wc, e);
  700. }
  701. if (!node)
  702. break;
  703. e = container_of(node, struct wc_entry, rb_node);
  704. }
  705. if (discarded_something)
  706. writecache_commit_flushed(wc, false);
  707. }
  708. static bool writecache_wait_for_writeback(struct dm_writecache *wc)
  709. {
  710. if (wc->writeback_size) {
  711. writecache_wait_on_freelist(wc);
  712. return true;
  713. }
  714. return false;
  715. }
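/*
* Suspend commits all outstanding writes and, if a "flush_on_suspend"
* message was received, also forces a full writeback before the target
* is suspended; the in-core lists are then poisoned to catch use while
* suspended.
*/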
  716. static void writecache_suspend(struct dm_target *ti)
  717. {
  718. struct dm_writecache *wc = ti->private;
  719. bool flush_on_suspend;
  720. del_timer_sync(&wc->autocommit_timer);
  721. wc_lock(wc);
  722. writecache_flush(wc);
  723. flush_on_suspend = wc->flush_on_suspend;
  724. if (flush_on_suspend) {
  725. wc->flush_on_suspend = false;
  726. wc->writeback_all++;
  727. queue_work(wc->writeback_wq, &wc->writeback_work);
  728. }
  729. wc_unlock(wc);
  730. drain_workqueue(wc->writeback_wq);
  731. wc_lock(wc);
  732. if (flush_on_suspend)
  733. wc->writeback_all--;
  734. while (writecache_wait_for_writeback(wc));
  735. if (WC_MODE_PMEM(wc))
  736. persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
  737. writecache_poison_lists(wc);
  738. wc_unlock(wc);
  739. }
  740. static int writecache_alloc_entries(struct dm_writecache *wc)
  741. {
  742. size_t b;
  743. if (wc->entries)
  744. return 0;
  745. wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
  746. if (!wc->entries)
  747. return -ENOMEM;
  748. for (b = 0; b < wc->n_blocks; b++) {
  749. struct wc_entry *e = &wc->entries[b];
  750. e->index = b;
  751. e->write_in_progress = false;
  752. }
  753. return 0;
  754. }
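/*
* Resume rebuilds the in-core state from the metadata: every entry is
* classified as free or committed, committed entries are inserted into
* the rbtree/LRU, uncommitted ones are erased, and when two entries
* claim the same original sector the one with the lower sequence count
* is dropped. With hardware-error handling the metadata is read via
* memcpy_mcsafe() so media errors are reported through
* writecache_error() instead of faulting.
*/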
  755. static void writecache_resume(struct dm_target *ti)
  756. {
  757. struct dm_writecache *wc = ti->private;
  758. size_t b;
  759. bool need_flush = false;
  760. __le64 sb_seq_count;
  761. int r;
  762. wc_lock(wc);
  763. if (WC_MODE_PMEM(wc))
  764. persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
  765. wc->tree = RB_ROOT;
  766. INIT_LIST_HEAD(&wc->lru);
  767. if (WC_MODE_SORT_FREELIST(wc)) {
  768. wc->freetree = RB_ROOT;
  769. wc->current_free = NULL;
  770. } else {
  771. INIT_LIST_HEAD(&wc->freelist);
  772. }
  773. wc->freelist_size = 0;
  774. r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
  775. if (r) {
  776. writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
  777. sb_seq_count = cpu_to_le64(0);
  778. }
  779. wc->seq_count = le64_to_cpu(sb_seq_count);
  780. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  781. for (b = 0; b < wc->n_blocks; b++) {
  782. struct wc_entry *e = &wc->entries[b];
  783. struct wc_memory_entry wme;
  784. if (writecache_has_error(wc)) {
  785. e->original_sector = -1;
  786. e->seq_count = -1;
  787. continue;
  788. }
  789. r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  790. if (r) {
  791. writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
  792. (unsigned long)b, r);
  793. e->original_sector = -1;
  794. e->seq_count = -1;
  795. } else {
  796. e->original_sector = le64_to_cpu(wme.original_sector);
  797. e->seq_count = le64_to_cpu(wme.seq_count);
  798. }
  799. }
  800. #endif
  801. for (b = 0; b < wc->n_blocks; b++) {
  802. struct wc_entry *e = &wc->entries[b];
  803. if (!writecache_entry_is_committed(wc, e)) {
  804. if (read_seq_count(wc, e) != -1) {
  805. erase_this:
  806. clear_seq_count(wc, e);
  807. need_flush = true;
  808. }
  809. writecache_add_to_freelist(wc, e);
  810. } else {
  811. struct wc_entry *old;
  812. old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
  813. if (!old) {
  814. writecache_insert_entry(wc, e);
  815. } else {
  816. if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
  817. writecache_error(wc, -EINVAL,
  818. "two identical entries, position %llu, sector %llu, sequence %llu",
  819. (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
  820. (unsigned long long)read_seq_count(wc, e));
  821. }
  822. if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
  823. goto erase_this;
  824. } else {
  825. writecache_free_entry(wc, old);
  826. writecache_insert_entry(wc, e);
  827. need_flush = true;
  828. }
  829. }
  830. }
  831. cond_resched();
  832. }
  833. if (need_flush) {
  834. writecache_flush_all_metadata(wc);
  835. writecache_commit_flushed(wc, false);
  836. }
  837. writecache_verify_watermark(wc);
  838. wc_unlock(wc);
  839. }
  840. static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
  841. {
  842. if (argc != 1)
  843. return -EINVAL;
  844. wc_lock(wc);
  845. if (dm_suspended(wc->ti)) {
  846. wc_unlock(wc);
  847. return -EBUSY;
  848. }
  849. if (writecache_has_error(wc)) {
  850. wc_unlock(wc);
  851. return -EIO;
  852. }
  853. writecache_flush(wc);
  854. wc->writeback_all++;
  855. queue_work(wc->writeback_wq, &wc->writeback_work);
  856. wc_unlock(wc);
  857. flush_workqueue(wc->writeback_wq);
  858. wc_lock(wc);
  859. wc->writeback_all--;
  860. if (writecache_has_error(wc)) {
  861. wc_unlock(wc);
  862. return -EIO;
  863. }
  864. wc_unlock(wc);
  865. return 0;
  866. }
  867. static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
  868. {
  869. if (argc != 1)
  870. return -EINVAL;
  871. wc_lock(wc);
  872. wc->flush_on_suspend = true;
  873. wc_unlock(wc);
  874. return 0;
  875. }
  876. static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
  877. char *result, unsigned maxlen)
  878. {
  879. int r = -EINVAL;
  880. struct dm_writecache *wc = ti->private;
  881. if (!strcasecmp(argv[0], "flush"))
  882. r = process_flush_mesg(argc, argv, wc);
  883. else if (!strcasecmp(argv[0], "flush_on_suspend"))
  884. r = process_flush_on_suspend_mesg(argc, argv, wc);
  885. else
  886. DMERR("unrecognised message received: %s", argv[0]);
  887. return r;
  888. }
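/*
* bio_copy_block() moves one cache block between the bio's pages and
* persistent memory: reads use memcpy_mcsafe() so a media error can be
* turned into BLK_STS_IOERR, writes go through memcpy_flushcache() so
* the data is durable before the block is committed.
*/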
  889. static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
  890. {
  891. void *buf;
  892. unsigned long flags;
  893. unsigned size;
  894. int rw = bio_data_dir(bio);
  895. unsigned remaining_size = wc->block_size;
  896. do {
  897. struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
  898. buf = bvec_kmap_irq(&bv, &flags);
  899. size = bv.bv_len;
  900. if (unlikely(size > remaining_size))
  901. size = remaining_size;
  902. if (rw == READ) {
  903. int r;
  904. r = memcpy_mcsafe(buf, data, size);
  905. flush_dcache_page(bio_page(bio));
  906. if (unlikely(r)) {
  907. writecache_error(wc, r, "hardware memory error when reading data: %d", r);
  908. bio->bi_status = BLK_STS_IOERR;
  909. }
  910. } else {
  911. flush_dcache_page(bio_page(bio));
  912. memcpy_flushcache(data, buf, size);
  913. }
  914. bvec_kunmap_irq(buf, &flags);
  915. data = (char *)data + size;
  916. remaining_size -= size;
  917. bio_advance(bio, size);
  918. } while (unlikely(remaining_size));
  919. }
  920. static int writecache_flush_thread(void *data)
  921. {
  922. struct dm_writecache *wc = data;
  923. while (1) {
  924. struct bio *bio;
  925. wc_lock(wc);
  926. bio = bio_list_pop(&wc->flush_list);
  927. if (!bio) {
  928. set_current_state(TASK_INTERRUPTIBLE);
  929. wc_unlock(wc);
  930. if (unlikely(kthread_should_stop())) {
  931. set_current_state(TASK_RUNNING);
  932. break;
  933. }
  934. schedule();
  935. continue;
  936. }
  937. if (bio_op(bio) == REQ_OP_DISCARD) {
  938. writecache_discard(wc, bio->bi_iter.bi_sector,
  939. bio_end_sector(bio));
  940. wc_unlock(wc);
  941. bio_set_dev(bio, wc->dev->bdev);
  942. generic_make_request(bio);
  943. } else {
  944. writecache_flush(wc);
  945. wc_unlock(wc);
  946. if (writecache_has_error(wc))
  947. bio->bi_status = BLK_STS_IOERR;
  948. bio_endio(bio);
  949. }
  950. }
  951. return 0;
  952. }
  953. static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
  954. {
  955. if (bio_list_empty(&wc->flush_list))
  956. wake_up_process(wc->flush_thread);
  957. bio_list_add(&wc->flush_list, bio);
  958. }
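/*
* writecache_map() is the fast path. Flushes and discards are handled
* inline in pmem mode and handed to the flush thread in SSD mode. Reads
* that hit the cache are copied from pmem or remapped to the cache
* device; misses go to the origin, trimmed at the next cached block.
* Writes allocate (or reuse) a cache entry, then either copy the data
* into pmem or remap the bio to the cache block on the SSD, and schedule
* a commit via the autocommit thresholds or immediately for REQ_FUA.
*/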
  959. static int writecache_map(struct dm_target *ti, struct bio *bio)
  960. {
  961. struct wc_entry *e;
  962. struct dm_writecache *wc = ti->private;
  963. bio->bi_private = NULL;
  964. wc_lock(wc);
  965. if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
  966. if (writecache_has_error(wc))
  967. goto unlock_error;
  968. if (WC_MODE_PMEM(wc)) {
  969. writecache_flush(wc);
  970. if (writecache_has_error(wc))
  971. goto unlock_error;
  972. goto unlock_submit;
  973. } else {
  974. writecache_offload_bio(wc, bio);
  975. goto unlock_return;
  976. }
  977. }
  978. bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
  979. if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
  980. (wc->block_size / 512 - 1)) != 0)) {
  981. DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
  982. (unsigned long long)bio->bi_iter.bi_sector,
  983. bio->bi_iter.bi_size, wc->block_size);
  984. goto unlock_error;
  985. }
  986. if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
  987. if (writecache_has_error(wc))
  988. goto unlock_error;
  989. if (WC_MODE_PMEM(wc)) {
  990. writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
  991. goto unlock_remap_origin;
  992. } else {
  993. writecache_offload_bio(wc, bio);
  994. goto unlock_return;
  995. }
  996. }
  997. if (bio_data_dir(bio) == READ) {
  998. read_next_block:
  999. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
  1000. if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
  1001. if (WC_MODE_PMEM(wc)) {
  1002. bio_copy_block(wc, bio, memory_data(wc, e));
  1003. if (bio->bi_iter.bi_size)
  1004. goto read_next_block;
  1005. goto unlock_submit;
  1006. } else {
  1007. dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
  1008. bio_set_dev(bio, wc->ssd_dev->bdev);
  1009. bio->bi_iter.bi_sector = cache_sector(wc, e);
  1010. if (!writecache_entry_is_committed(wc, e))
  1011. writecache_wait_for_ios(wc, WRITE);
  1012. goto unlock_remap;
  1013. }
  1014. } else {
  1015. if (e) {
  1016. sector_t next_boundary =
  1017. read_original_sector(wc, e) - bio->bi_iter.bi_sector;
  1018. if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
  1019. dm_accept_partial_bio(bio, next_boundary);
  1020. }
  1021. }
  1022. goto unlock_remap_origin;
  1023. }
  1024. } else {
  1025. do {
  1026. if (writecache_has_error(wc))
  1027. goto unlock_error;
  1028. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
  1029. if (e) {
  1030. if (!writecache_entry_is_committed(wc, e))
  1031. goto bio_copy;
  1032. if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
  1033. wc->overwrote_committed = true;
  1034. goto bio_copy;
  1035. }
  1036. }
  1037. e = writecache_pop_from_freelist(wc);
  1038. if (unlikely(!e)) {
  1039. writecache_wait_on_freelist(wc);
  1040. continue;
  1041. }
  1042. write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
  1043. writecache_insert_entry(wc, e);
  1044. wc->uncommitted_blocks++;
  1045. bio_copy:
  1046. if (WC_MODE_PMEM(wc)) {
  1047. bio_copy_block(wc, bio, memory_data(wc, e));
  1048. } else {
  1049. dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
  1050. bio_set_dev(bio, wc->ssd_dev->bdev);
  1051. bio->bi_iter.bi_sector = cache_sector(wc, e);
  1052. if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
  1053. wc->uncommitted_blocks = 0;
  1054. queue_work(wc->writeback_wq, &wc->flush_work);
  1055. } else {
  1056. writecache_schedule_autocommit(wc);
  1057. }
  1058. goto unlock_remap;
  1059. }
  1060. } while (bio->bi_iter.bi_size);
  1061. if (unlikely(bio->bi_opf & REQ_FUA ||
  1062. wc->uncommitted_blocks >= wc->autocommit_blocks))
  1063. writecache_flush(wc);
  1064. else
  1065. writecache_schedule_autocommit(wc);
  1066. goto unlock_submit;
  1067. }
  1068. unlock_remap_origin:
  1069. bio_set_dev(bio, wc->dev->bdev);
  1070. wc_unlock(wc);
  1071. return DM_MAPIO_REMAPPED;
  1072. unlock_remap:
  1073. /* make sure that writecache_end_io decrements bio_in_progress: */
  1074. bio->bi_private = (void *)1;
  1075. atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
  1076. wc_unlock(wc);
  1077. return DM_MAPIO_REMAPPED;
  1078. unlock_submit:
  1079. wc_unlock(wc);
  1080. bio_endio(bio);
  1081. return DM_MAPIO_SUBMITTED;
  1082. unlock_return:
  1083. wc_unlock(wc);
  1084. return DM_MAPIO_SUBMITTED;
  1085. unlock_error:
  1086. wc_unlock(wc);
  1087. bio_io_error(bio);
  1088. return DM_MAPIO_SUBMITTED;
  1089. }
  1090. static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
  1091. {
  1092. struct dm_writecache *wc = ti->private;
  1093. if (bio->bi_private != NULL) {
  1094. int dir = bio_data_dir(bio);
  1095. if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
  1096. if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
  1097. wake_up(&wc->bio_in_progress_wait[dir]);
  1098. }
  1099. return 0;
  1100. }
  1101. static int writecache_iterate_devices(struct dm_target *ti,
  1102. iterate_devices_callout_fn fn, void *data)
  1103. {
  1104. struct dm_writecache *wc = ti->private;
  1105. return fn(ti, wc->dev, 0, ti->len, data);
  1106. }
  1107. static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
  1108. {
  1109. struct dm_writecache *wc = ti->private;
  1110. if (limits->logical_block_size < wc->block_size)
  1111. limits->logical_block_size = wc->block_size;
  1112. if (limits->physical_block_size < wc->block_size)
  1113. limits->physical_block_size = wc->block_size;
  1114. if (limits->io_min < wc->block_size)
  1115. limits->io_min = wc->block_size;
  1116. }
  1117. static void writecache_writeback_endio(struct bio *bio)
  1118. {
  1119. struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
  1120. struct dm_writecache *wc = wb->wc;
  1121. unsigned long flags;
  1122. raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
  1123. if (unlikely(list_empty(&wc->endio_list)))
  1124. wake_up_process(wc->endio_thread);
  1125. list_add_tail(&wb->endio_entry, &wc->endio_list);
  1126. raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
  1127. }
  1128. static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
  1129. {
  1130. struct copy_struct *c = ptr;
  1131. struct dm_writecache *wc = c->wc;
  1132. c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
  1133. raw_spin_lock_irq(&wc->endio_list_lock);
  1134. if (unlikely(list_empty(&wc->endio_list)))
  1135. wake_up_process(wc->endio_thread);
  1136. list_add_tail(&c->endio_entry, &wc->endio_list);
  1137. raw_spin_unlock_irq(&wc->endio_list_lock);
  1138. }
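/*
* Writeback completions are queued on endio_list (from bio or kcopyd
* context) and drained by writecache_endio_thread(): unless FUA
* writeback is used it flushes the origin device first, then marks the
* written-back entries free again and commits the freed metadata.
*/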
  1139. static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
  1140. {
  1141. unsigned i;
  1142. struct writeback_struct *wb;
  1143. struct wc_entry *e;
  1144. unsigned long n_walked = 0;
  1145. do {
  1146. wb = list_entry(list->next, struct writeback_struct, endio_entry);
  1147. list_del(&wb->endio_entry);
  1148. if (unlikely(wb->bio.bi_status != BLK_STS_OK))
  1149. writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
  1150. "write error %d", wb->bio.bi_status);
  1151. i = 0;
  1152. do {
  1153. e = wb->wc_list[i];
  1154. BUG_ON(!e->write_in_progress);
  1155. e->write_in_progress = false;
  1156. INIT_LIST_HEAD(&e->lru);
  1157. if (!writecache_has_error(wc))
  1158. writecache_free_entry(wc, e);
  1159. BUG_ON(!wc->writeback_size);
  1160. wc->writeback_size--;
  1161. n_walked++;
  1162. if (unlikely(n_walked >= ENDIO_LATENCY)) {
  1163. writecache_commit_flushed(wc, false);
  1164. wc_unlock(wc);
  1165. wc_lock(wc);
  1166. n_walked = 0;
  1167. }
  1168. } while (++i < wb->wc_list_n);
  1169. if (wb->wc_list != wb->wc_list_inline)
  1170. kfree(wb->wc_list);
  1171. bio_put(&wb->bio);
  1172. } while (!list_empty(list));
  1173. }
  1174. static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
  1175. {
  1176. struct copy_struct *c;
  1177. struct wc_entry *e;
  1178. do {
  1179. c = list_entry(list->next, struct copy_struct, endio_entry);
  1180. list_del(&c->endio_entry);
  1181. if (unlikely(c->error))
  1182. writecache_error(wc, c->error, "copy error");
  1183. e = c->e;
  1184. do {
  1185. BUG_ON(!e->write_in_progress);
  1186. e->write_in_progress = false;
  1187. INIT_LIST_HEAD(&e->lru);
  1188. if (!writecache_has_error(wc))
  1189. writecache_free_entry(wc, e);
  1190. BUG_ON(!wc->writeback_size);
  1191. wc->writeback_size--;
  1192. e++;
  1193. } while (--c->n_entries);
  1194. mempool_free(c, &wc->copy_pool);
  1195. } while (!list_empty(list));
  1196. }
  1197. static int writecache_endio_thread(void *data)
  1198. {
  1199. struct dm_writecache *wc = data;
  1200. while (1) {
  1201. struct list_head list;
  1202. raw_spin_lock_irq(&wc->endio_list_lock);
  1203. if (!list_empty(&wc->endio_list))
  1204. goto pop_from_list;
  1205. set_current_state(TASK_INTERRUPTIBLE);
  1206. raw_spin_unlock_irq(&wc->endio_list_lock);
  1207. if (unlikely(kthread_should_stop())) {
  1208. set_current_state(TASK_RUNNING);
  1209. break;
  1210. }
  1211. schedule();
  1212. continue;
  1213. pop_from_list:
  1214. list = wc->endio_list;
  1215. list.next->prev = list.prev->next = &list;
  1216. INIT_LIST_HEAD(&wc->endio_list);
  1217. raw_spin_unlock_irq(&wc->endio_list_lock);
  1218. if (!WC_MODE_FUA(wc))
  1219. writecache_disk_flush(wc, wc->dev);
  1220. wc_lock(wc);
  1221. if (WC_MODE_PMEM(wc)) {
  1222. __writecache_endio_pmem(wc, &list);
  1223. } else {
  1224. __writecache_endio_ssd(wc, &list);
  1225. writecache_wait_for_ios(wc, READ);
  1226. }
  1227. writecache_commit_flushed(wc, false);
  1228. wc_unlock(wc);
  1229. }
  1230. return 0;
  1231. }
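/*
* Writeback submission is mode specific: in pmem mode wc_add_block()
* adds the cache page itself to a writeback bio (struct
* writeback_struct), so data is written to the origin straight from
* persistent memory; in SSD mode dm-kcopyd copies the blocks from the
* cache device to the origin (struct copy_struct).
*/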
  1232. static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
  1233. {
  1234. struct dm_writecache *wc = wb->wc;
  1235. unsigned block_size = wc->block_size;
  1236. void *address = memory_data(wc, e);
  1237. persistent_memory_flush_cache(address, block_size);
  1238. return bio_add_page(&wb->bio, persistent_memory_page(address),
  1239. block_size, persistent_memory_page_offset(address)) != 0;
  1240. }
  1241. struct writeback_list {
  1242. struct list_head list;
  1243. size_t size;
  1244. };
  1245. static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
  1246. {
  1247. if (unlikely(wc->max_writeback_jobs)) {
  1248. if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
  1249. wc_lock(wc);
  1250. while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
  1251. writecache_wait_on_freelist(wc);
  1252. wc_unlock(wc);
  1253. }
  1254. }
  1255. cond_resched();
  1256. }
  1257. static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
  1258. {
  1259. struct wc_entry *e, *f;
  1260. struct bio *bio;
  1261. struct writeback_struct *wb;
  1262. unsigned max_pages;
  1263. while (wbl->size) {
  1264. wbl->size--;
  1265. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1266. list_del(&e->lru);
  1267. max_pages = e->wc_list_contiguous;
  1268. bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
  1269. wb = container_of(bio, struct writeback_struct, bio);
  1270. wb->wc = wc;
  1271. wb->bio.bi_end_io = writecache_writeback_endio;
  1272. bio_set_dev(&wb->bio, wc->dev->bdev);
  1273. wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
  1274. wb->page_offset = PAGE_SIZE;
  1275. if (max_pages <= WB_LIST_INLINE ||
  1276. unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
  1277. GFP_NOIO | __GFP_NORETRY |
  1278. __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
  1279. wb->wc_list = wb->wc_list_inline;
  1280. max_pages = WB_LIST_INLINE;
  1281. }
  1282. BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
  1283. wb->wc_list[0] = e;
  1284. wb->wc_list_n = 1;
  1285. while (wbl->size && wb->wc_list_n < max_pages) {
  1286. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1287. if (read_original_sector(wc, f) !=
  1288. read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
  1289. break;
  1290. if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
  1291. break;
  1292. wbl->size--;
  1293. list_del(&f->lru);
  1294. wb->wc_list[wb->wc_list_n++] = f;
  1295. e = f;
  1296. }
  1297. bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
  1298. if (writecache_has_error(wc)) {
  1299. bio->bi_status = BLK_STS_IOERR;
  1300. bio_endio(&wb->bio);
  1301. } else {
  1302. submit_bio(&wb->bio);
  1303. }
  1304. __writeback_throttle(wc, wbl);
  1305. }
  1306. }
  1307. static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
  1308. {
  1309. struct wc_entry *e, *f;
  1310. struct dm_io_region from, to;
  1311. struct copy_struct *c;
  1312. while (wbl->size) {
  1313. unsigned n_sectors;
  1314. wbl->size--;
  1315. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1316. list_del(&e->lru);
  1317. n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
  1318. from.bdev = wc->ssd_dev->bdev;
  1319. from.sector = cache_sector(wc, e);
  1320. from.count = n_sectors;
  1321. to.bdev = wc->dev->bdev;
  1322. to.sector = read_original_sector(wc, e);
  1323. to.count = n_sectors;
  1324. c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
  1325. c->wc = wc;
  1326. c->e = e;
  1327. c->n_entries = e->wc_list_contiguous;
  1328. while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
  1329. wbl->size--;
  1330. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1331. BUG_ON(f != e + 1);
  1332. list_del(&f->lru);
  1333. e = f;
  1334. }
  1335. dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
  1336. __writeback_throttle(wc, wbl);
  1337. }
  1338. }
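/*
* writecache_writeback() scans the LRU from the oldest entry, skipping
* blocks whose older copy is still being written back, and batches
* entries whose original sectors are contiguous (up to BIO_MAX_PAGES,
* and only physically adjacent cache blocks in SSD mode) so they can be
* written back as one bio or one kcopyd job.
*/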
  1339. static void writecache_writeback(struct work_struct *work)
  1340. {
  1341. struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
  1342. struct blk_plug plug;
  1343. struct wc_entry *e, *f, *g;
  1344. struct rb_node *node, *next_node;
  1345. struct list_head skipped;
  1346. struct writeback_list wbl;
  1347. unsigned long n_walked;
  1348. wc_lock(wc);
  1349. restart:
  1350. if (writecache_has_error(wc)) {
  1351. wc_unlock(wc);
  1352. return;
  1353. }
  1354. if (unlikely(wc->writeback_all)) {
  1355. if (writecache_wait_for_writeback(wc))
  1356. goto restart;
  1357. }
  1358. if (wc->overwrote_committed) {
  1359. writecache_wait_for_ios(wc, WRITE);
  1360. }
  1361. n_walked = 0;
  1362. INIT_LIST_HEAD(&skipped);
  1363. INIT_LIST_HEAD(&wbl.list);
  1364. wbl.size = 0;
  1365. while (!list_empty(&wc->lru) &&
  1366. (wc->writeback_all ||
  1367. wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
  1368. n_walked++;
  1369. if (unlikely(n_walked > WRITEBACK_LATENCY) &&
  1370. likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
  1371. queue_work(wc->writeback_wq, &wc->writeback_work);
  1372. break;
  1373. }
  1374. e = container_of(wc->lru.prev, struct wc_entry, lru);
  1375. BUG_ON(e->write_in_progress);
  1376. if (unlikely(!writecache_entry_is_committed(wc, e))) {
  1377. writecache_flush(wc);
  1378. }
  1379. node = rb_prev(&e->rb_node);
  1380. if (node) {
  1381. f = container_of(node, struct wc_entry, rb_node);
  1382. if (unlikely(read_original_sector(wc, f) ==
  1383. read_original_sector(wc, e))) {
  1384. BUG_ON(!f->write_in_progress);
  1385. list_del(&e->lru);
  1386. list_add(&e->lru, &skipped);
  1387. cond_resched();
  1388. continue;
  1389. }
  1390. }
  1391. wc->writeback_size++;
  1392. list_del(&e->lru);
  1393. list_add(&e->lru, &wbl.list);
  1394. wbl.size++;
  1395. e->write_in_progress = true;
  1396. e->wc_list_contiguous = 1;
  1397. f = e;
		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, g) ==
			    read_original_sector(wc, f)) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_del(&g->lru);
			list_add(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_PAGES;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
				break;
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't make any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc));
		wc_unlock(wc);
	}
}

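/*
 * Given the cache device size and block size, work out how many cache blocks
 * fit once the on-device metadata (superblock plus one wc_memory_entry per
 * block, rounded up to a block boundary) is accounted for.  Returns the
 * number of data blocks and the number of metadata blocks.
 */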
static int calculate_memory_size(uint64_t device_size, unsigned block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}

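/*
 * Format a brand new cache: write an empty superblock, mark every entry as
 * free (original sector and seq_count set to -1), flush all of that, and only
 * then set the magic number, so a partially initialized device is never
 * mistaken for a valid cache.
 */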
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++)
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
	writecache_commit_flushed(wc, false);

	return 0;
}

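/*
 * Destructor.  Also used as the error-unwind path of the constructor, so
 * every resource is checked before being released.
 */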
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	if (wc->entries)
		vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	if (wc->dirty_bitmap)
		vfree(wc->dirty_bitmap);

	kfree(wc);
}

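/*
 * Constructor.  Table line format:
 *   writecache <p|s> <origin dev> <cache dev> <block size>
 *              <#optional args> [start_sector n] [high_watermark %]
 *              [low_watermark %] [writeback_jobs n] [autocommit_blocks n]
 *              [autocommit_time ms] [fua|nofua]
 */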
static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 10, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}
	wake_up_process(wc->endio_thread);

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned autocommit_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
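		/*
		 * ssd mode: keep a vmalloc'ed copy of the metadata in memory,
		 * populated by reading the first metadata_sectors of the cache
		 * device, and track modified regions in a dirty bitmap so that
		 * commits only write back what changed.
		 */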
		struct dm_io_region region;
		struct dm_io_request req;
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}
		wake_up_process(wc->flush_thread);

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is a limitation of the test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		region.bdev = wc->ssd_dev->bdev;
		region.sector = wc->start_sector;
		region.count = wc->metadata_sectors;
		req.bi_op = REQ_OP_READ;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map;
		req.client = wc->dm_io;
		req.notify.fn = NULL;

		r = dm_io(&req, 1, &region, NULL);
		if (r) {
			ti->error = "Unable to read metadata";
			goto bad;
		}
	}

	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}

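/*
 * Report status: STATUSTYPE_INFO gives the error state plus the total, free
 * and under-writeback block counts; STATUSTYPE_TABLE reconstructs the table
 * line, listing only the optional arguments that were explicitly set.
 */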
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result, unsigned maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned extra_args;
	unsigned sz = 0;
	uint64_t x;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->writeback_fua_set)
			extra_args++;

		DMEMIT("%u", extra_args);
		if (wc->start_sector)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set) {
			x = (uint64_t)wc->freelist_high_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
		}
		if (wc->low_wm_percent_set) {
			x = (uint64_t)wc->freelist_low_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
		}
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		break;
	}
}

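/* device-mapper target registration table */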
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 1, 1},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};

static int __init dm_writecache_init(void)
{
	int r;

	r = dm_register_target(&writecache_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	return 0;
}

static void __exit dm_writecache_exit(void)
{
	dm_unregister_target(&writecache_target);
}

module_init(dm_writecache_init);
module_exit(dm_writecache_exit);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");