iomap.c

  1. /*
  2. * Copyright (C) 2010 Red Hat, Inc.
  3. * Copyright (c) 2016-2018 Christoph Hellwig.
  4. *
  5. * This program is free software; you can redistribute it and/or modify it
  6. * under the terms and conditions of the GNU General Public License,
  7. * version 2, as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope it will be useful, but WITHOUT
  10. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12. * more details.
  13. */
  14. #include <linux/module.h>
  15. #include <linux/compiler.h>
  16. #include <linux/fs.h>
  17. #include <linux/iomap.h>
  18. #include <linux/uaccess.h>
  19. #include <linux/gfp.h>
  20. #include <linux/migrate.h>
  21. #include <linux/mm.h>
  22. #include <linux/mm_inline.h>
  23. #include <linux/swap.h>
  24. #include <linux/pagemap.h>
  25. #include <linux/pagevec.h>
  26. #include <linux/file.h>
  27. #include <linux/uio.h>
  28. #include <linux/backing-dev.h>
  29. #include <linux/buffer_head.h>
  30. #include <linux/task_io_accounting_ops.h>
  31. #include <linux/dax.h>
  32. #include <linux/sched/signal.h>
  33. #include <linux/swap.h>
  34. #include "internal.h"
  35. /*
  36. * Execute an iomap write on a segment of the mapping that spans a
  37. * contiguous range of pages that have identical block mapping state.
  38. *
  39. * This avoids the need to map pages individually and do individual allocations
  40. * for each page, and most importantly avoids the need for filesystem-specific
  41. * locking per page. Instead, all the operations are amortised over the entire
  42. * range of pages. It is assumed that the filesystems will lock whatever
  43. * resources they require in the iomap_begin call, and release them in the
  44. * iomap_end call.
  45. */
  46. loff_t
  47. iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
  48. const struct iomap_ops *ops, void *data, iomap_actor_t actor)
  49. {
  50. struct iomap iomap = { 0 };
  51. loff_t written = 0, ret;
  52. /*
  53. * Need to map a range from start position for length bytes. This can
  54. * span multiple pages - it is only guaranteed to return a range of a
  55. * single type of pages (e.g. all into a hole, all mapped or all
  56. * unwritten). Failure at this point has nothing to undo.
  57. *
  58. * If allocation is required for this range, reserve the space now so
  59. * that the allocation is guaranteed to succeed later on. Once we copy
  60. * the data into the page cache pages, then we cannot fail otherwise we
  61. * expose transient stale data. If the reserve fails, we can safely
  62. * back out at this point as there is nothing to undo.
  63. */
  64. ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
  65. if (ret)
  66. return ret;
  67. if (WARN_ON(iomap.offset > pos))
  68. return -EIO;
  69. if (WARN_ON(iomap.length == 0))
  70. return -EIO;
  71. /*
  72. * Cut down the length to the one actually provided by the filesystem,
  73. * as it might not be able to give us the whole size that we requested.
  74. */
  75. if (iomap.offset + iomap.length < pos + length)
  76. length = iomap.offset + iomap.length - pos;
  77. /*
  78. * Now that we have guaranteed that the space allocation will succeed,
  79. * we can do the copy-in page by page without having to worry about
  80. * failures exposing transient data.
  81. */
  82. written = actor(inode, pos, length, data, &iomap);
  83. /*
  84. * Now the data has been copied, commit the range we've copied. This
  85. * should not fail unless the filesystem has had a fatal error.
  86. */
  87. if (ops->iomap_end) {
  88. ret = ops->iomap_end(inode, pos, length,
  89. written > 0 ? written : 0,
  90. flags, &iomap);
  91. }
  92. return written ? written : ret;
  93. }
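/*
 * Illustrative sketch (not part of this file): callers typically drive
 * iomap_apply() in a loop, advancing by the byte count it returns, just as
 * the exported helpers further down do:
 *
 *	while (length > 0) {
 *		ret = iomap_apply(inode, pos, length, flags, ops, data, actor);
 *		if (ret <= 0)
 *			break;
 *		pos += ret;
 *		length -= ret;
 *	}
 */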
  94. static sector_t
  95. iomap_sector(struct iomap *iomap, loff_t pos)
  96. {
  97. return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
  98. }
  99. static struct iomap_page *
  100. iomap_page_create(struct inode *inode, struct page *page)
  101. {
  102. struct iomap_page *iop = to_iomap_page(page);
  103. if (iop || i_blocksize(inode) == PAGE_SIZE)
  104. return iop;
  105. iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
  106. atomic_set(&iop->read_count, 0);
  107. atomic_set(&iop->write_count, 0);
  108. bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
  109. /*
  110. * migrate_page_move_mapping() assumes that pages with private data have
  111. * their count elevated by 1.
  112. */
  113. get_page(page);
  114. set_page_private(page, (unsigned long)iop);
  115. SetPagePrivate(page);
  116. return iop;
  117. }
  118. static void
  119. iomap_page_release(struct page *page)
  120. {
  121. struct iomap_page *iop = to_iomap_page(page);
  122. if (!iop)
  123. return;
  124. WARN_ON_ONCE(atomic_read(&iop->read_count));
  125. WARN_ON_ONCE(atomic_read(&iop->write_count));
  126. ClearPagePrivate(page);
  127. set_page_private(page, 0);
  128. put_page(page);
  129. kfree(iop);
  130. }
  131. /*
  132. * Calculate the range inside the page that we actually need to read.
  133. */
  134. static void
  135. iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
  136. loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
  137. {
  138. loff_t orig_pos = *pos;
  139. loff_t isize = i_size_read(inode);
  140. unsigned block_bits = inode->i_blkbits;
  141. unsigned block_size = (1 << block_bits);
  142. unsigned poff = offset_in_page(*pos);
  143. unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
  144. unsigned first = poff >> block_bits;
  145. unsigned last = (poff + plen - 1) >> block_bits;
  146. /*
  147. * If the block size is smaller than the page size we need to check the
  148. * per-block uptodate status and adjust the offset and length if needed
  149. * to avoid reading in already uptodate ranges.
  150. */
  151. if (iop) {
  152. unsigned int i;
  153. /* move forward for each leading block marked uptodate */
  154. for (i = first; i <= last; i++) {
  155. if (!test_bit(i, iop->uptodate))
  156. break;
  157. *pos += block_size;
  158. poff += block_size;
  159. plen -= block_size;
  160. first++;
  161. }
  162. /* truncate len if we find any trailing uptodate block(s) */
  163. for ( ; i <= last; i++) {
  164. if (test_bit(i, iop->uptodate)) {
  165. plen -= (last - i + 1) * block_size;
  166. last = i - 1;
  167. break;
  168. }
  169. }
  170. }
  171. /*
  172. * If the extent spans the block that contains the i_size we need to
  173. * handle both halves separately so that we properly zero data in the
  174. * page cache for blocks that are entirely outside of i_size.
  175. */
  176. if (orig_pos <= isize && orig_pos + length > isize) {
  177. unsigned end = offset_in_page(isize - 1) >> block_bits;
  178. if (first <= end && last > end)
  179. plen -= (last - end) * block_size;
  180. }
  181. *offp = poff;
  182. *lenp = plen;
  183. }
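/*
 * Worked example (illustrative): with a 4096-byte page, 1024-byte blocks and
 * a request spanning the whole page, if blocks 0 and 3 are already uptodate
 * the leading loop above advances past block 0 and the trailing loop trims
 * block 3, leaving *offp = 1024 and *lenp = 2048 so that only blocks 1 and 2
 * are read.
 */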
  184. static void
  185. iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
  186. {
  187. struct iomap_page *iop = to_iomap_page(page);
  188. struct inode *inode = page->mapping->host;
  189. unsigned first = off >> inode->i_blkbits;
  190. unsigned last = (off + len - 1) >> inode->i_blkbits;
  191. unsigned int i;
  192. bool uptodate = true;
  193. if (iop) {
  194. for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
  195. if (i >= first && i <= last)
  196. set_bit(i, iop->uptodate);
  197. else if (!test_bit(i, iop->uptodate))
  198. uptodate = false;
  199. }
  200. }
  201. if (uptodate && !PageError(page))
  202. SetPageUptodate(page);
  203. }
  204. static void
  205. iomap_read_finish(struct iomap_page *iop, struct page *page)
  206. {
  207. if (!iop || atomic_dec_and_test(&iop->read_count))
  208. unlock_page(page);
  209. }
  210. static void
  211. iomap_read_page_end_io(struct bio_vec *bvec, int error)
  212. {
  213. struct page *page = bvec->bv_page;
  214. struct iomap_page *iop = to_iomap_page(page);
  215. if (unlikely(error)) {
  216. ClearPageUptodate(page);
  217. SetPageError(page);
  218. } else {
  219. iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
  220. }
  221. iomap_read_finish(iop, page);
  222. }
  223. static void
  224. iomap_read_inline_data(struct inode *inode, struct page *page,
  225. struct iomap *iomap)
  226. {
  227. size_t size = i_size_read(inode);
  228. void *addr;
  229. if (PageUptodate(page))
  230. return;
  231. BUG_ON(page->index);
  232. BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
  233. addr = kmap_atomic(page);
  234. memcpy(addr, iomap->inline_data, size);
  235. memset(addr + size, 0, PAGE_SIZE - size);
  236. kunmap_atomic(addr);
  237. SetPageUptodate(page);
  238. }
  239. static void
  240. iomap_read_end_io(struct bio *bio)
  241. {
  242. int error = blk_status_to_errno(bio->bi_status);
  243. struct bio_vec *bvec;
  244. int i;
  245. bio_for_each_segment_all(bvec, bio, i)
  246. iomap_read_page_end_io(bvec, error);
  247. bio_put(bio);
  248. }
  249. struct iomap_readpage_ctx {
  250. struct page *cur_page;
  251. bool cur_page_in_bio;
  252. bool is_readahead;
  253. struct bio *bio;
  254. struct list_head *pages;
  255. };
  256. static loff_t
  257. iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  258. struct iomap *iomap)
  259. {
  260. struct iomap_readpage_ctx *ctx = data;
  261. struct page *page = ctx->cur_page;
  262. struct iomap_page *iop = iomap_page_create(inode, page);
  263. bool is_contig = false;
  264. loff_t orig_pos = pos;
  265. unsigned poff, plen;
  266. sector_t sector;
  267. if (iomap->type == IOMAP_INLINE) {
  268. WARN_ON_ONCE(pos);
  269. iomap_read_inline_data(inode, page, iomap);
  270. return PAGE_SIZE;
  271. }
  272. /* zero post-eof blocks as the page may be mapped */
  273. iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
  274. if (plen == 0)
  275. goto done;
  276. if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
  277. zero_user(page, poff, plen);
  278. iomap_set_range_uptodate(page, poff, plen);
  279. goto done;
  280. }
  281. ctx->cur_page_in_bio = true;
  282. /*
  283. * Try to merge into a previous segment if we can.
  284. */
  285. sector = iomap_sector(iomap, pos);
  286. if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
  287. if (__bio_try_merge_page(ctx->bio, page, plen, poff))
  288. goto done;
  289. is_contig = true;
  290. }
  291. /*
  292. * If we start a new segment we need to increase the read count, and we
  293. * need to do so before submitting any previous full bio to make sure
  294. * that we don't prematurely unlock the page.
  295. */
  296. if (iop)
  297. atomic_inc(&iop->read_count);
  298. if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
  299. gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
  300. int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
  301. if (ctx->bio)
  302. submit_bio(ctx->bio);
  303. if (ctx->is_readahead) /* same as readahead_gfp_mask */
  304. gfp |= __GFP_NORETRY | __GFP_NOWARN;
  305. ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
  306. ctx->bio->bi_opf = REQ_OP_READ;
  307. if (ctx->is_readahead)
  308. ctx->bio->bi_opf |= REQ_RAHEAD;
  309. ctx->bio->bi_iter.bi_sector = sector;
  310. bio_set_dev(ctx->bio, iomap->bdev);
  311. ctx->bio->bi_end_io = iomap_read_end_io;
  312. }
  313. __bio_add_page(ctx->bio, page, plen, poff);
  314. done:
  315. /*
  316. * Move the caller beyond our range so that it keeps making progress.
  317. * For that we have to include any leading non-uptodate ranges, but
  318. * we can skip trailing ones as they will be handled in the next
  319. * iteration.
  320. */
  321. return pos - orig_pos + plen;
  322. }
  323. int
  324. iomap_readpage(struct page *page, const struct iomap_ops *ops)
  325. {
  326. struct iomap_readpage_ctx ctx = { .cur_page = page };
  327. struct inode *inode = page->mapping->host;
  328. unsigned poff;
  329. loff_t ret;
  330. for (poff = 0; poff < PAGE_SIZE; poff += ret) {
  331. ret = iomap_apply(inode, page_offset(page) + poff,
  332. PAGE_SIZE - poff, 0, ops, &ctx,
  333. iomap_readpage_actor);
  334. if (ret <= 0) {
  335. WARN_ON_ONCE(ret == 0);
  336. SetPageError(page);
  337. break;
  338. }
  339. }
  340. if (ctx.bio) {
  341. submit_bio(ctx.bio);
  342. WARN_ON_ONCE(!ctx.cur_page_in_bio);
  343. } else {
  344. WARN_ON_ONCE(ctx.cur_page_in_bio);
  345. unlock_page(page);
  346. }
  347. /*
  348. * Just like mpage_readpages and block_read_full_page we always
  349. * return 0 and just mark the page as PageError on errors. This
  350. * should be cleaned up all through the stack eventually.
  351. */
  352. return 0;
  353. }
  354. EXPORT_SYMBOL_GPL(iomap_readpage);
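/*
 * Sketch (assumed usage, not defined in this file): a filesystem typically
 * wires this helper into its address_space_operations, e.g.:
 *
 *	static int example_readpage(struct file *unused, struct page *page)
 *	{
 *		return iomap_readpage(page, &example_iomap_ops);
 *	}
 *
 * where example_iomap_ops is that filesystem's iomap_ops instance.
 */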
  355. static struct page *
  356. iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
  357. loff_t length, loff_t *done)
  358. {
  359. while (!list_empty(pages)) {
  360. struct page *page = lru_to_page(pages);
  361. if (page_offset(page) >= (u64)pos + length)
  362. break;
  363. list_del(&page->lru);
  364. if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
  365. GFP_NOFS))
  366. return page;
  367. /*
  368. * If we already have a page in the page cache at index we are
  369. * done. Upper layers don't care if it is uptodate after the
  370. * readpages call itself as every page gets checked again once
  371. * actually needed.
  372. */
  373. *done += PAGE_SIZE;
  374. put_page(page);
  375. }
  376. return NULL;
  377. }
  378. static loff_t
  379. iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
  380. void *data, struct iomap *iomap)
  381. {
  382. struct iomap_readpage_ctx *ctx = data;
  383. loff_t done, ret;
  384. for (done = 0; done < length; done += ret) {
  385. if (ctx->cur_page && offset_in_page(pos + done) == 0) {
  386. if (!ctx->cur_page_in_bio)
  387. unlock_page(ctx->cur_page);
  388. put_page(ctx->cur_page);
  389. ctx->cur_page = NULL;
  390. }
  391. if (!ctx->cur_page) {
  392. ctx->cur_page = iomap_next_page(inode, ctx->pages,
  393. pos, length, &done);
  394. if (!ctx->cur_page)
  395. break;
  396. ctx->cur_page_in_bio = false;
  397. }
  398. ret = iomap_readpage_actor(inode, pos + done, length - done,
  399. ctx, iomap);
  400. }
  401. return done;
  402. }
  403. int
  404. iomap_readpages(struct address_space *mapping, struct list_head *pages,
  405. unsigned nr_pages, const struct iomap_ops *ops)
  406. {
  407. struct iomap_readpage_ctx ctx = {
  408. .pages = pages,
  409. .is_readahead = true,
  410. };
  411. loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
  412. loff_t last = page_offset(list_entry(pages->next, struct page, lru));
  413. loff_t length = last - pos + PAGE_SIZE, ret = 0;
  414. while (length > 0) {
  415. ret = iomap_apply(mapping->host, pos, length, 0, ops,
  416. &ctx, iomap_readpages_actor);
  417. if (ret <= 0) {
  418. WARN_ON_ONCE(ret == 0);
  419. goto done;
  420. }
  421. pos += ret;
  422. length -= ret;
  423. }
  424. ret = 0;
  425. done:
  426. if (ctx.bio)
  427. submit_bio(ctx.bio);
  428. if (ctx.cur_page) {
  429. if (!ctx.cur_page_in_bio)
  430. unlock_page(ctx.cur_page);
  431. put_page(ctx.cur_page);
  432. }
  433. /*
  434. * Check that we didn't lose a page due to the arcane calling
  435. * conventions.
  436. */
  437. WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
  438. return ret;
  439. }
  440. EXPORT_SYMBOL_GPL(iomap_readpages);
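/*
 * Sketch (assumed usage, not defined in this file): the matching ->readpages
 * hook is a thin wrapper as well, e.g.:
 *
 *	static int example_readpages(struct file *unused,
 *			struct address_space *mapping, struct list_head *pages,
 *			unsigned nr_pages)
 *	{
 *		return iomap_readpages(mapping, pages, nr_pages,
 *				&example_iomap_ops);
 *	}
 */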
  441. /*
  442. * iomap_is_partially_uptodate checks whether blocks within a page are
  443. * uptodate or not.
  444. *
  445. * Returns true if all blocks which correspond to a file portion
  446. * we want to read within the page are uptodate.
  447. */
  448. int
  449. iomap_is_partially_uptodate(struct page *page, unsigned long from,
  450. unsigned long count)
  451. {
  452. struct iomap_page *iop = to_iomap_page(page);
  453. struct inode *inode = page->mapping->host;
  454. unsigned len, first, last;
  455. unsigned i;
  456. /* Limit range to one page */
  457. len = min_t(unsigned, PAGE_SIZE - from, count);
  458. /* First and last blocks in range within page */
  459. first = from >> inode->i_blkbits;
  460. last = (from + len - 1) >> inode->i_blkbits;
  461. if (iop) {
  462. for (i = first; i <= last; i++)
  463. if (!test_bit(i, iop->uptodate))
  464. return 0;
  465. return 1;
  466. }
  467. return 0;
  468. }
  469. EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  470. int
  471. iomap_releasepage(struct page *page, gfp_t gfp_mask)
  472. {
  473. /*
  474. * mm accommodates an old ext3 case where clean pages might not have had
  475. * the dirty bit cleared. Thus, it can send actual dirty pages to
  476. * ->releasepage() via shrink_active_list(), skip those here.
  477. */
  478. if (PageDirty(page) || PageWriteback(page))
  479. return 0;
  480. iomap_page_release(page);
  481. return 1;
  482. }
  483. EXPORT_SYMBOL_GPL(iomap_releasepage);
  484. void
  485. iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
  486. {
  487. /*
  488. * If we are invalidating the entire page, clear the dirty state from it
  489. * and release it to avoid unnecessary buildup of the LRU.
  490. */
  491. if (offset == 0 && len == PAGE_SIZE) {
  492. WARN_ON_ONCE(PageWriteback(page));
  493. cancel_dirty_page(page);
  494. iomap_page_release(page);
  495. }
  496. }
  497. EXPORT_SYMBOL_GPL(iomap_invalidatepage);
  498. #ifdef CONFIG_MIGRATION
  499. int
  500. iomap_migrate_page(struct address_space *mapping, struct page *newpage,
  501. struct page *page, enum migrate_mode mode)
  502. {
  503. int ret;
  504. ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
  505. if (ret != MIGRATEPAGE_SUCCESS)
  506. return ret;
  507. if (page_has_private(page)) {
  508. ClearPagePrivate(page);
  509. get_page(newpage);
  510. set_page_private(newpage, page_private(page));
  511. set_page_private(page, 0);
  512. put_page(page);
  513. SetPagePrivate(newpage);
  514. }
  515. if (mode != MIGRATE_SYNC_NO_COPY)
  516. migrate_page_copy(newpage, page);
  517. else
  518. migrate_page_states(newpage, page);
  519. return MIGRATEPAGE_SUCCESS;
  520. }
  521. EXPORT_SYMBOL_GPL(iomap_migrate_page);
  522. #endif /* CONFIG_MIGRATION */
  523. static void
  524. iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
  525. {
  526. loff_t i_size = i_size_read(inode);
  527. /*
  528. * Only truncate newly allocated pages beyond EOF, even if the
  529. * write started inside the existing inode size.
  530. */
  531. if (pos + len > i_size)
  532. truncate_pagecache_range(inode, max(pos, i_size), pos + len);
  533. }
  534. static int
  535. iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
  536. unsigned poff, unsigned plen, unsigned from, unsigned to,
  537. struct iomap *iomap)
  538. {
  539. struct bio_vec bvec;
  540. struct bio bio;
  541. if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
  542. zero_user_segments(page, poff, from, to, poff + plen);
  543. iomap_set_range_uptodate(page, poff, plen);
  544. return 0;
  545. }
  546. bio_init(&bio, &bvec, 1);
  547. bio.bi_opf = REQ_OP_READ;
  548. bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
  549. bio_set_dev(&bio, iomap->bdev);
  550. __bio_add_page(&bio, page, plen, poff);
  551. return submit_bio_wait(&bio);
  552. }
  553. static int
  554. __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
  555. struct page *page, struct iomap *iomap)
  556. {
  557. struct iomap_page *iop = iomap_page_create(inode, page);
  558. loff_t block_size = i_blocksize(inode);
  559. loff_t block_start = pos & ~(block_size - 1);
  560. loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
  561. unsigned from = offset_in_page(pos), to = from + len, poff, plen;
  562. int status = 0;
  563. if (PageUptodate(page))
  564. return 0;
  565. do {
  566. iomap_adjust_read_range(inode, iop, &block_start,
  567. block_end - block_start, &poff, &plen);
  568. if (plen == 0)
  569. break;
  570. if ((from > poff && from < poff + plen) ||
  571. (to > poff && to < poff + plen)) {
  572. status = iomap_read_page_sync(inode, block_start, page,
  573. poff, plen, from, to, iomap);
  574. if (status)
  575. break;
  576. }
  577. } while ((block_start += plen) < block_end);
  578. return status;
  579. }
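/*
 * Worked example (illustrative): with a 1024-byte block size, a 100-byte
 * write at pos 1500 rounds block_start down to 1024 and block_end up to
 * 2048. Since the write neither starts nor ends on a block boundary
 * (from = 1500, to = 1600), the enclosing block is read in synchronously
 * above, so the untouched head and tail of the block are preserved before
 * the copy-in.
 */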
  580. static int
  581. iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
  582. struct page **pagep, struct iomap *iomap)
  583. {
  584. pgoff_t index = pos >> PAGE_SHIFT;
  585. struct page *page;
  586. int status = 0;
  587. BUG_ON(pos + len > iomap->offset + iomap->length);
  588. if (fatal_signal_pending(current))
  589. return -EINTR;
  590. page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
  591. if (!page)
  592. return -ENOMEM;
  593. if (iomap->type == IOMAP_INLINE)
  594. iomap_read_inline_data(inode, page, iomap);
  595. else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
  596. status = __block_write_begin_int(page, pos, len, NULL, iomap);
  597. else
  598. status = __iomap_write_begin(inode, pos, len, page, iomap);
  599. if (unlikely(status)) {
  600. unlock_page(page);
  601. put_page(page);
  602. page = NULL;
  603. iomap_write_failed(inode, pos, len);
  604. }
  605. *pagep = page;
  606. return status;
  607. }
  608. int
  609. iomap_set_page_dirty(struct page *page)
  610. {
  611. struct address_space *mapping = page_mapping(page);
  612. int newly_dirty;
  613. if (unlikely(!mapping))
  614. return !TestSetPageDirty(page);
  615. /*
  616. * Lock out page->mem_cgroup migration to keep PageDirty
  617. * synchronized with per-memcg dirty page counters.
  618. */
  619. lock_page_memcg(page);
  620. newly_dirty = !TestSetPageDirty(page);
  621. if (newly_dirty)
  622. __set_page_dirty(page, mapping, 0);
  623. unlock_page_memcg(page);
  624. if (newly_dirty)
  625. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  626. return newly_dirty;
  627. }
  628. EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
  629. static int
  630. __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  631. unsigned copied, struct page *page, struct iomap *iomap)
  632. {
  633. flush_dcache_page(page);
  634. /*
  635. * The blocks that were entirely written will now be uptodate, so we
  636. * don't have to worry about a readpage reading them and overwriting a
  637. * partial write. However if we have encountered a short write and only
  638. * partially written into a block, it will not be marked uptodate, so a
  639. * readpage might come in and destroy our partial write.
  640. *
  641. * Do the simplest thing, and just treat any short write to a non
  642. * uptodate page as a zero-length write, and force the caller to redo
  643. * the whole thing.
  644. */
  645. if (unlikely(copied < len && !PageUptodate(page))) {
  646. copied = 0;
  647. } else {
  648. iomap_set_range_uptodate(page, offset_in_page(pos), len);
  649. iomap_set_page_dirty(page);
  650. }
  651. return __generic_write_end(inode, pos, copied, page);
  652. }
  653. static int
  654. iomap_write_end_inline(struct inode *inode, struct page *page,
  655. struct iomap *iomap, loff_t pos, unsigned copied)
  656. {
  657. void *addr;
  658. WARN_ON_ONCE(!PageUptodate(page));
  659. BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
  660. addr = kmap_atomic(page);
  661. memcpy(iomap->inline_data + pos, addr + pos, copied);
  662. kunmap_atomic(addr);
  663. mark_inode_dirty(inode);
  664. __generic_write_end(inode, pos, copied, page);
  665. return copied;
  666. }
  667. static int
  668. iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  669. unsigned copied, struct page *page, struct iomap *iomap)
  670. {
  671. int ret;
  672. if (iomap->type == IOMAP_INLINE) {
  673. ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
  674. } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
  675. ret = generic_write_end(NULL, inode->i_mapping, pos, len,
  676. copied, page, NULL);
  677. } else {
  678. ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
  679. }
  680. if (iomap->page_done)
  681. iomap->page_done(inode, pos, copied, page, iomap);
  682. if (ret < len)
  683. iomap_write_failed(inode, pos, len);
  684. return ret;
  685. }
  686. static loff_t
  687. iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  688. struct iomap *iomap)
  689. {
  690. struct iov_iter *i = data;
  691. long status = 0;
  692. ssize_t written = 0;
  693. unsigned int flags = AOP_FLAG_NOFS;
  694. do {
  695. struct page *page;
  696. unsigned long offset; /* Offset into pagecache page */
  697. unsigned long bytes; /* Bytes to write to page */
  698. size_t copied; /* Bytes copied from user */
  699. offset = offset_in_page(pos);
  700. bytes = min_t(unsigned long, PAGE_SIZE - offset,
  701. iov_iter_count(i));
  702. again:
  703. if (bytes > length)
  704. bytes = length;
  705. /*
  706. * Bring in the user page that we will copy from _first_.
  707. * Otherwise there's a nasty deadlock on copying from the
  708. * same page as we're writing to, without it being marked
  709. * up-to-date.
  710. *
  711. * Not only is this an optimisation, but it is also required
  712. * to check that the address is actually valid, when atomic
  713. * usercopies are used, below.
  714. */
  715. if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
  716. status = -EFAULT;
  717. break;
  718. }
  719. status = iomap_write_begin(inode, pos, bytes, flags, &page,
  720. iomap);
  721. if (unlikely(status))
  722. break;
  723. if (mapping_writably_mapped(inode->i_mapping))
  724. flush_dcache_page(page);
  725. copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
  726. flush_dcache_page(page);
  727. status = iomap_write_end(inode, pos, bytes, copied, page,
  728. iomap);
  729. if (unlikely(status < 0))
  730. break;
  731. copied = status;
  732. cond_resched();
  733. iov_iter_advance(i, copied);
  734. if (unlikely(copied == 0)) {
  735. /*
  736. * If we were unable to copy any data at all, we must
  737. * fall back to a single segment length write.
  738. *
  739. * If we didn't fall back here, we could livelock
  740. * because not all segments in the iov can be copied at
  741. * once without a pagefault.
  742. */
  743. bytes = min_t(unsigned long, PAGE_SIZE - offset,
  744. iov_iter_single_seg_count(i));
  745. goto again;
  746. }
  747. pos += copied;
  748. written += copied;
  749. length -= copied;
  750. balance_dirty_pages_ratelimited(inode->i_mapping);
  751. } while (iov_iter_count(i) && length);
  752. return written ? written : status;
  753. }
  754. ssize_t
  755. iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
  756. const struct iomap_ops *ops)
  757. {
  758. struct inode *inode = iocb->ki_filp->f_mapping->host;
  759. loff_t pos = iocb->ki_pos, ret = 0, written = 0;
  760. while (iov_iter_count(iter)) {
  761. ret = iomap_apply(inode, pos, iov_iter_count(iter),
  762. IOMAP_WRITE, ops, iter, iomap_write_actor);
  763. if (ret <= 0)
  764. break;
  765. pos += ret;
  766. written += ret;
  767. }
  768. return written ? written : ret;
  769. }
  770. EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
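/*
 * Sketch (assumed caller, not defined in this file): a filesystem usually
 * calls this from its ->write_iter method with the inode lock held and the
 * generic checks already applied, e.g.:
 *
 *	inode_lock(inode);
 *	ret = generic_write_checks(iocb, from);
 *	if (ret > 0)
 *		ret = iomap_file_buffered_write(iocb, from,
 *				&example_iomap_ops);
 *	inode_unlock(inode);
 */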
  771. static struct page *
  772. __iomap_read_page(struct inode *inode, loff_t offset)
  773. {
  774. struct address_space *mapping = inode->i_mapping;
  775. struct page *page;
  776. page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
  777. if (IS_ERR(page))
  778. return page;
  779. if (!PageUptodate(page)) {
  780. put_page(page);
  781. return ERR_PTR(-EIO);
  782. }
  783. return page;
  784. }
  785. static loff_t
  786. iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  787. struct iomap *iomap)
  788. {
  789. long status = 0;
  790. ssize_t written = 0;
  791. do {
  792. struct page *page, *rpage;
  793. unsigned long offset; /* Offset into pagecache page */
  794. unsigned long bytes; /* Bytes to write to page */
  795. offset = offset_in_page(pos);
  796. bytes = min_t(loff_t, PAGE_SIZE - offset, length);
  797. rpage = __iomap_read_page(inode, pos);
  798. if (IS_ERR(rpage))
  799. return PTR_ERR(rpage);
  800. status = iomap_write_begin(inode, pos, bytes,
  801. AOP_FLAG_NOFS, &page, iomap);
  802. put_page(rpage);
  803. if (unlikely(status))
  804. return status;
  805. WARN_ON_ONCE(!PageUptodate(page));
  806. status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
  807. if (unlikely(status <= 0)) {
  808. if (WARN_ON_ONCE(status == 0))
  809. return -EIO;
  810. return status;
  811. }
  812. cond_resched();
  813. pos += status;
  814. written += status;
  815. length -= status;
  816. balance_dirty_pages_ratelimited(inode->i_mapping);
  817. } while (length);
  818. return written;
  819. }
  820. int
  821. iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
  822. const struct iomap_ops *ops)
  823. {
  824. loff_t ret;
  825. while (len) {
  826. ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
  827. iomap_dirty_actor);
  828. if (ret <= 0)
  829. return ret;
  830. pos += ret;
  831. len -= ret;
  832. }
  833. return 0;
  834. }
  835. EXPORT_SYMBOL_GPL(iomap_file_dirty);
  836. static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
  837. unsigned bytes, struct iomap *iomap)
  838. {
  839. struct page *page;
  840. int status;
  841. status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
  842. iomap);
  843. if (status)
  844. return status;
  845. zero_user(page, offset, bytes);
  846. mark_page_accessed(page);
  847. return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
  848. }
  849. static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
  850. struct iomap *iomap)
  851. {
  852. return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
  853. iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
  854. }
  855. static loff_t
  856. iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
  857. void *data, struct iomap *iomap)
  858. {
  859. bool *did_zero = data;
  860. loff_t written = 0;
  861. int status;
  862. /* already zeroed? we're done. */
  863. if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  864. return count;
  865. do {
  866. unsigned offset, bytes;
  867. offset = offset_in_page(pos);
  868. bytes = min_t(loff_t, PAGE_SIZE - offset, count);
  869. if (IS_DAX(inode))
  870. status = iomap_dax_zero(pos, offset, bytes, iomap);
  871. else
  872. status = iomap_zero(inode, pos, offset, bytes, iomap);
  873. if (status < 0)
  874. return status;
  875. pos += bytes;
  876. count -= bytes;
  877. written += bytes;
  878. if (did_zero)
  879. *did_zero = true;
  880. } while (count > 0);
  881. return written;
  882. }
  883. int
  884. iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  885. const struct iomap_ops *ops)
  886. {
  887. loff_t ret;
  888. while (len > 0) {
  889. ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
  890. ops, did_zero, iomap_zero_range_actor);
  891. if (ret <= 0)
  892. return ret;
  893. pos += ret;
  894. len -= ret;
  895. }
  896. return 0;
  897. }
  898. EXPORT_SYMBOL_GPL(iomap_zero_range);
  899. int
  900. iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  901. const struct iomap_ops *ops)
  902. {
  903. unsigned int blocksize = i_blocksize(inode);
  904. unsigned int off = pos & (blocksize - 1);
  905. /* Block boundary? Nothing to do */
  906. if (!off)
  907. return 0;
  908. return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
  909. }
  910. EXPORT_SYMBOL_GPL(iomap_truncate_page);
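/*
 * Worked example (illustrative): with a 4096-byte block size, truncating to
 * pos 6000 gives off = 6000 & 4095 = 1904, so the remaining 4096 - 1904 =
 * 2192 bytes of the final block (file offsets 6000..8191) are zeroed via
 * iomap_zero_range(); a pos that is already block aligned returns without
 * doing any work.
 */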
  911. static loff_t
  912. iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
  913. void *data, struct iomap *iomap)
  914. {
  915. struct page *page = data;
  916. int ret;
  917. if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
  918. ret = __block_write_begin_int(page, pos, length, NULL, iomap);
  919. if (ret)
  920. return ret;
  921. block_commit_write(page, 0, length);
  922. } else {
  923. WARN_ON_ONCE(!PageUptodate(page));
  924. iomap_page_create(inode, page);
  925. set_page_dirty(page);
  926. }
  927. return length;
  928. }
  929. int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
  930. {
  931. struct page *page = vmf->page;
  932. struct inode *inode = file_inode(vmf->vma->vm_file);
  933. unsigned long length;
  934. loff_t offset, size;
  935. ssize_t ret;
  936. lock_page(page);
  937. size = i_size_read(inode);
  938. if ((page->mapping != inode->i_mapping) ||
  939. (page_offset(page) > size)) {
  940. /* We overload EFAULT to mean page got truncated */
  941. ret = -EFAULT;
  942. goto out_unlock;
  943. }
  944. /* page is wholly or partially inside EOF */
  945. if (((page->index + 1) << PAGE_SHIFT) > size)
  946. length = offset_in_page(size);
  947. else
  948. length = PAGE_SIZE;
  949. offset = page_offset(page);
  950. while (length > 0) {
  951. ret = iomap_apply(inode, offset, length,
  952. IOMAP_WRITE | IOMAP_FAULT, ops, page,
  953. iomap_page_mkwrite_actor);
  954. if (unlikely(ret <= 0))
  955. goto out_unlock;
  956. offset += ret;
  957. length -= ret;
  958. }
  959. wait_for_stable_page(page);
  960. return VM_FAULT_LOCKED;
  961. out_unlock:
  962. unlock_page(page);
  963. return block_page_mkwrite_return(ret);
  964. }
  965. EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
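/*
 * Sketch (assumed usage, not defined in this file): filesystems call this
 * from their vm_operations_struct ->page_mkwrite handler, e.g.:
 *
 *	static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return iomap_page_mkwrite(vmf, &example_iomap_ops);
 *	}
 */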
  966. struct fiemap_ctx {
  967. struct fiemap_extent_info *fi;
  968. struct iomap prev;
  969. };
  970. static int iomap_to_fiemap(struct fiemap_extent_info *fi,
  971. struct iomap *iomap, u32 flags)
  972. {
  973. switch (iomap->type) {
  974. case IOMAP_HOLE:
  975. /* skip holes */
  976. return 0;
  977. case IOMAP_DELALLOC:
  978. flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
  979. break;
  980. case IOMAP_MAPPED:
  981. break;
  982. case IOMAP_UNWRITTEN:
  983. flags |= FIEMAP_EXTENT_UNWRITTEN;
  984. break;
  985. case IOMAP_INLINE:
  986. flags |= FIEMAP_EXTENT_DATA_INLINE;
  987. break;
  988. }
  989. if (iomap->flags & IOMAP_F_MERGED)
  990. flags |= FIEMAP_EXTENT_MERGED;
  991. if (iomap->flags & IOMAP_F_SHARED)
  992. flags |= FIEMAP_EXTENT_SHARED;
  993. return fiemap_fill_next_extent(fi, iomap->offset,
  994. iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
  995. iomap->length, flags);
  996. }
  997. static loff_t
  998. iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  999. struct iomap *iomap)
  1000. {
  1001. struct fiemap_ctx *ctx = data;
  1002. loff_t ret = length;
  1003. if (iomap->type == IOMAP_HOLE)
  1004. return length;
  1005. ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
  1006. ctx->prev = *iomap;
  1007. switch (ret) {
  1008. case 0: /* success */
  1009. return length;
  1010. case 1: /* extent array full */
  1011. return 0;
  1012. default:
  1013. return ret;
  1014. }
  1015. }
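/*
 * Note on the actor above: it reports the previously stashed extent
 * (ctx->prev) rather than the one it was just handed, and saves the current
 * extent for the next call. iomap_fiemap() below emits the final stashed
 * extent itself so that it can carry the FIEMAP_EXTENT_LAST flag.
 */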
  1016. int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
  1017. loff_t start, loff_t len, const struct iomap_ops *ops)
  1018. {
  1019. struct fiemap_ctx ctx;
  1020. loff_t ret;
  1021. memset(&ctx, 0, sizeof(ctx));
  1022. ctx.fi = fi;
  1023. ctx.prev.type = IOMAP_HOLE;
  1024. ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
  1025. if (ret)
  1026. return ret;
  1027. if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
  1028. ret = filemap_write_and_wait(inode->i_mapping);
  1029. if (ret)
  1030. return ret;
  1031. }
  1032. while (len > 0) {
  1033. ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
  1034. iomap_fiemap_actor);
  1035. /* inode with no (attribute) mapping will give ENOENT */
  1036. if (ret == -ENOENT)
  1037. break;
  1038. if (ret < 0)
  1039. return ret;
  1040. if (ret == 0)
  1041. break;
  1042. start += ret;
  1043. len -= ret;
  1044. }
  1045. if (ctx.prev.type != IOMAP_HOLE) {
  1046. ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
  1047. if (ret < 0)
  1048. return ret;
  1049. }
  1050. return 0;
  1051. }
  1052. EXPORT_SYMBOL_GPL(iomap_fiemap);
  1053. /*
  1054. * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
  1055. * Returns true if found and updates @lastoff to the offset in file.
  1056. */
  1057. static bool
  1058. page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
  1059. int whence)
  1060. {
  1061. const struct address_space_operations *ops = inode->i_mapping->a_ops;
  1062. unsigned int bsize = i_blocksize(inode), off;
  1063. bool seek_data = whence == SEEK_DATA;
  1064. loff_t poff = page_offset(page);
  1065. if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
  1066. return false;
  1067. if (*lastoff < poff) {
  1068. /*
  1069. * Last offset smaller than the start of the page means we found
  1070. * a hole:
  1071. */
  1072. if (whence == SEEK_HOLE)
  1073. return true;
  1074. *lastoff = poff;
  1075. }
  1076. /*
  1077. * Just check the page unless we can and should check block ranges:
  1078. */
  1079. if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
  1080. return PageUptodate(page) == seek_data;
  1081. lock_page(page);
  1082. if (unlikely(page->mapping != inode->i_mapping))
  1083. goto out_unlock_not_found;
  1084. for (off = 0; off < PAGE_SIZE; off += bsize) {
  1085. if (offset_in_page(*lastoff) >= off + bsize)
  1086. continue;
  1087. if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
  1088. unlock_page(page);
  1089. return true;
  1090. }
  1091. *lastoff = poff + off + bsize;
  1092. }
  1093. out_unlock_not_found:
  1094. unlock_page(page);
  1095. return false;
  1096. }
  1097. /*
  1098. * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
  1099. *
  1100. * Within unwritten extents, the page cache determines which parts are holes
  1101. * and which are data: uptodate buffer heads count as data; everything else
  1102. * counts as a hole.
  1103. *
  1104. * Returns the resulting offset on success, and -ENOENT otherwise.
  1105. */
  1106. static loff_t
  1107. page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
  1108. int whence)
  1109. {
  1110. pgoff_t index = offset >> PAGE_SHIFT;
  1111. pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
  1112. loff_t lastoff = offset;
  1113. struct pagevec pvec;
  1114. if (length <= 0)
  1115. return -ENOENT;
  1116. pagevec_init(&pvec);
  1117. do {
  1118. unsigned nr_pages, i;
  1119. nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
  1120. end - 1);
  1121. if (nr_pages == 0)
  1122. break;
  1123. for (i = 0; i < nr_pages; i++) {
  1124. struct page *page = pvec.pages[i];
  1125. if (page_seek_hole_data(inode, page, &lastoff, whence))
  1126. goto check_range;
  1127. lastoff = page_offset(page) + PAGE_SIZE;
  1128. }
  1129. pagevec_release(&pvec);
  1130. } while (index < end);
  1131. /* When no page at lastoff and we are not done, we found a hole. */
  1132. if (whence != SEEK_HOLE)
  1133. goto not_found;
  1134. check_range:
  1135. if (lastoff < offset + length)
  1136. goto out;
  1137. not_found:
  1138. lastoff = -ENOENT;
  1139. out:
  1140. pagevec_release(&pvec);
  1141. return lastoff;
  1142. }
  1143. static loff_t
  1144. iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
  1145. void *data, struct iomap *iomap)
  1146. {
  1147. switch (iomap->type) {
  1148. case IOMAP_UNWRITTEN:
  1149. offset = page_cache_seek_hole_data(inode, offset, length,
  1150. SEEK_HOLE);
  1151. if (offset < 0)
  1152. return length;
  1153. /* fall through */
  1154. case IOMAP_HOLE:
  1155. *(loff_t *)data = offset;
  1156. return 0;
  1157. default:
  1158. return length;
  1159. }
  1160. }
  1161. loff_t
  1162. iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  1163. {
  1164. loff_t size = i_size_read(inode);
  1165. loff_t length = size - offset;
  1166. loff_t ret;
  1167. /* Nothing to be found before or beyond the end of the file. */
  1168. if (offset < 0 || offset >= size)
  1169. return -ENXIO;
  1170. while (length > 0) {
  1171. ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  1172. &offset, iomap_seek_hole_actor);
  1173. if (ret < 0)
  1174. return ret;
  1175. if (ret == 0)
  1176. break;
  1177. offset += ret;
  1178. length -= ret;
  1179. }
  1180. return offset;
  1181. }
  1182. EXPORT_SYMBOL_GPL(iomap_seek_hole);
  1183. static loff_t
  1184. iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
  1185. void *data, struct iomap *iomap)
  1186. {
  1187. switch (iomap->type) {
  1188. case IOMAP_HOLE:
  1189. return length;
  1190. case IOMAP_UNWRITTEN:
  1191. offset = page_cache_seek_hole_data(inode, offset, length,
  1192. SEEK_DATA);
  1193. if (offset < 0)
  1194. return length;
  1195. /* fall through */
  1196. default:
  1197. *(loff_t *)data = offset;
  1198. return 0;
  1199. }
  1200. }
  1201. loff_t
  1202. iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  1203. {
  1204. loff_t size = i_size_read(inode);
  1205. loff_t length = size - offset;
  1206. loff_t ret;
  1207. /* Nothing to be found before or beyond the end of the file. */
  1208. if (offset < 0 || offset >= size)
  1209. return -ENXIO;
  1210. while (length > 0) {
  1211. ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  1212. &offset, iomap_seek_data_actor);
  1213. if (ret < 0)
  1214. return ret;
  1215. if (ret == 0)
  1216. break;
  1217. offset += ret;
  1218. length -= ret;
  1219. }
  1220. if (length <= 0)
  1221. return -ENXIO;
  1222. return offset;
  1223. }
  1224. EXPORT_SYMBOL_GPL(iomap_seek_data);
  1225. /*
  1226. * Private flags for iomap_dio, must not overlap with the public ones in
  1227. * iomap.h:
  1228. */
  1229. #define IOMAP_DIO_WRITE_FUA (1 << 28)
  1230. #define IOMAP_DIO_NEED_SYNC (1 << 29)
  1231. #define IOMAP_DIO_WRITE (1 << 30)
  1232. #define IOMAP_DIO_DIRTY (1 << 31)
  1233. struct iomap_dio {
  1234. struct kiocb *iocb;
  1235. iomap_dio_end_io_t *end_io;
  1236. loff_t i_size;
  1237. loff_t size;
  1238. atomic_t ref;
  1239. unsigned flags;
  1240. int error;
  1241. bool wait_for_completion;
  1242. union {
  1243. /* used during submission and for synchronous completion: */
  1244. struct {
  1245. struct iov_iter *iter;
  1246. struct task_struct *waiter;
  1247. struct request_queue *last_queue;
  1248. blk_qc_t cookie;
  1249. } submit;
  1250. /* used for aio completion: */
  1251. struct {
  1252. struct work_struct work;
  1253. } aio;
  1254. };
  1255. };
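/*
 * Reference counting summary (of the code below): every submitted bio,
 * including the zeroing bios issued by iomap_dio_zero(), takes a reference
 * on the dio with atomic_inc(&dio->ref), and iomap_dio_bio_end_io() drops
 * one per completed bio. When the count reaches zero there, the dio is
 * completed either by waking the synchronous waiter or by running the aio
 * completion work (queued for writes, called directly for reads).
 */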
  1256. static ssize_t iomap_dio_complete(struct iomap_dio *dio)
  1257. {
  1258. struct kiocb *iocb = dio->iocb;
  1259. struct inode *inode = file_inode(iocb->ki_filp);
  1260. loff_t offset = iocb->ki_pos;
  1261. ssize_t ret;
  1262. if (dio->end_io) {
  1263. ret = dio->end_io(iocb,
  1264. dio->error ? dio->error : dio->size,
  1265. dio->flags);
  1266. } else {
  1267. ret = dio->error;
  1268. }
  1269. if (likely(!ret)) {
  1270. ret = dio->size;
  1271. /* check for short read */
  1272. if (offset + ret > dio->i_size &&
  1273. !(dio->flags & IOMAP_DIO_WRITE))
  1274. ret = dio->i_size - offset;
  1275. iocb->ki_pos += ret;
  1276. }
  1277. /*
  1278. * Try again to invalidate clean pages which might have been cached by
  1279. * non-direct readahead, or faulted in by get_user_pages() if the source
  1280. * of the write was an mmap'ed region of the file we're writing. Either
  1281. * one is a pretty crazy thing to do, so we don't support it 100%. If
  1282. * this invalidation fails, tough, the write still worked...
  1283. *
  1284. * And this page cache invalidation has to be after dio->end_io(), as
  1285. * some filesystems convert unwritten extents to real allocations in
  1286. * end_io() when necessary, otherwise a racing buffer read would cache
  1287. * zeros from unwritten extents.
  1288. */
  1289. if (!dio->error &&
  1290. (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
  1291. int err;
  1292. err = invalidate_inode_pages2_range(inode->i_mapping,
  1293. offset >> PAGE_SHIFT,
  1294. (offset + dio->size - 1) >> PAGE_SHIFT);
  1295. if (err)
  1296. dio_warn_stale_pagecache(iocb->ki_filp);
  1297. }
  1298. /*
  1299. * If this is a DSYNC write, make sure we push it to stable storage now
  1300. * that we've written data.
  1301. */
  1302. if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
  1303. ret = generic_write_sync(iocb, ret);
  1304. inode_dio_end(file_inode(iocb->ki_filp));
  1305. kfree(dio);
  1306. return ret;
  1307. }
  1308. static void iomap_dio_complete_work(struct work_struct *work)
  1309. {
  1310. struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
  1311. struct kiocb *iocb = dio->iocb;
  1312. iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
  1313. }
  1314. /*
  1315. * Set an error in the dio if none is set yet. We have to use cmpxchg
  1316. * as the submission context and the completion context(s) can race to
  1317. * update the error.
  1318. */
  1319. static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
  1320. {
  1321. cmpxchg(&dio->error, 0, ret);
  1322. }
  1323. static void iomap_dio_bio_end_io(struct bio *bio)
  1324. {
  1325. struct iomap_dio *dio = bio->bi_private;
  1326. bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
  1327. if (bio->bi_status)
  1328. iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
  1329. if (atomic_dec_and_test(&dio->ref)) {
  1330. if (dio->wait_for_completion) {
  1331. struct task_struct *waiter = dio->submit.waiter;
  1332. WRITE_ONCE(dio->submit.waiter, NULL);
  1333. wake_up_process(waiter);
  1334. } else if (dio->flags & IOMAP_DIO_WRITE) {
  1335. struct inode *inode = file_inode(dio->iocb->ki_filp);
  1336. INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
  1337. queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
  1338. } else {
  1339. iomap_dio_complete_work(&dio->aio.work);
  1340. }
  1341. }
  1342. if (should_dirty) {
  1343. bio_check_pages_dirty(bio);
  1344. } else {
  1345. struct bio_vec *bvec;
  1346. int i;
  1347. bio_for_each_segment_all(bvec, bio, i)
  1348. put_page(bvec->bv_page);
  1349. bio_put(bio);
  1350. }
  1351. }
  1352. static blk_qc_t
  1353. iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
  1354. unsigned len)
  1355. {
  1356. struct page *page = ZERO_PAGE(0);
  1357. struct bio *bio;
  1358. bio = bio_alloc(GFP_KERNEL, 1);
  1359. bio_set_dev(bio, iomap->bdev);
  1360. bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
  1361. bio->bi_private = dio;
  1362. bio->bi_end_io = iomap_dio_bio_end_io;
  1363. get_page(page);
  1364. __bio_add_page(bio, page, len, 0);
  1365. bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
  1366. atomic_inc(&dio->ref);
  1367. return submit_bio(bio);
  1368. }
static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device supports FUA. This allows us to avoid
		 * cache flushes on IO completion.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
			use_fua = true;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}
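
	/*
	 * Build bios from the trimmed iter and submit them until the extent
	 * is covered, bailing out early if another bio has already failed.
	 */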
	do {
		size_t n;

		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			return 0;
		}

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio_set_dev(bio, iomap->bdev);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
			if (use_fua)
				bio->bi_opf |= REQ_FUA;
			else
				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
			task_io_account_write(n);
		} else {
			bio->bi_opf = REQ_OP_READ;
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		iov_iter_advance(dio->submit.iter, n);

		dio->size += n;
		pos += n;
		copied += n;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}
	return copied ? copied : ret;
}
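
/*
 * Reads from a hole have nothing to fetch from disk: just zero-fill the
 * user buffer for the length of the hole.
 */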
static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
	length = iov_iter_zero(length, dio->submit.iter);
	dio->size += length;
	return length;
}
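
/*
 * Inline extents are backed by data stored in the inode itself, so copy
 * directly between the user iter and iomap->inline_data instead of issuing
 * block I/O.  Writes extend i_size and dirty the inode as needed.
 */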
static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
	size_t copied;

	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
			memset(iomap->inline_data + size, 0, pos - size);
		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
	}
	dio->size += copied;
	return copied;
}
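
/*
 * Per-extent dispatcher for iomap_apply(): pick the hole, bio or inline
 * actor based on the extent type reported by the filesystem.
 */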
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_actor(length, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_actor(length, dio);
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_MAPPED:
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_INLINE:
		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, start = pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	bool wait_for_completion = is_sync_kiocb(iocb);
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.cookie = BLK_QC_T_NONE;
	dio->submit.last_queue = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO.  Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}
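
	/*
	 * Nonblocking direct I/O must not wait for the page cache: if any
	 * pages exist in the range, bail out with -EAGAIN rather than
	 * waiting for the writeback and invalidation below.
	 */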
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, start, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		goto out_free_dio;

	/*
	 * Try to invalidate cache pages for the range we're direct
	 * writing.  If this invalidation fails, tough, the write will
	 * still work, but racing two incompatible write paths is a
	 * pretty crazy thing to do, so we don't support it 100%.
	 */
	ret = invalidate_inode_pages2_range(mapping,
			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	if (ret)
		dio_warn_stale_pagecache(iocb->ki_filp);
	ret = 0;

	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}
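
	/*
	 * Everything is set up: mark the direct I/O as in flight and walk
	 * the range under a plug, submitting bios one mapping at a time.
	 */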
	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK) {
				wait_for_completion = true;
				ret = 0;
			}
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
			/*
			 * We only report that we've read data up to i_size.
			 * Revert iter to a state corresponding to that as
			 * some callers (such as splice code) rely on it.
			 */
			iov_iter_revert(iter, pos - dio->i_size);
			break;
		}
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion)
			return -EIOCBQUEUED;
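
		/*
		 * Synchronous iocb: wait here until the bio completion
		 * handler drops the final reference and clears
		 * submit.waiter.  For IOCB_HIPRI requests we poll the last
		 * submitted queue, falling back to io_schedule() when
		 * polling makes no progress.
		 */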
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);

/* Swapfile activation */

#ifdef CONFIG_SWAP
struct iomap_swapfile_info {
	struct iomap iomap;		/* accumulated iomap */
	struct swap_info_struct *sis;
	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
	unsigned long nr_pages;		/* number of pages collected */
	int nr_extents;			/* extent count */
};

/*
 * Collect physical extents for this swap file.  Physical extents reported to
 * the swap code must be trimmed to align to a page boundary.  The logical
 * offset within the file is irrelevant since the swapfile code maps logical
 * page numbers of the swap device to the physical page-aligned extents.
 */
static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
{
	struct iomap *iomap = &isi->iomap;
	unsigned long nr_pages;
	uint64_t first_ppage;
	uint64_t first_ppage_reported;
	uint64_t next_ppage;
	int error;

	/*
	 * Round the start up and the end down so that the physical
	 * extent aligns to a page boundary.
	 */
	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
			PAGE_SHIFT;

	/* Skip too-short physical extents. */
	if (first_ppage >= next_ppage)
		return 0;
	nr_pages = next_ppage - first_ppage;

	/*
	 * Calculate how much swap space we're adding; the first page contains
	 * the swap header and doesn't count.  The mm still wants that first
	 * page fed to add_swap_extent, however.
	 */
	first_ppage_reported = first_ppage;
	if (iomap->offset == 0)
		first_ppage_reported++;
	if (isi->lowest_ppage > first_ppage_reported)
		isi->lowest_ppage = first_ppage_reported;
	if (isi->highest_ppage < (next_ppage - 1))
		isi->highest_ppage = next_ppage - 1;

	/* Add extent, set up for the next call. */
	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
	if (error < 0)
		return error;
	isi->nr_extents += error;
	isi->nr_pages += nr_pages;
	return 0;
}

/*
 * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
 * swap only cares about contiguous page-aligned physical extents and makes no
 * distinction between written and unwritten extents.
 */
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
		loff_t count, void *data, struct iomap *iomap)
{
	struct iomap_swapfile_info *isi = data;
	int error;

	switch (iomap->type) {
	case IOMAP_MAPPED:
	case IOMAP_UNWRITTEN:
		/* Only real or unwritten extents. */
		break;
	case IOMAP_INLINE:
		/* No inline data. */
		pr_err("swapon: file is inline\n");
		return -EINVAL;
	default:
		pr_err("swapon: file has unallocated extents\n");
		return -EINVAL;
	}

	/* No uncommitted metadata or shared blocks. */
	if (iomap->flags & IOMAP_F_DIRTY) {
		pr_err("swapon: file is not committed\n");
		return -EINVAL;
	}
	if (iomap->flags & IOMAP_F_SHARED) {
		pr_err("swapon: file has shared extents\n");
		return -EINVAL;
	}

	/* Only one bdev per swap file. */
	if (iomap->bdev != isi->sis->bdev) {
		pr_err("swapon: file is on multiple devices\n");
		return -EINVAL;
	}

	if (isi->iomap.length == 0) {
		/* No accumulated extent, so just store it. */
		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
		/* Append this to the accumulated extent. */
		isi->iomap.length += iomap->length;
	} else {
		/* Otherwise, add the retained iomap and store this one. */
		error = iomap_swapfile_add_extent(isi);
		if (error)
			return error;
		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
	}
	return count;
}

/*
 * Iterate a swap file's iomaps to construct physical extents that can be
 * passed to the swapfile subsystem.
 */
int iomap_swapfile_activate(struct swap_info_struct *sis,
		struct file *swap_file, sector_t *pagespan,
		const struct iomap_ops *ops)
{
	struct iomap_swapfile_info isi = {
		.sis = sis,
		.lowest_ppage = (sector_t)-1ULL,
	};
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = 0;
	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
	loff_t ret;

	/*
	 * Persist all file mapping metadata so that we won't have any
	 * IOMAP_F_DIRTY iomaps.
	 */
	ret = vfs_fsync(swap_file, 1);
	if (ret)
		return ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
				ops, &isi, iomap_swapfile_activate_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	if (isi.iomap.length) {
		ret = iomap_swapfile_add_extent(&isi);
		if (ret)
			return ret;
	}

	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
	sis->max = isi.nr_pages;
	sis->pages = isi.nr_pages - 1;
	sis->highest_bit = isi.nr_pages - 1;
	return isi.nr_extents;
}
EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
#endif /* CONFIG_SWAP */
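
/*
 * Resolve the block number for a single mapped extent on behalf of
 * iomap_bmap(); any other extent type leaves *bno untouched.
 */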
static loff_t
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	sector_t *bno = data, addr;

	if (iomap->type == IOMAP_MAPPED) {
		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
		if (addr > INT_MAX)
			WARN(1, "would truncate bmap result\n");
		else
			*bno = addr;
	}
	return 0;
}

/* legacy ->bmap interface.  0 is the error return (!) */
sector_t
iomap_bmap(struct address_space *mapping, sector_t bno,
		const struct iomap_ops *ops)
{
	struct inode *inode = mapping->host;
	loff_t pos = bno << inode->i_blkbits;
	unsigned blocksize = i_blocksize(inode);

	if (filemap_write_and_wait(mapping))
		return 0;

	bno = 0;
	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
	return bno;
}
EXPORT_SYMBOL_GPL(iomap_bmap);