// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
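/*
 * Illustrative sketch (not kernel code), assuming a 12-page anonymous
 * private mapping at "addr" and PAGE standing for the system page size:
 * advising only the middle of the mapping with a flag-changing hint,
 *
 *	madvise(addr + 4 * PAGE, 4 * PAGE, MADV_DONTDUMP);
 *
 * leaves up to three vmas behind: [addr, addr+4*PAGE) and
 * [addr+8*PAGE, addr+12*PAGE) keep the old flags, while
 * [addr+4*PAGE, addr+8*PAGE) gets VM_DONTDUMP set, unless vma_merge()
 * below can fold the pieces back into a neighbouring vma.
 */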
static long madvise_behavior(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error) {
                        /*
                         * madvise() returns EAGAIN if kernel resources, such as
                         * slab, are temporarily unavailable.
                         */
                        if (error == -ENOMEM)
                                error = -EAGAIN;
                        goto out;
                }
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error) {
                        /*
                         * madvise() returns EAGAIN if kernel resources, such as
                         * slab, are temporarily unavailable.
                         */
                        if (error == -ENOMEM)
                                error = -EAGAIN;
                        goto out;
                }
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, start, 1);
                if (error) {
                        /*
                         * madvise() returns EAGAIN if kernel resources, such as
                         * slab, are temporarily unavailable.
                         */
                        if (error == -ENOMEM)
                                error = -EAGAIN;
                        goto out;
                }
        }

        if (end != vma->vm_end) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, end, 0);
                if (error) {
                        /*
                         * madvise() returns EAGAIN if kernel resources, such as
                         * slab, are temporarily unavailable.
                         */
                        if (error == -ENOMEM)
                                error = -EAGAIN;
                        goto out;
                }
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        return error;
}

#ifdef CONFIG_SWAP
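/*
 * Walk the ptes under one pmd and start asynchronous swap-in (readahead)
 * for every swapped-out anonymous page in the range; the page reference
 * is dropped immediately, so the pages simply end up in the swap cache.
 */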
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
        unsigned long end, struct mm_walk *walk)
{
        pte_t *orig_pte;
        struct vm_area_struct *vma = walk->private;
        unsigned long index;

        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                return 0;

        for (index = start; index != end; index += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct page *page;
                spinlock_t *ptl;

                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);

                if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                                        vma, index, false);
                if (page)
                        put_page(page);
        }

        return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        struct mm_walk walk = {
                .mm = vma->vm_mm,
                .pmd_entry = swapin_walk_pmd_entry,
                .private = vma,
        };

        walk_page_range(start, end, &walk);

        lru_add_drain();        /* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        pgoff_t index;
        struct page *page;
        swp_entry_t swap;

        for (; start < end; start += PAGE_SIZE) {
                index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

                page = find_get_entry(mapping, index);
                if (!radix_tree_exceptional_entry(page)) {
                        if (page)
                                put_page(page);
                        continue;
                }
                swap = radix_to_swp_entry(page);
                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
                                                        NULL, 0, false);
                if (page)
                        put_page(page);
        }

        lru_add_drain();        /* Push any new pages onto the LRU now */
}
#endif          /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!file) {
                force_swapin_readahead(vma, start, end);
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                force_shm_swapin_readahead(vma, start, end,
                                                file->f_mapping);
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

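/*
 * MADV_FREE pte walker: for each pte in the range, drop swap entries
 * outright, and mark resident, exclusively mapped anonymous pages clean,
 * old and lazyfree so that reclaim can discard them instead of swapping
 * them out, unless they are written to again first.
 */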
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte, *pte, ptent;
        struct page *page;
        int nr_swap = 0;
        unsigned long next;

        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        goto next;

        if (pmd_trans_unstable(pmd))
                return 0;

        tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte holds a swap entry, just clear the page table
                 * entry and drop the swap slot: a later fresh fault
                 * (page allocation + zeroing) is cheaper than swapping
                 * the data back in.
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (non_swap_entry(entry))
                                continue;
                        nr_swap--;
                        free_swap_and_cache(entry);
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        continue;
                }

                page = _vm_normal_page(vma, addr, ptent, true);
                if (!page)
                        continue;

                /*
                 * If the pmd isn't transhuge but the page is THP and
                 * is owned by only this process, split it and
                 * deactivate all pages.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                goto out;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                goto out;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                goto out;
                        }
                        unlock_page(page);
                        put_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (PageSwapCache(page) || PageDirty(page)) {
                        if (!trylock_page(page))
                                continue;
                        /*
                         * If the page is shared with others, we cannot
                         * clear its PG_dirty bit.
                         */
                        if (page_mapcount(page) != 1) {
                                unlock_page(page);
                                continue;
                        }

                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        /*
                         * Some architectures (e.g. PPC) don't update the
                         * TLB on set_pte_at() and tlb_remove_tlb_entry(),
                         * so for portability clear the pte first, then
                         * remap it as old and clean.
                         */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);

                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
                mark_page_lazyfree(page);
        }
out:
        if (nr_swap) {
                if (current->mm == mm)
                        sync_mm_rss(mm);

                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
next:
        return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct mm_walk free_walk = {
                .pmd_entry = madvise_free_pte_range,
                .mm = vma->vm_mm,
                .private = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(addr, end, &free_walk);
        tlb_end_vma(tlb, vma);
}

static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        unsigned long start, end;
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        start = max(vma->vm_start, start_addr);
        if (start >= vma->vm_end)
                return -EINVAL;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(mm, start, end);
        madvise_free_page_range(&tlb, vma, start, end);
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);

        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
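/*
 * Illustrative userspace sketch (not kernel code), assuming an anonymous
 * private mapping of "len" bytes at "buf".  An allocator that is done with
 * the data but wants to keep the address range reserved can call
 *
 *	madvise(buf, len, MADV_DONTNEED);
 *
 * after which the next touch faults in fresh zero pages.  Alternatively,
 *
 *	madvise(buf, len, MADV_FREE);
 *
 * lets reclaim discard the pages lazily under memory pressure; writing to
 * a page before that happens cancels the hint for that page.
 */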
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        zap_page_range(vma, start, end - start);
        return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        *prev = vma;
        if (!can_madv_dontneed_vma(vma))
                return -EINVAL;

        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_sem has been dropped, prev is stale */

                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, start);
                if (!vma)
                        return -ENOMEM;
                if (start < vma->vm_start) {
                        /*
                         * This "vma" under revalidation is the one
                         * with the lowest vma->vm_start where start
                         * is also < vma->vm_end.  If start <
                         * vma->vm_start, it means a hole materialized
                         * in the user address space within the
                         * virtual range passed to MADV_DONTNEED
                         * or MADV_FREE.
                         */
                        return -ENOMEM;
                }
                if (!can_madv_dontneed_vma(vma))
                        return -EINVAL;
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end.  If the old
                         * vma was split while the mmap_sem was released,
                         * the concurrent operation does not leave
                         * madvise() with an undefined result; there may
                         * be an adjacent next vma that we'll walk next.
                         * userfaultfd_remove() will generate an
                         * UFFD_EVENT_REMOVE repetition on the
                         * end-vma->vm_end range, but the manager can
                         * handle a repetition fine.
                         */
                        end = vma->vm_end;
                }
                VM_WARN_ON(start >= end);
        }

        if (behavior == MADV_DONTNEED)
                return madvise_dontneed_single_vma(vma, start, end);
        else if (behavior == MADV_FREE)
                return madvise_free_single_vma(vma, start, end);
        else
                return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
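/*
 * For illustration (userspace view, not kernel code): on a MAP_SHARED,
 * writable file mapping, madvise(addr, len, MADV_REMOVE) has roughly the
 * same effect as punching a hole in the corresponding byte range of the
 * backing file, i.e.
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 *
 * which is what the implementation below delegates to via vfs_fallocate().
 */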
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host)
                return -EINVAL;

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_mutex.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_sem.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_sem was not released by userfaultfd_remove() */
                up_read(&current->mm->mmap_sem);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        down_read(&current->mm->mmap_sem);
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        struct page *page;
        struct zone *zone;
        unsigned int order;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        for (; start < end; start += PAGE_SIZE << order) {
                unsigned long pfn;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * When soft offlining hugepages, after migrating the page
                 * we dissolve it, therefore in the second loop "page" will
                 * no longer be a compound page, and order will be 0.
                 */
                order = compound_order(compound_head(page));

                if (PageHWPoison(page)) {
                        put_page(page);
                        continue;
                }

                if (behavior == MADV_SOFT_OFFLINE) {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                        pfn, start);

                        ret = soft_offline_page(page, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }

                pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                pfn, start);

                /*
                 * Drop the page reference taken by get_user_pages_fast(). In
                 * the absence of MF_COUNT_INCREASED the memory_failure()
                 * routine is responsible for pinning the page to prevent it
                 * from being released back to the page allocator.
                 */
                put_page(page);
                ret = memory_failure(pfn, 0);
                if (ret)
                        return ret;
        }

        /* Ensure that all poisoned pages are removed from per-cpu lists */
        for_each_populated_zone(zone)
                drain_all_pages(zone);

        return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static bool
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON:
#endif
                return true;

        default:
                return false;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future.  Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
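/*
 * Illustrative userspace sketch (not kernel code), assuming a file
 * descriptor "fd" for a file of at least "len" bytes: hint that a freshly
 * mapped file will be read sequentially, checking for errors the same way
 * as for any other syscall.
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 *	if (madvise(p, len, MADV_SEQUENTIAL) == -1)
 *		err(1, "madvise");
 *
 * As documented above, the hint is advisory: it never affects the
 * correctness of later accesses to the mapping.
 */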
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;

        if (!madvise_behavior_valid(behavior))
                return error;

        if (start & ~PAGE_MASK)
                return error;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                return error;

        end = start + len;
        if (end < start)
                return error;

        error = 0;
        if (end == start)
                return error;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_inject_error(behavior, start, start + len_in);
#endif

        write = madvise_need_mmap_write(behavior);
        if (write) {
                if (down_write_killable(&current->mm->mmap_sem))
                        return -EINTR;
        } else {
                down_read(&current->mm->mmap_sem);
        }

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end). */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        blk_finish_plug(&plug);
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}