hmm.c

/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
        struct mm_struct *mm;
        spinlock_t lock;
        atomic_t sequence;
        struct list_head ranges;
        struct list_head mirrors;
        struct mmu_notifier mmu_notifier;
        struct rw_semaphore mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
        struct hmm *hmm = READ_ONCE(mm->hmm);
        bool cleanup = false;

        /*
         * The hmm struct can only be freed once the mm_struct goes away,
         * hence we should always have pre-allocated a new hmm struct
         * above.
         */
        if (hmm)
                return hmm;

        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        atomic_set(&hmm->sequence, 0);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->lock);
        hmm->mm = mm;

        spin_lock(&mm->page_table_lock);
        if (!mm->hmm)
                mm->hmm = hmm;
        else
                cleanup = true;
        spin_unlock(&mm->page_table_lock);

        if (cleanup)
                goto error;

        /*
         * We should only get here if we hold the mmap_sem in write mode, i.e.
         * on registration of the first mirror through hmm_mirror_register().
         */
        hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
                goto error_mm;

        return mm->hmm;

error_mm:
        spin_lock(&mm->page_table_lock);
        if (mm->hmm == hmm)
                mm->hmm = NULL;
        spin_unlock(&mm->page_table_lock);
error:
        kfree(hmm);
        return NULL;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
        kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
                                 enum hmm_update_type action,
                                 unsigned long start,
                                 unsigned long end)
{
        struct hmm_mirror *mirror;
        struct hmm_range *range;

        spin_lock(&hmm->lock);
        list_for_each_entry(range, &hmm->ranges, list) {
                unsigned long addr, idx, npages;

                if (end < range->start || start >= range->end)
                        continue;

                range->valid = false;
                addr = max(start, range->start);
                idx = (addr - range->start) >> PAGE_SHIFT;
                npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
                memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
        }
        spin_unlock(&hmm->lock);

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list)
                mirror->ops->sync_cpu_device_pagetables(mirror, action,
                                                        start, end);
        up_read(&hmm->mirrors_sem);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct hmm_mirror *mirror;
        struct hmm *hmm = mm->hmm;

        down_write(&hmm->mirrors_sem);
        mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
                                          list);
        while (mirror) {
                list_del_init(&mirror->list);
                if (mirror->ops->release) {
                        /*
                         * Drop mirrors_sem so callback can wait on any pending
                         * work that might itself trigger mmu_notifier callback
                         * and thus would deadlock with us.
                         */
                        up_write(&hmm->mirrors_sem);
                        mirror->ops->release(mirror);
                        down_write(&hmm->mirrors_sem);
                }
                mirror = list_first_entry_or_null(&hmm->mirrors,
                                                  struct hmm_mirror, list);
        }
        up_write(&hmm->mirrors_sem);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end,
                                      bool blockable)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        atomic_inc(&hmm->sequence);

        return 0;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long end)
{
        struct hmm *hmm = mm->hmm;

        VM_BUG_ON(!hmm);

        hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .release                = hmm_release,
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end   = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

again:
        mirror->hmm = hmm_register(mm);
        if (!mirror->hmm)
                return -ENOMEM;

        down_write(&mirror->hmm->mirrors_sem);
        if (mirror->hmm->mm == NULL) {
                /*
                 * A racing hmm_mirror_unregister() is about to destroy the hmm
                 * struct. Try again to allocate a new one.
                 */
                up_write(&mirror->hmm->mirrors_sem);
                mirror->hmm = NULL;
                goto again;
        } else {
                list_add(&mirror->list, &mirror->hmm->mirrors);
                up_write(&mirror->hmm->mirrors_sem);
        }

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

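/*
 * Example (illustrative sketch only, not part of this file): a device driver
 * typically embeds a struct hmm_mirror in its own per-process structure,
 * provides a sync_cpu_device_pagetables() callback, and registers the mirror
 * with mm->mmap_sem held in write mode. All driver_* names below are
 * hypothetical.
 *
 *      static void driver_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *                                                    enum hmm_update_type update,
 *                                                    unsigned long start,
 *                                                    unsigned long end)
 *      {
 *              struct driver_process *dp;
 *
 *              dp = container_of(mirror, struct driver_process, mirror);
 *              // Tear down device page table entries covering [start, end)
 *              driver_invalidate_device_ptes(dp, start, end);
 *      }
 *
 *      static const struct hmm_mirror_ops driver_mirror_ops = {
 *              .sync_cpu_device_pagetables = driver_sync_cpu_device_pagetables,
 *      };
 *
 *      static int driver_mirror_current_mm(struct driver_process *dp)
 *      {
 *              int ret;
 *
 *              dp->mirror.ops = &driver_mirror_ops;
 *              down_write(&current->mm->mmap_sem);
 *              ret = hmm_mirror_register(&dp->mirror, current->mm);
 *              up_write(&current->mm->mmap_sem);
 *              return ret;
 *      }
 */
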
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        bool should_unregister = false;
        struct mm_struct *mm;
        struct hmm *hmm;

        if (mirror->hmm == NULL)
                return;

        hmm = mirror->hmm;
        down_write(&hmm->mirrors_sem);
        list_del_init(&mirror->list);
        should_unregister = list_empty(&hmm->mirrors);
        mirror->hmm = NULL;
        mm = hmm->mm;
        hmm->mm = NULL;
        up_write(&hmm->mirrors_sem);

        if (!should_unregister || mm == NULL)
                return;

        mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);

        spin_lock(&mm->page_table_lock);
        if (mm->hmm == hmm)
                mm->hmm = NULL;
        spin_unlock(&mm->page_table_lock);

        kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
        struct hmm_range *range;
        unsigned long last;
        bool fault;
        bool block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
                            bool write_fault, uint64_t *pfn)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        vm_fault_t ret;

        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
        ret = handle_mm_fault(vma, addr, flags);
        if (ret & VM_FAULT_RETRY)
                return -EBUSY;
        if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }

        return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = range->values[HMM_PFN_ERROR];

        return 0;
}

/*
 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
                              bool fault, bool write_fault,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, write_fault,
                                               &pfns[i]);
                        if (ret != -EAGAIN)
                                return ret;
                }
        }

        return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                      uint64_t pfns, uint64_t cpu_flags,
                                      bool *fault, bool *write_fault)
{
        struct hmm_range *range = hmm_vma_walk->range;

        *fault = *write_fault = false;
        if (!hmm_vma_walk->fault)
                return;

        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
        /* If this is device memory then only fault if explicitly requested */
        if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
                /* Do we fault on device memory ? */
                if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
                        *write_fault = pfns & range->flags[HMM_PFN_WRITE];
                        *fault = true;
                }
                return;
        }

        /* If CPU page table is not valid then we need to fault */
        *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
        /* Need to write fault ? */
        if ((pfns & range->flags[HMM_PFN_WRITE]) &&
            !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
                *write_fault = true;
                *fault = true;
        }
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                 const uint64_t *pfns, unsigned long npages,
                                 uint64_t cpu_flags, bool *fault,
                                 bool *write_fault)
{
        unsigned long i;

        if (!hmm_vma_walk->fault) {
                *fault = *write_fault = false;
                return;
        }

        for (i = 0; i < npages; ++i) {
                hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
                                   fault, write_fault);
                if ((*fault) || (*write_fault))
                        return;
        }
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        bool fault, write_fault;
        unsigned long i, npages;
        uint64_t *pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        pfns = &range->pfns[i];
        hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                             0, &fault, &write_fault);
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
                              unsigned long addr,
                              unsigned long end,
                              uint64_t *pfns,
                              pmd_t pmd)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        bool fault, write_fault;
        uint64_t cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
                             &fault, &write_fault);

        if (pmd_protnone(pmd) || fault || write_fault)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        pfn = pmd_pfn(pmd) + pte_index(addr);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
                pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
        hmm_vma_walk->last = end;
        return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte))
                return 0;
        return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              uint64_t *pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        bool fault, write_fault;
        uint64_t cpu_flags;
        pte_t pte = *ptep;
        uint64_t orig_pfn = *pfn;

        *pfn = range->values[HMM_PFN_NONE];
        cpu_flags = pte_to_hmm_pfn_flags(range, pte);
        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                           &fault, &write_fault);

        if (pte_none(pte)) {
                if (fault || write_fault)
                        goto fault;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                if (!non_swap_entry(entry)) {
                        if (fault || write_fault)
                                goto fault;
                        return 0;
                }

                /*
                 * This is a special swap entry: handle device private
                 * entries, wait on (or skip) migration entries, and report
                 * anything else as an error.
                 */
                if (is_device_private_entry(entry)) {
                        cpu_flags = range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_DEVICE_PRIVATE];
                        cpu_flags |= is_write_device_private_entry(entry) ?
                                range->flags[HMM_PFN_WRITE] : 0;
                        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
                        *pfn |= cpu_flags;
                        return 0;
                }

                if (is_migration_entry(entry)) {
                        if (fault || write_fault) {
                                pte_unmap(ptep);
                                hmm_vma_walk->last = addr;
                                migration_entry_wait(vma->vm_mm,
                                                     pmdp, addr);
                                return -EAGAIN;
                        }
                        return 0;
                }

                /* Report error for everything else */
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }

        if (fault || write_fault)
                goto fault;

        *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
        return 0;

fault:
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;

        i = (addr - range->start) >> PAGE_SHIFT;

again:
        if (pmd_none(*pmdp))
                return hmm_vma_walk_hole(start, end, walk);

        if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
                pmd_t pmd;

                /*
                 * No need to take the pmd_lock here: even if some other
                 * thread is splitting the huge pmd we will get that event
                 * through the mmu_notifier callback.
                 *
                 * So just read the pmd value, check again that it is a
                 * transparent huge or device mapping, and compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
        }

        if (pmd_bad(*pmdp))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
                if (r) {
                        /* hmm_vma_handle_pte() did unmap pte directory */
                        hmm_vma_walk->last = addr;
                        return r;
                }
        }
        pte_unmap(ptep - 1);

        hmm_vma_walk->last = addr;
        return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
        unsigned long addr = range->start, i = 0;

        for (; addr < range->end; addr += PAGE_SIZE, i++)
                range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 * vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
        struct vm_area_struct *vma = range->vma;
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;

        /* Sanity check, this really should not happen ! */
        if (range->start < vma->vm_start || range->start >= vma->vm_end)
                return -EINVAL;
        if (range->end < vma->vm_start || range->end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm)
                return -ENOMEM;
        /* Caller must have registered a mirror, via hmm_mirror_register() ! */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
            vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }

        if (!(vma->vm_flags & VM_READ)) {
                /*
                 * If the vma does not allow read access, then assume that it
                 * does not allow write access either. Architectures that
                 * allow write without read access are not supported by HMM,
                 * because operations such as atomic access would not work.
                 */
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -EPERM;
        }

        /* Initialize range to track CPU page table update */
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = false;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        walk_page_range(range->start, range->end, &mm_walk);
        return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that device driver must still implement general CPU page table update
 * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
 * the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this :
 *
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
        unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
        struct hmm *hmm;

        if (range->end <= range->start) {
                BUG();
                return false;
        }

        hmm = hmm_register(range->vma->vm_mm);
        if (!hmm) {
                memset(range->pfns, 0, sizeof(*range->pfns) * npages);
                return false;
        }

        spin_lock(&hmm->lock);
        list_del_rcu(&range->list);
        spin_unlock(&hmm->lock);

        return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

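/*
 * Example (illustrative sketch only, not part of this file): a concrete
 * version of the first pattern above. The driver_* names are hypothetical;
 * the hmm_range fields filled in here follow the definition in
 * include/linux/hmm.h for this kernel, with the driver supplying its own
 * pfn flags/values encoding. The caller is expected to hold mm->mmap_sem
 * for read around hmm_vma_get_pfns().
 *
 *      static int driver_snapshot_vma(struct driver_process *dp,
 *                                     struct vm_area_struct *vma,
 *                                     unsigned long start, unsigned long end,
 *                                     uint64_t *pfns)
 *      {
 *              struct hmm_range range;
 *              int ret;
 *
 *              range.vma = vma;
 *              range.start = start;
 *              range.end = end;
 *              range.pfns = pfns;
 *              range.flags = driver_hmm_flags;
 *              range.values = driver_hmm_values;
 *              range.pfn_shift = DRIVER_HMM_PFN_SHIFT;
 *
 *      again:
 *              ret = hmm_vma_get_pfns(&range);
 *              if (ret)
 *                      return ret;
 *
 *              driver_lock_device_page_table(dp);
 *              if (!hmm_vma_range_done(&range)) {
 *                      driver_unlock_device_page_table(dp);
 *                      goto again;
 *              }
 *              driver_update_device_page_table(dp, &range);
 *              driver_unlock_device_page_table(dp);
 *              return 0;
 *      }
 */
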
/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem)
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem)
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
        struct vm_area_struct *vma = range->vma;
        unsigned long start = range->start;
        struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;
        int ret;

        /* Sanity check, this really should not happen ! */
        if (range->start < vma->vm_start || range->start >= vma->vm_end)
                return -EINVAL;
        if (range->end < vma->vm_start || range->end > vma->vm_end)
                return -EINVAL;

        hmm = hmm_register(vma->vm_mm);
        if (!hmm) {
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -ENOMEM;
        }
        /* Caller must have registered a mirror using hmm_mirror_register() */
        if (!hmm->mmu_notifier.ops)
                return -EINVAL;

        /* FIXME support hugetlb fs */
        if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
            vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }

        if (!(vma->vm_flags & VM_READ)) {
                /*
                 * If the vma does not allow read access, then assume that it
                 * does not allow write access either. Architectures that
                 * allow write without read access are not supported by HMM,
                 * because operations such as atomic access would not work.
                 */
                hmm_pfns_clear(range, range->pfns, range->start, range->end);
                return -EPERM;
        }

        /* Initialize range to track CPU page table update */
        spin_lock(&hmm->lock);
        range->valid = true;
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);

        hmm_vma_walk.fault = true;
        hmm_vma_walk.block = block;
        hmm_vma_walk.range = range;
        mm_walk.private = &hmm_vma_walk;
        hmm_vma_walk.last = range->start;

        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
        mm_walk.pmd_entry = hmm_vma_walk_pmd;
        mm_walk.pte_hole = hmm_vma_walk_hole;

        do {
                ret = walk_page_range(start, range->end, &mm_walk);
                start = hmm_vma_walk.last;
        } while (ret == -EAGAIN);

        if (ret) {
                unsigned long i;

                i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
                               range->end);
                hmm_vma_range_done(range);
        }
        return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);

#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
                                       unsigned long addr)
{
        struct page *page;

        page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
        if (!page)
                return NULL;
        lock_page(page);
        return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
        struct percpu_ref *ref = data;
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        percpu_ref_exit(ref);
}

static void hmm_devmem_ref_kill(void *data)
{
        struct percpu_ref *ref = data;
        struct hmm_devmem *devmem;

        devmem = container_of(ref, struct hmm_devmem, ref);
        percpu_ref_kill(ref);
        wait_for_completion(&devmem->completion);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
                            unsigned long addr,
                            const struct page *page,
                            unsigned int flags,
                            pmd_t *pmdp)
{
        struct hmm_devmem *devmem = page->pgmap->data;

        return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
        struct hmm_devmem *devmem = data;

        page->mapping = NULL;

        devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
        resource_size_t key;

        mutex_lock(&hmm_devmem_lock);
        for (key = resource->start;
             key <= resource->end;
             key += PA_SECTION_SIZE)
                radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
        mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(void *data)
{
        struct hmm_devmem *devmem = data;
        struct resource *resource = devmem->resource;
        unsigned long start_pfn, npages;
        struct page *page;
        int nid;

        /* pages are dead and unused, undo the arch mapping */
        start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
        npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

        page = pfn_to_page(start_pfn);
        nid = page_to_nid(page);

        mem_hotplug_begin();
        if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
                __remove_pages(start_pfn, npages, NULL);
        else
                arch_remove_memory(nid, start_pfn << PAGE_SHIFT,
                                   npages << PAGE_SHIFT, NULL);
        mem_hotplug_done();

        hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
        resource_size_t key, align_start, align_size, align_end;
        struct device *device = devmem->device;
        int ret, nid, is_ram;
        unsigned long pfn;

        align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
        align_size = ALIGN(devmem->resource->start +
                           resource_size(devmem->resource),
                           PA_SECTION_SIZE) - align_start;

        is_ram = region_intersects(align_start, align_size,
                                   IORESOURCE_SYSTEM_RAM,
                                   IORES_DESC_NONE);
        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "%s attempted on mixed region %pr\n",
                          __func__, devmem->resource);
                return -ENXIO;
        }
        if (is_ram == REGION_INTERSECTS)
                return -ENXIO;

        if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
                devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
        else
                devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

        devmem->pagemap.res = *devmem->resource;
        devmem->pagemap.page_fault = hmm_devmem_fault;
        devmem->pagemap.page_free = hmm_devmem_free;
        devmem->pagemap.dev = devmem->device;
        devmem->pagemap.ref = &devmem->ref;
        devmem->pagemap.data = devmem;

        mutex_lock(&hmm_devmem_lock);
        align_end = align_start + align_size - 1;
        for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
                struct hmm_devmem *dup;

                dup = radix_tree_lookup(&hmm_devmem_radix,
                                        key >> PA_SECTION_SHIFT);
                if (dup) {
                        dev_err(device, "%s: collides with mapping for %s\n",
                                __func__, dev_name(dup->device));
                        mutex_unlock(&hmm_devmem_lock);
                        ret = -EBUSY;
                        goto error;
                }
                ret = radix_tree_insert(&hmm_devmem_radix,
                                        key >> PA_SECTION_SHIFT,
                                        devmem);
                if (ret) {
                        dev_err(device, "%s: failed: %d\n", __func__, ret);
                        mutex_unlock(&hmm_devmem_lock);
                        goto error_radix;
                }
        }
        mutex_unlock(&hmm_devmem_lock);

        nid = dev_to_node(device);
        if (nid < 0)
                nid = numa_mem_id();

        mem_hotplug_begin();
        /*
         * For device private memory we call add_pages() as we only need to
         * allocate and initialize struct page for the device memory. Moreover
         * the device memory is inaccessible, so we do not want to create a
         * linear mapping for the memory like arch_add_memory() would do.
         *
         * For device public memory, which is accessible by the CPU, we do
         * want the linear mapping and thus use arch_add_memory().
         */
        if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
                ret = arch_add_memory(nid, align_start, align_size, NULL,
                                      false);
        else
                ret = add_pages(nid, align_start >> PAGE_SHIFT,
                                align_size >> PAGE_SHIFT, NULL, false);
        if (ret) {
                mem_hotplug_done();
                goto error_add_memory;
        }
        move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
                               align_start >> PAGE_SHIFT,
                               align_size >> PAGE_SHIFT, NULL);
        mem_hotplug_done();

        for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
                struct page *page = pfn_to_page(pfn);

                page->pgmap = &devmem->pagemap;
        }
        return 0;

error_add_memory:
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
        hmm_devmem_radix_release(devmem->resource);
error:
        return ret;
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization and
 * is then responsible for memory management. HMM only provides helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
                                  struct device *device,
                                  unsigned long size)
{
        struct hmm_devmem *devmem;
        resource_size_t addr;
        int ret;

        dev_pagemap_get_ops();

        devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
        if (!devmem)
                return ERR_PTR(-ENOMEM);

        init_completion(&devmem->completion);
        devmem->pfn_first = -1UL;
        devmem->pfn_last = -1UL;
        devmem->resource = NULL;
        devmem->device = device;
        devmem->ops = ops;

        ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
                              0, GFP_KERNEL);
        if (ret)
                return ERR_PTR(ret);

        ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
        if (ret)
                return ERR_PTR(ret);

        size = ALIGN(size, PA_SECTION_SIZE);
        addr = min((unsigned long)iomem_resource.end,
                   (1UL << MAX_PHYSMEM_BITS) - 1);
        addr = addr - size + 1UL;

        /*
         * FIXME add a new helper to quickly walk resource tree and find free
         * range
         *
         * FIXME what about ioport_resource resource ?
         */
        for (; addr > size && addr >= iomem_resource.start; addr -= size) {
                ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
                if (ret != REGION_DISJOINT)
                        continue;

                devmem->resource = devm_request_mem_region(device, addr, size,
                                                           dev_name(device));
                if (!devmem->resource)
                        return ERR_PTR(-ENOMEM);
                break;
        }
        if (!devmem->resource)
                return ERR_PTR(-ERANGE);

        devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
        devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
        devmem->pfn_last = devmem->pfn_first +
                           (resource_size(devmem->resource) >> PAGE_SHIFT);

        ret = hmm_devmem_pages_create(devmem);
        if (ret)
                return ERR_PTR(ret);

        ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
        if (ret)
                return ERR_PTR(ret);

        return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add);

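/*
 * Example (illustrative sketch only, not part of this file): hotplugging
 * device private memory at driver probe time. The driver_* names are
 * hypothetical, and the callback prototypes simply mirror the
 * hmm_devmem_fault()/hmm_devmem_free() wrappers above; check struct
 * hmm_devmem_ops in include/linux/hmm.h for the exact signatures.
 *
 *      static int driver_devmem_fault(struct hmm_devmem *devmem,
 *                                     struct vm_area_struct *vma,
 *                                     unsigned long addr,
 *                                     const struct page *page,
 *                                     unsigned int flags,
 *                                     pmd_t *pmdp)
 *      {
 *              // Migrate the page back to system memory to resolve the fault
 *              return driver_migrate_to_ram(devmem, vma, addr, page, pmdp);
 *      }
 *
 *      static void driver_devmem_free(struct hmm_devmem *devmem,
 *                                     struct page *page)
 *      {
 *              // Give the backing device page back to the driver's allocator
 *              driver_free_device_page(devmem, page);
 *      }
 *
 *      static const struct hmm_devmem_ops driver_devmem_ops = {
 *              .fault = driver_devmem_fault,
 *              .free = driver_devmem_free,
 *      };
 *
 *      static int driver_probe_memory(struct driver_device *ddev,
 *                                     unsigned long size)
 *      {
 *              struct hmm_devmem *devmem;
 *
 *              devmem = hmm_devmem_add(&driver_devmem_ops, ddev->dev, size);
 *              if (IS_ERR(devmem))
 *                      return PTR_ERR(devmem);
 *              ddev->devmem = devmem;
 *              return 0;
 *      }
 */
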
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
                                           struct device *device,
                                           struct resource *res)
{
        struct hmm_devmem *devmem;
        int ret;

        if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
                return ERR_PTR(-EINVAL);

        dev_pagemap_get_ops();

        devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
        if (!devmem)
                return ERR_PTR(-ENOMEM);

        init_completion(&devmem->completion);
        devmem->pfn_first = -1UL;
        devmem->pfn_last = -1UL;
        devmem->resource = res;
        devmem->device = device;
        devmem->ops = ops;

        ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
                              0, GFP_KERNEL);
        if (ret)
                return ERR_PTR(ret);

        ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
                                       &devmem->ref);
        if (ret)
                return ERR_PTR(ret);

        devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
        devmem->pfn_last = devmem->pfn_first +
                           (resource_size(devmem->resource) >> PAGE_SHIFT);

        ret = hmm_devmem_pages_create(devmem);
        if (ret)
                return ERR_PTR(ret);

        ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
        if (ret)
                return ERR_PTR(ret);

        ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill,
                                       &devmem->ref);
        if (ret)
                return ERR_PTR(ret);

        return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not required in order to use any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
        struct hmm_device *hmm_device;

        hmm_device = container_of(device, struct hmm_device, device);
        spin_lock(&hmm_device_lock);
        clear_bit(hmm_device->minor, hmm_device_mask);
        spin_unlock(&hmm_device_lock);

        kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
        struct hmm_device *hmm_device;

        hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
        if (!hmm_device)
                return ERR_PTR(-ENOMEM);

        spin_lock(&hmm_device_lock);
        hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
        if (hmm_device->minor >= HMM_DEVICE_MAX) {
                spin_unlock(&hmm_device_lock);
                kfree(hmm_device);
                return ERR_PTR(-EBUSY);
        }
        set_bit(hmm_device->minor, hmm_device_mask);
        spin_unlock(&hmm_device_lock);

        dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
        hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
                                        hmm_device->minor);
        hmm_device->device.release = hmm_device_release;
        dev_set_drvdata(&hmm_device->device, drvdata);
        hmm_device->device.class = hmm_device_class;
        device_initialize(&hmm_device->device);

        return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
        put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

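/*
 * Example (illustrative sketch only, not part of this file): a driver that
 * manages memory for several physical devices can create one fake hmm_device
 * to stand in for all of them and drop it with hmm_device_put() on teardown.
 * struct driver_state and its members are hypothetical.
 *
 *      static int driver_init_fake_device(struct driver_state *state)
 *      {
 *              struct hmm_device *hdev;
 *
 *              hdev = hmm_device_new(state);
 *              if (IS_ERR(hdev))
 *                      return PTR_ERR(hdev);
 *              state->hmm_device = hdev;
 *              return 0;
 *      }
 *
 *      static void driver_fini_fake_device(struct driver_state *state)
 *      {
 *              hmm_device_put(state->hmm_device);
 *              state->hmm_device = NULL;
 *      }
 */
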
static int __init hmm_init(void)
{
        int ret;

        ret = alloc_chrdev_region(&hmm_device_devt, 0,
                                  HMM_DEVICE_MAX,
                                  "hmm_device");
        if (ret)
                return ret;

        hmm_device_class = class_create(THIS_MODULE, "hmm_device");
        if (IS_ERR(hmm_device_class)) {
                unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
                return PTR_ERR(hmm_device_class);
        }
        return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */