pgtable.c

/*
 *  Copyright IBM Corp. 2007,2009
 *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

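/*
 * A crst (region/segment) table occupies 2^ALLOC_ORDER pages: 8K on
 * 31-bit, 16K on 64-bit. FRAG_MASK has one bit per page table fragment
 * that fits into a 4K page: four 1K tables on 31-bit, two 2K tables on
 * 64-bit.
 */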
#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

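/*
 * The "vmalloc=<size>" early parameter lowers VMALLOC_START so that the
 * requested amount of vmalloc space is reserved below VMALLOC_END.
 */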
static int __init parse_vmalloc(char *arg)
{
        if (!arg)
                return -EINVAL;
        VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
        return 0;
}
early_param("vmalloc", parse_vmalloc);

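/*
 * Allocate / free one crst (region or segment) table of 2^ALLOC_ORDER
 * pages. page_to_phys() can be returned as a pointer because kernel
 * memory is identity-mapped on s390.
 */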
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
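/*
 * Extend the address space of an mm by stacking additional region table
 * levels on top of the current page table root until asce_limit reaches
 * at least the requested limit: 2^42 with a region-third table, 2^53
 * with a region-second table. A raced upgrade is retried.
 */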
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        update_mm(mm, current);
        return 0;
}

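/*
 * Shrink the address space of an mm by removing region table levels from
 * the top of the page table hierarchy until asce_limit is no larger than
 * the requested limit. The TLB is flushed before the tables are freed.
 */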
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (mm->context.asce_limit <= limit)
                return;
        __tlb_flush_mm(mm);
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        update_mm(mm, current);
}
#endif

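/*
 * Atomically toggle the given bits in *v and return the new value.
 * page->_mapcount of a page table page is reused as a bitmap of
 * allocated and pending-free fragments, so updates must be atomic.
 */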
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
        unsigned int old, new;

        do {
                old = atomic_read(v);
                new = old ^ bits;
        } while (atomic_cmpxchg(v, old, new) != old);
        return new;
}

/*
 * page table entry allocation/free routines.
 */
#ifdef CONFIG_PGSTE
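/*
 * With CONFIG_PGSTE (needed for KVM) every page table gets a full 4K
 * page: the lower half holds the 256 ptes, the upper half the matching
 * pgste entries. Both fragment bits are marked as used (_mapcount = 3).
 */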
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;

        page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
        if (!page)
                return NULL;
        pgtable_page_ctor(page);
        atomic_set(&page->_mapcount, 3);
        table = (unsigned long *) page_to_phys(page);
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
        return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
        struct page *page;

        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        pgtable_page_dtor(page);
        atomic_set(&page->_mapcount, -1);
        __free_page(page);
}
#endif

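/*
 * Without pgstes, 4K pages are carved up into 1K/2K page table fragments.
 * Pages that still have free fragments are kept on the per-mm
 * context.pgtable_list; the bits of page->_mapcount record which
 * fragments are in use.
 */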
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;
        unsigned int mask, bit;

#ifdef CONFIG_PGSTE
        if (mm_has_pgste(mm))
                return page_table_alloc_pgste(mm);
#endif
        /* Allocate fragments of a 4K page as 1K/2K page table */
        spin_lock_bh(&mm->context.list_lock);
        mask = FRAG_MASK;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                table = (unsigned long *) page_to_phys(page);
                mask = atomic_read(&page->_mapcount);
                mask = mask | (mask >> 4);
        }
        if ((mask & FRAG_MASK) == FRAG_MASK) {
                spin_unlock_bh(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                atomic_set(&page->_mapcount, 1);
                table = (unsigned long *) page_to_phys(page);
                clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        } else {
                for (bit = 1; mask & bit; bit <<= 1)
                        table += PTRS_PER_PTE;
                mask = atomic_xor_bits(&page->_mapcount, bit);
                if ((mask & FRAG_MASK) == FRAG_MASK)
                        list_del(&page->lru);
        }
        spin_unlock_bh(&mm->context.list_lock);
        return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned int bit, mask;

#ifdef CONFIG_PGSTE
        if (mm_has_pgste(mm))
                return page_table_free_pgste(table);
#endif
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit);
        if (mask & FRAG_MASK)
                list_add(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        if (mask == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

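/*
 * RCU-delayed freeing of page table fragments. page_table_free_rcu()
 * marks a fragment as pending by setting its bit in the upper nibble of
 * page->_mapcount and hands an encoded pointer to the mmu_gather batch;
 * __page_table_free_rcu() clears the pending bit after the grace period
 * and releases the page once all fragments are gone.
 */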
static void __page_table_free_rcu(void *table, unsigned bit)
{
        struct page *page;

#ifdef CONFIG_PGSTE
        if (bit == FRAG_MASK)
                return page_table_free_pgste(table);
#endif
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
        struct mm_struct *mm;
        struct page *page;
        unsigned int bit, mask;

        mm = tlb->mm;
#ifdef CONFIG_PGSTE
        if (mm_has_pgste(mm)) {
                table = (unsigned long *) (__pa(table) | FRAG_MASK);
                tlb_remove_table(tlb, table);
                return;
        }
#endif
        bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
        if (mask & FRAG_MASK)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *) (__pa(table) | (bit << 4));
        tlb_remove_table(tlb, table);
}

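/*
 * The low bits of a batched table pointer encode what is being freed:
 * zero for a full crst table, the shifted fragment bit for a 1K/2K page
 * table fragment, FRAG_MASK for a pgste page. Page table fragments are
 * at least 1K aligned, so these bits are always available.
 */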
void __tlb_remove_table(void *_table)
{
        const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
        void *table = (void *)((unsigned long) _table & ~mask);
        unsigned type = (unsigned long) _table & mask;

        if (type)
                __page_table_free_rcu(table, type);
        else
                free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely
         * on IRQ disabling. See the comment near struct mmu_table_batch.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
        __tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;
        int i;

        batch = container_of(head, struct mmu_table_batch, rcu);
        for (i = 0; i < batch->nr; i++)
                __tlb_remove_table(batch->tables[i]);
        free_page((unsigned long)batch);
}

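/*
 * Table pointers are collected in a page-sized mmu_table_batch and freed
 * after an RCU-sched grace period. If no batch page can be allocated the
 * mm is flushed and the table is freed immediately, synchronized against
 * concurrent walkers with an IPI broadcast (tlb_remove_table_one).
 */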
void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                __tlb_flush_mm(tlb->mm);
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)
                        __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
                        __tlb_flush_mm(tlb->mm);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have a switched amode? If not, we cannot do sie */
        if (user_mode == HOME_SPACE_MODE)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (mm_has_pgste(tsk->mm))
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

        /* Now let's check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* OK, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        atomic_inc(&mm->context.attach_count);
        atomic_dec(&old_mm->context.attach_count);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

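/*
 * Used by hibernation with CONFIG_DEBUG_PAGEALLOC to decide whether a
 * kernel page is currently mapped: LRA (load real address) sets
 * condition code 0 only if the address translates.
 */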
#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
        unsigned long addr;
        int cc;

        addr = page_to_phys(page);
        asm volatile(
                " lra %1,0(%1)\n"
                " ipm %0\n"
                " srl %0,28"
                : "=d" (cc), "+a" (addr) : : "cc");
        return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */