// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>
static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static inline int pfn_to_block_id(unsigned long pfn)
{
	return base_memory_block_id(pfn_to_section_nr(pfn));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};
static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
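
/*
 * Example (a minimal sketch of a hypothetical client, not part of this
 * file): a notifier that vetoes offlining. The callback is invoked for
 * events such as MEM_GOING_OFFLINE and receives a struct memory_notify
 * describing the affected range.
 *
 *	static int example_mem_notify(struct notifier_block *nb,
 *				      unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_GOING_OFFLINE && mn->nr_pages)
 *			return NOTIFY_BAD;	(veto the offline)
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_notify,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */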
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/*
	 * Validate block_sz is a power of 2 and not less than section
	 * size: (x & (x - 1)) == 0 only when x is a power of two.
	 */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
/*
 * Show the first physical section index of this memory block, scaled
 * down to a block index.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
					 struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}
/*
 * Show whether the memory block is likely to be hot-removable. The block
 * counts as removable only if every present section in it is removable.
 */
static ssize_t show_mem_removable(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state != MEM_ONLINE)
		goto out;

	for (i = 0; i < sections_per_block; i++) {
		if (!present_section_nr(mem->start_section_nr + i))
			continue;
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

out:
	return sprintf(buf, "%d\n", ret);
}
/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
			      mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}
/*
 * The probe routines leave the pages uninitialized, just as the bootmem code
 * does. Make sure we do not access them, but instead use only information from
 * within sections.
 */
static bool pages_correctly_probed(unsigned long start_pfn)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	unsigned long section_nr_end = section_nr + sections_per_block;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We look up the page once per section
	 * and assume memmap is contiguous within each section.
	 */
	for (; section_nr < section_nr_end; section_nr++) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;

		if (!present_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) not present\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (!valid_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (online_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) is already online\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		}
		pfn += PAGES_PER_SECTION;
	}

	return true;
}
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long start_section_nr, unsigned long action,
		    int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(start_section_nr);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_probed(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, "%s(%ld, %ld) unknown action: %ld\n",
		     __func__, start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}
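
/*
 * State transitions driven by memory_block_change_state() below:
 *
 *	MEM_OFFLINE -> MEM_ONLINE                       (online)
 *	MEM_ONLINE  -> MEM_GOING_OFFLINE -> MEM_OFFLINE (offline)
 *
 * On failure the block is rolled back to the requested "from" state.
 */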
static int memory_block_change_state(struct memory_block *mem,
				     unsigned long to_state,
				     unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				  mem->online_type);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}
/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * If we are called from store_mem_state(), online_type will be
	 * set to >= 0. Otherwise we were called from the device online
	 * attribute and need to set the online_type.
	 */
	if (mem->online_type < 0)
		mem->online_type = MMOP_ONLINE_KEEP;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

	/* clear online_type */
	mem->online_type = -1;

	return ret;
}
static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	/* Can't offline block with non-present sections */
	if (mem->section_count != sections_per_block)
		return -EINVAL;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret, online_type;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	if (sysfs_streq(buf, "online_kernel"))
		online_type = MMOP_ONLINE_KERNEL;
	else if (sysfs_streq(buf, "online_movable"))
		online_type = MMOP_ONLINE_MOVABLE;
	else if (sysfs_streq(buf, "online"))
		online_type = MMOP_ONLINE_KEEP;
	else if (sysfs_streq(buf, "offline"))
		online_type = MMOP_OFFLINE;
	else {
		ret = -EINVAL;
		goto err;
	}

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE_KEEP:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

err:
	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
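
/*
 * Example (userspace view of the state attribute above; hypothetical
 * block number):
 *
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# echo offline > /sys/devices/system/memory/memory32/state
 */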
/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU:
 * i.e. do these ranges belong to the same physical device,
 * such that if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sprintf(buf, "%d\n", mem->phys_device);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
			       unsigned long nr_pages, int online_type,
			       struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone != default_zone) {
		strcat(buf, " ");
		strcat(buf, zone->name);
	}
}
static ssize_t show_valid_zones(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long valid_start_pfn, valid_end_pfn;
	struct zone *default_zone;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages,
					  &valid_start_pfn, &valid_end_pfn))
			return sprintf(buf, "none\n");
		start_pfn = valid_start_pfn;
		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
	strcat(buf, default_zone->name);

	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
			   default_zone);
	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
			   default_zone);
out:
	strcat(buf, "\n");

	return strlen(buf);
}
static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
#endif
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
/*
 * Memory auto online policy.
 */
static ssize_t
show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	if (memhp_auto_online)
		return sprintf(buf, "online\n");
	else
		return sprintf(buf, "offline\n");
}

static ssize_t
store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	if (sysfs_streq(buf, "online"))
		memhp_auto_online = true;
	else if (sysfs_streq(buf, "offline"))
		memhp_auto_online = false;
	else
		return -EINVAL;

	return count;
}

static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
		   store_auto_online_blocks);
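
/*
 * Example (userspace): make newly added memory blocks come up online
 * automatically:
 *
 *	# echo online > /sys/devices/system/memory/auto_online_blocks
 */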
/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#endif
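
/*
 * Example (userspace, hypothetical address): probe memory at a physical
 * address. pages_per_block << PAGE_SHIFT is the block size in bytes, so
 * the mask above rejects any address that is not block-aligned:
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 */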
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
	if (!pfn_to_online_page(pfn))
		return -EIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif
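
/*
 * Example (userspace, hypothetical physical address): both attributes
 * take a byte address, which is shifted down to a pfn before use:
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/soft_offline_page
 */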
/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
static struct memory_block *find_memory_block_by_id(int block_id,
						    struct memory_block *hint)
{
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return to_memory_block(dev);
}

struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id, hint);
}
/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
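
/*
 * Example (a hypothetical caller): the lookup takes a reference on the
 * returned block's device, which the caller must drop when done:
 *
 *	struct memory_block *mem = find_memory_block(section);
 *
 *	if (mem) {
 *		pr_info("block starts at section %lu\n",
 *			mem->start_section_nr);
 *		put_device(&mem->dev);
 *	}
 */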
static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};
/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret)
		put_device(&memory->dev);

	return ret;
}
static int init_memory_block(struct memory_block **memory, int block_id,
			     unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int ret = 0;

	mem = find_memory_block_by_id(block_id, NULL);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);
	mem->nid = NUMA_NO_NODE;

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}
static int add_memory_block(int base_section_nr)
{
	struct memory_block *mem;
	int i, ret, section_count = 0;

	for (i = base_section_nr;
	     i < base_section_nr + sections_per_block;
	     i++)
		if (present_section_nr(i))
			section_count++;

	if (section_count == 0)
		return 0;
	ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
				MEM_ONLINE);
	if (ret)
		return ret;
	mem->section_count = section_count;
	return 0;
}
static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}
/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
	int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	mutex_lock(&mem_sysfs_mutex);
	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
		if (ret)
			break;
		mem->section_count = sections_per_block;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id, NULL);
			mem->section_count = 0;
			unregister_memory(mem);
		}
	}
	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}
/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	int block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	mutex_lock(&mem_sysfs_mutex);
	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id, NULL);
		if (WARN_ON_ONCE(!mem))
			continue;
		mem->section_count = 0;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
	mutex_unlock(&mem_sysfs_mutex);
}
/* Return true if the memory block is offlined; otherwise return false. */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	mutex_lock(&mem_sysfs_mutex);
	for (i = 0; i <= __highest_present_section_nr;
	     i += sections_per_block) {
		err = add_memory_block(i);
		if (!ret)
			ret = err;
	}
	mutex_unlock(&mem_sysfs_mutex);

out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}