/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption,
 * but here we only expose the energy consumption free-running
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules,
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 * pp0 counter: consumption of all physical cores (power plane 0)
 * event: rapl_energy_cores
 * perf code: 0x1
 *
 * pkg counter: consumption of the whole processor package
 * event: rapl_energy_pkg
 * perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 * event: rapl_energy_dram
 * perf code: 0x3
 *
 * pp1 counter: consumption of the built-in GPU domain (clients only)
 * event: rapl_energy_gpu
 * perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must convert the raw count to Joules, e.g. with
 * ldexp(raw_count, -32), and divide by the measurement duration
 * to obtain Watts.
 */
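/*
 * For illustration only, a minimal user-space sketch of the conversion
 * described above. It is not part of this driver; 'raw_count' and
 * 'seconds' are assumed to come from a counting read of one of the
 * events below over a known wall-clock interval.
 *
 *	#include <math.h>	// ldexp()
 *	#include <stdint.h>
 *
 *	// raw_count: 32.32 fixed-point energy value read from the event
 *	// seconds:   duration of the measurement window
 *	static double rapl_watts(uint64_t raw_count, double seconds)
 *	{
 *		double joules = ldexp((double)raw_count, -32);
 *		return joules / seconds;
 *	}
 *
 * Or simply use the tool-level interface once this PMU is registered:
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 */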
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"
/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */

#define NR_RAPL_DOMAINS		0x4
static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
};
/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)
/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

#define RAPL_EVENT_ATTR_STR(_name, v, str)			\
static struct perf_pmu_events_attr event_attr_##v = {		\
	.attr		= __ATTR(_name, 0444, rapl_sysfs_show, NULL),	\
	.id		= 0,					\
	.event_str	= str,					\
};
struct rapl_pmu {
	spinlock_t	 lock;
	int		 n_active; /* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
	ktime_t		 timer_interval; /* in ktime_t unit */
	struct hrtimer	 hrtimer;
};

static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
static struct x86_pmu_quirk *rapl_quirks;
static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

#define rapl_add_quirk(func_)						\
do {									\
	static struct x86_pmu_quirk __quirk __initdata = {		\
		.func = func_,						\
	};								\
	__quirk.next = rapl_quirks;					\
	rapl_quirks = &__quirk;						\
} while (0)
static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (2^-32 Joules)
	 * users must then scale back: count * 2^-32 to get Joules,
	 * i.e. ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
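/*
 * Worked example of the scaling above (values assumed for
 * illustration): on a part whose native unit is 2^-16 Joules
 * (rapl_hw_unit == 16), a raw delta of 0x100 becomes
 * 0x100 << (32 - 16) == 0x1000000, i.e. the same energy expressed
 * in 2^-32 Joule increments.
 */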
static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
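/*
 * The shift pair above handles 32-bit counter wraparound without a
 * conditional. Illustrative values: with shift == 32,
 * prev_raw_count == 0xffffffff and new_raw_count == 0x2 give
 * (0x2 << 32) - (0xffffffff << 32) == 0x3 << 32 in 64-bit modular
 * arithmetic, and the signed right shift back down yields a delta
 * of 3, as expected across the wrap.
 */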
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_cancel(&pmu->hrtimer);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry) {
		rapl_event_update(event);
	}

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			rapl_stop_hrtimer(pmu);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
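/*
 * For illustration (assumed user-space usage, not part of this file):
 * opening the pkg-energy event that the switch above validates. The
 * dynamic PMU type must be read from sysfs at runtime.
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct perf_event_attr attr = {
 *		.type	= pmu_type, // from /sys/bus/event_source/devices/power/type
 *		.config	= 0x2,	    // INTEL_RAPL_PKG, i.e. power/energy-pkg/
 *		.size	= sizeof(attr),
 *	};
 *	// system-wide counting: pid == -1, one fd on the designated cpu
 *	int fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
 *
 * Setting any exclude_* bit or a sample_period makes the open fail
 * with -EINVAL, as enforced above.
 */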
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

static ssize_t rapl_sysfs_show(struct device *dev,
			       struct device_attribute *attr,
			       char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return 0;
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg,   rapl_pkg,   "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram,   rapl_ram,   "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu,   rapl_gpu,   "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit,   rapl_pkg_unit,   "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit,   rapl_ram_unit,   "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit,   rapl_gpu_unit,   "Joules");
/*
 * we compute in ~0.23 nJ increments (2^-32 Joules) regardless of the MSR unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,   rapl_pkg_scale,   "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,   rapl_ram_scale,   "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,   rapl_gpu_scale,   "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");

static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};
static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask; if it was set and another cpu
	 * remains on the package, designate that cpu instead
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for CPU */
	rapl_stop_hrtimer(pmu);
}
static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
static __init void rapl_hsw_server_quirk(void)
{
	/*
	 * DRAM domain on HSW server has a fixed energy unit which can be
	 * different from the unit reported by the power unit MSR.
	 * See "Intel Xeon Processor E5-1600 and E5-2600 v3 Product
	 * Families, Volume 2 of 2, Datasheet, September 2014,
	 * Reference Number: 330784-001".
	 */
	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}
static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);
	u64 ms;

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	pmu->pmu = &rapl_pmu_class;

	/*
	 * use reference of 200W for scaling the timeout
	 * to avoid missing counter overflows.
	 * 200W = 200 Joules/sec
	 * divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	if (rapl_hw_unit[0] < 32)
		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
	else
		ms = 2;

	pmu->timer_interval = ms_to_ktime(ms);

	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;

	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}
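/*
 * Worked example of the interval above (illustrative values): with the
 * common 2^-16 Joule unit (rapl_hw_unit[0] == 16), a 32-bit counter
 * wraps after 2^32 * 2^-16 = 65536 Joules, i.e. ~327 s at the 200 W
 * reference. The formula yields ms = 5 * 2^15 = 163840 ms, half of that
 * wrap time, so the hrtimer always samples well before an overflow is
 * missed.
 */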
static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}
static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
static int rapl_check_hw_unit(void)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	return 0;
}
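/*
 * Illustration (MSR value assumed, commonly reported on SandyBridge
 * parts): bits 12:8 of MSR_RAPL_POWER_UNIT hold the Energy Status
 * Units field, so a raw value of 0x000A1003 gives
 * (0x000A1003 >> 8) & 0x1F == 16, i.e. counters tick in 2^-16 Joule
 * increments. Note the same unit is copied to every domain here; the
 * HSW-server DRAM exception is patched in afterwards by
 * rapl_hsw_server_quirk().
 */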
static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};
static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;
	struct x86_pmu_quirk *quirk;
	int i;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 63: /* Haswell-Server */
		rapl_add_quirk(rapl_hsw_server_quirk);
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-Celeron */
	case 61: /* Broadwell */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	default:
		/* unsupported */
		return 0;
	}

	ret = rapl_check_hw_unit();
	if (ret)
		return ret;

	/* run cpu model quirks */
	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
		quirk->func();

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			goto out;
		rapl_cpu_init(cpu);
	}

	__perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
		cpu_notifier_register_done();
		return -1;
	}

	pmu = __this_cpu_read(rapl_pmu);

	pr_info("RAPL PMU detected,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters,"
		" %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask),
		ktime_to_ms(pmu->timer_interval));
	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
out:
	cpu_notifier_register_done();

	return 0;
}
device_initcall(rapl_pmu_init);