membarrier.c

/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
#include "sched.h"
/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
        (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif

#define MEMBARRIER_CMD_BITMASK \
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
        | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
        | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
        smp_mb();       /* IPIs should be serializing but paranoid. */
}
static int membarrier_global_expedited(void)
{
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;

        if (num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        /*
         * Expedited membarrier commands guarantee that they won't
         * block, hence the GFP_NOWAIT allocation flag and fallback
         * implementation.
         */
        if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
                /* Fallback for OOM. */
                fallback = true;
        }

        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                /*
                 * Skipping the current CPU is OK even though we can be
                 * migrated at any point. The current CPU, at the point
                 * where we read raw_smp_processor_id(), is ensured to
                 * be in program order with respect to the caller
                 * thread. Therefore, we can skip this CPU from the
                 * iteration.
                 */
                if (cpu == raw_smp_processor_id())
                        continue;

                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
                if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
                                   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
                                smp_call_function_single(cpu, ipi_mb, NULL, 1);
                }
                rcu_read_unlock();
        }
        if (!fallback) {
                preempt_disable();
                smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
                preempt_enable();
                free_cpumask_var(tmpmask);
        }
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
        return 0;
}
static int membarrier_private_expedited(int flags)
{
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;

        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                if (!(atomic_read(&current->mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
        } else {
                if (!(atomic_read(&current->mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
        }

        if (num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        /*
         * Expedited membarrier commands guarantee that they won't
         * block, hence the GFP_NOWAIT allocation flag and fallback
         * implementation.
         */
        if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
                /* Fallback for OOM. */
                fallback = true;
        }

        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                /*
                 * Skipping the current CPU is OK even though we can be
                 * migrated at any point. The current CPU, at the point
                 * where we read raw_smp_processor_id(), is ensured to
                 * be in program order with respect to the caller
                 * thread. Therefore, we can skip this CPU from the
                 * iteration.
                 */
                if (cpu == raw_smp_processor_id())
                        continue;

                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
                if (p && p->mm == current->mm) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
                                smp_call_function_single(cpu, ipi_mb, NULL, 1);
                }
                rcu_read_unlock();
        }
        if (!fallback) {
                preempt_disable();
                smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
                preempt_enable();
                free_cpumask_var(tmpmask);
        }
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
        return 0;
}
static int membarrier_register_global_expedited(void)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;

        if (atomic_read(&mm->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
        if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
                /*
                 * For single mm user, single threaded process, we can
                 * simply issue a memory barrier after setting
                 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
                 * no memory access following registration is reordered
                 * before registration.
                 */
                smp_mb();
        } else {
                /*
                 * For multi-mm user threads, we need to ensure all
                 * future scheduler executions will observe the new
                 * thread flag state for this mm.
                 */
                synchronize_sched();
        }
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
                  &mm->membarrier_state);
        return 0;
}
static int membarrier_register_private_expedited(int flags)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;

        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
        }

        /*
         * We need to consider threads belonging to different thread
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
        if ((atomic_read(&mm->membarrier_state) & state) == state)
                return 0;
        atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
                atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
                          &mm->membarrier_state);
        if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
                /*
                 * Ensure all future scheduler executions will observe the
                 * new thread flag state for this process.
                 */
                synchronize_sched();
        }
        atomic_or(state, &mm->membarrier_state);
        return 0;
}
/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * this system call is guaranteed to always return the same value until
 * reboot.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb()   sys_membarrier()
 *        barrier()          X           X              O
 *        smp_mb()           X           O              O
 *        sys_membarrier()   O           O              O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
        if (unlikely(flags))
                return -EINVAL;
        switch (cmd) {
        case MEMBARRIER_CMD_QUERY:
        {
                int cmd_mask = MEMBARRIER_CMD_BITMASK;

                if (tick_nohz_full_enabled())
                        cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
                return cmd_mask;
        }
        case MEMBARRIER_CMD_GLOBAL:
                /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
                if (tick_nohz_full_enabled())
                        return -EINVAL;
                if (num_online_cpus() > 1)
                        synchronize_sched();
                return 0;
        case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
                return membarrier_global_expedited();
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
                return membarrier_private_expedited(0);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
                return membarrier_register_private_expedited(0);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        default:
                return -EINVAL;
        }
}
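
Appended for illustration only (not part of the kernel file): a minimal userspace sketch of the register-then-use protocol that sys_membarrier() and the *_READY checks above enforce. It assumes kernel headers that provide __NR_membarrier and <linux/membarrier.h>; the membarrier() wrapper below is a local helper, since libc does not expose one, and the file name in the comment is hypothetical.

/*
 * membarrier_example.c (hypothetical): query the supported commands,
 * register the current mm for private expedited membarriers, then issue
 * one. Registration must precede use, otherwise the expedited command
 * fails with -EPERM, mirroring membarrier_private_expedited() above.
 */
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

/* Local helper: glibc does not provide a membarrier() wrapper. */
static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask;

        /* Query the commands supported by the running kernel. */
        mask = membarrier(MEMBARRIER_CMD_QUERY, 0);
        if (mask < 0) {
                perror("MEMBARRIER_CMD_QUERY");  /* e.g. ENOSYS if absent */
                return 1;
        }
        if (!(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
                fprintf(stderr, "private expedited membarrier unsupported\n");
                return 1;
        }

        /* Register this mm once, typically at runtime/library init. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
                return 1;

        /*
         * Slow path: force a memory barrier on every CPU currently
         * running a thread of this process (the IPIs sent by
         * membarrier_private_expedited() in the kernel code above).
         */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
                return 1;

        printf("expedited membarrier issued\n");
        return 0;
}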