numa_linux.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. // Copyright © 2021 Jeffrey H. Johnson <trnsz@pobox.com>.
  2. // Copyright © 2021 Gridfinity, LLC.
  3. // Copyright © 2019 Neal.
  4. // Copyright © 2018 lrita@163.com.
  5. //
  6. // Use of this source code is governed by the MIT
  7. // license that can be found in the LICENSE file.
  8. //go:build linux
  9. // +build linux
  10. package gonuma // import "github.com/johnsonjh/gonuma"
  11. import (
  12. "fmt"
  13. "io/ioutil"
  14. "runtime"
  15. "strconv"
  16. "strings"
  17. "syscall"
  18. "unsafe"
  19. "github.com/intel-go/cpuid"
  20. )
  21. func init() {
  22. _, _, e1 := syscall.Syscall6(
  23. syscall.SYS_GET_MEMPOLICY,
  24. 0,
  25. 0,
  26. 0,
  27. 0,
  28. 0,
  29. 0,
  30. )
  31. available = e1 != syscall.ENOSYS
  32. NUMAnodemax = setupnodemask() // max nodes
  33. memnodes = NewBitmask(NodePossibleCount())
  34. numanodes = NewBitmask(NodePossibleCount())
  35. NUMAconfigurednode = setupconfigurednodes() // configured nodes
  36. NUMAcpuMax = setupncpu() // max cpu
  37. NUMAconfiguredcpu = setupnconfiguredcpu() // configured cpu
  38. setupconstraints()
  39. }
  40. // GetMemPolicy retrieves the NUMA policy of the calling process or of a
  41. // memory address, depending on the setting of flags.
  42. // Details to see manpage of get_mempolicy.
  43. //
  44. // If flags is specified as 0, then information about the calling process's
  45. // default policy (as set by set_mempolicy(2)) is returned. The policy
  46. // returned [mode and nodemask] may be used to restore the process's policy
  47. // to its state at the time of the call to get_mempolicy() using
  48. // set_mempolicy(2).
  49. //
  50. // If flags specifies MPolFMemsAllowed (available since Linux 2.6.24),
  51. // the mode argument is ignored and the set of nodes [memories] that the
  52. // process is allowed to specify in subsequent calls to mbind(2) or
  53. // set_mempolicy(2) [in the absence of any mode flags] is returned in
  54. // nodemask. It is not permitted to combine MPolFMemsAllowed with
  55. // either MPolFAddr or MPolFNode.
  56. //
  57. // If flags specifies MPolFAddr, then information is returned about the
  58. // policy governing the memory address given in addr. This policy may be
  59. // different from the process's default policy if mbind(2) or one of the
  60. // helper functions described in numa(3) has been used to establish a policy
  61. // for the memory range containing addr.
  62. //
  63. // If flags specifies both MPolFNode and MPolFAddr, get_mempolicy() will
  64. // return the node ID of the node on which the address addr is allocated
  65. // into the location pointed to by mode. If no page has yet been allocated for
  66. // the specified address, get_mempolicy() will allocate a page as if the
  67. // process had performed a read [load] access to that address, and return the
  68. // ID of the node where that page was allocated.
  69. //
  70. // If flags specifies MPolFNode, but not MPolFAddr, and the process's
  71. // current policy is MPolInterleave, then get_mempolicy() will return in
  72. // the location pointed to by a non-NULL mode argument, the node ID of the
  73. // next node that will be used for interleaving of internal kernel pages
  74. // allocated on behalf of the process. These allocations include pages for
  75. // memory mapped files in process memory ranges mapped using the mmap(2)
  76. // call with the MAP_PRIVATE flag for read accesses, and in memory ranges
  77. // mapped with the MAP_SHARED flag for all accesses.
  78. func GetMemPolicy(
  79. nodemask Bitmask,
  80. addr unsafe.Pointer,
  81. flags int,
  82. ) (mode int, err error) {
  83. var mask, maxnode uintptr
  84. if maxnode = uintptr(nodemask.Len()); maxnode != 0 {
  85. mask = uintptr(unsafe.Pointer(&nodemask[0]))
  86. }
  87. _, _, errno := syscall.Syscall6(syscall.SYS_GET_MEMPOLICY,
  88. uintptr(unsafe.Pointer(&mode)), mask, maxnode,
  89. uintptr(addr), uintptr(flags), 0)
  90. if errno != 0 {
  91. err = errno
  92. }
  93. return
  94. }
  95. // SetMemPolicy sets the NUMA memory policy of the calling process, which
  96. // consists of a policy mode and zero or more nodes, to the values specified
  97. // by the mode, nodemask and maxnode arguments.
  98. // Details to see manpage of set_mempolicy.
  99. //
  100. // A NUMA machine has different memory controllers with different distances
  101. // to specific CPUs. The memory policy defines from which node memory is
  102. // allocated for the process.
  103. // SetMemPolicy defines the default policy for the process. The process
  104. // policy governs allocation of pages in the process's address space
  105. // outside of memory ranges controlled by a more specific policy set by
  106. // mbind(2). The process default policy also controls allocation of any
  107. // pages for memory mapped files mapped using the mmap(2) call with the
  108. // MAP_PRIVATE flag and that are only read [loaded] from by the process
  109. // and of memory mapped files mapped using the mmap(2) call with the
  110. // MAP_SHARED flag, regardless of the access type. The policy is applied
  111. // only when a new page is allocated for the process. For anonymous memory
  112. // this is when the page is first touched by the application. The mode
  113. // argument must specify one of MPolDefault, MPolBind, MPolInterleave or
  114. // PolPreferred. All modes except MPolDefault require the caller to
  115. // specify via the nodemask argument one or more nodes. The mode argument
  116. // may also include an optional mode flag. The supported mode flags are:
  117. // MPolFStaticNodes and MPolFRelativeNodes. Where a nodemask is required,
  118. // it must contain at least one node that is on-line, allowed by the process
  119. // current cpuset context, [unless the MPolFStaticNodes mode flag is
  120. // specified], and contains memory. If the MPolFStaticNodes is set in mode
  121. // and a required nodemask contains no nodes that are allowed by the
  122. // process current cpuset context, the memory policy reverts to local
  123. // allocation. This effectively overrides the specified policy until the
  124. // process cpuset context includes one or more of the nodes specified
  125. // by nodemask.
  126. func SetMemPolicy(mode int, nodemask Bitmask) (err error) {
  127. var mask, maxnode uintptr
  128. if maxnode = uintptr(nodemask.Len()); maxnode != 0 {
  129. mask = uintptr(unsafe.Pointer(&nodemask[0]))
  130. }
  131. _, _, errno := syscall.Syscall(syscall.SYS_SET_MEMPOLICY,
  132. uintptr(mode), mask, maxnode)
  133. if errno != 0 {
  134. err = errno
  135. }
  136. return
  137. }
  138. // MBind sets the NUMA memory policy, which consists of a policy mode
  139. // and zero or more nodes, for the memory range starting with addr
  140. // and continuing for length bytes. The memory policy defines from
  141. // which node memory is allocated. Details to see manpage of mbind.
  142. // If the memory range specified by the addr and length arguments
  143. // includes an "anonymous" region of memory that is a region of memory
  144. // created using the mmap(2) system call with the MAP_ANONYMOUS or a
  145. // memory mapped file, mapped using the mmap(2) system call with the
  146. // MAP_PRIVATE flag, pages will be allocated only according to the
  147. // specified policy when the application writes [stores] to the page.
  148. // For anonymous regions, an initial read access will use a shared page
  149. // in the kernel containing all zeros. For a file mapped with MAP_PRIVATE,
  150. // an initial read access will allocate pages according to the process
  151. // policy of the process that causes the page to be allocated. This may
  152. // not be the process that called mbind(). The specified policy will be
  153. // ignored for any MAP_SHARED mappings in the specified memory range.
  154. // Rather the pages will be allocated according to the process policy of
  155. // the process that caused the page to be allocated. Again, this may not
  156. // be the process that called mbind(). If the specified memory range
  157. // includes a shared memory region created using the shmget(2) system call
  158. // and attached using the shmat(2) system call, pages allocated for the
  159. // anonymous or shared memory region will be allocated according to the
  160. // policy specified, regardless which process attached to the shared memory
  161. // segment causes the allocation. If, however, the shared memory region was
  162. // created with the SHM_HUGETLB flag, the huge pages will be allocated
  163. // according to the policy specified only if the page allocation is caused by
  164. // the process that calls mbind() for that region. By default, mbind() has an
  165. // effect only for new allocations; if the pages inside the range have been
  166. // already touched before setting the policy, then the policy has no effect.
  167. // This default behavior may be overridden by the MPolMFMove and MPolMFMoveAll
  168. // flags described below.
  169. func MBind(
  170. addr unsafe.Pointer,
  171. length, mode, flags int,
  172. nodemask Bitmask,
  173. ) (err error) {
  174. var mask, maxnode uintptr
  175. if maxnode = uintptr(nodemask.Len()); maxnode != 0 {
  176. mask = uintptr(unsafe.Pointer(&nodemask[0]))
  177. }
  178. _, _, errno := syscall.Syscall6(syscall.SYS_MBIND, uintptr(addr),
  179. uintptr(length), uintptr(mode), mask, maxnode, uintptr(flags))
  180. if errno != 0 {
  181. err = errno
  182. }
  183. return
  184. }
  185. // GetSchedAffinity writes the affinity mask of the process whose ID is pid
  186. // into the input mask. If pid is zero, then the mask of the calling process
  187. // is returned.
  188. func GetSchedAffinity(pid int, cpumask Bitmask) (int, error) {
  189. var mask, maxnode uintptr
  190. if maxnode = uintptr(cpumask.Len() / 8); maxnode != 0 {
  191. mask = uintptr(unsafe.Pointer(&cpumask[0]))
  192. }
  193. length, _, e1 := syscall.Syscall(syscall.SYS_SCHED_GETAFFINITY,
  194. uintptr(pid), maxnode, mask)
  195. if e1 != 0 {
  196. return 0, e1
  197. }
  198. return int(length), nil
  199. }
  200. // SetSchedAffinity sets the CPU affinity mask of the process whose ID
  201. // is pid to the value specified by mask. If pid is zero, then the calling
  202. // process is used.
  203. func SetSchedAffinity(pid int, cpumask Bitmask) error {
  204. var mask, maxnode uintptr
  205. if maxnode = uintptr(cpumask.Len() / 8); maxnode != 0 {
  206. mask = uintptr(unsafe.Pointer(&cpumask[0]))
  207. }
  208. _, _, e1 := syscall.Syscall(syscall.SYS_SCHED_SETAFFINITY,
  209. uintptr(pid), maxnode, mask)
  210. if e1 != 0 {
  211. return e1
  212. }
  213. return nil
  214. }
  215. // We do this the way Paul Jackson's libcpuset does it. The nodemask
  216. // values in /proc/self/status are in an ASCII format that uses nine
  217. // characters for each 32 bits of mask. This could also be used to
  218. // find the cpumask size.
  219. func setupnodemask() (n int) {
  220. d, err := ioutil.ReadFile("/proc/self/status")
  221. if err == nil {
  222. const stp = "Mems_allowed:\t"
  223. for _, line := range strings.Split(string(d), "\n") {
  224. if !strings.HasPrefix(line, stp) {
  225. continue
  226. }
  227. n = (len(line) - len(stp) + 1) * 32 / 9
  228. }
  229. }
  230. if n == 0 {
  231. n = 16
  232. for n < 4096*8 {
  233. n <<= 1
  234. mask := NewBitmask(n)
  235. if _, err := GetMemPolicy(mask, nil, 0); err != nil &&
  236. err != syscall.EINVAL {
  237. break
  238. }
  239. }
  240. }
  241. return
  242. }
  243. func setupconfigurednodes() (n int) {
  244. files, err := ioutil.ReadDir("/sys/devices/system/node")
  245. if err != nil {
  246. return 1
  247. }
  248. for _, f := range files {
  249. if !strings.HasPrefix(f.Name(), "node") {
  250. continue
  251. }
  252. i, _ := strconv.Atoi(f.Name()[4:])
  253. if n < i {
  254. n = i // maybe some node absence
  255. }
  256. numanodes.Set(i, true)
  257. if _, _, err := NodeMemSize64(i); err == nil {
  258. memnodes.Set(i, true)
  259. }
  260. }
  261. n++
  262. return
  263. }
  264. func setupncpu() (n int) {
  265. length := 4096
  266. for {
  267. mask := NewBitmask(length)
  268. nn, err := GetSchedAffinity(0, mask)
  269. if err == nil {
  270. return nn * 8
  271. }
  272. if err != syscall.EINVAL {
  273. return 128
  274. }
  275. length *= 2
  276. }
  277. }
  278. func setupnconfiguredcpu() (n int) {
  279. // sysconf(_SC_NPROCESSORS_CONF)
  280. files, err := ioutil.ReadDir("/sys/devices/system/cpu")
  281. if err == nil {
  282. for _, f := range files {
  283. if !f.IsDir() || !strings.HasPrefix(f.Name(), "cpu") {
  284. continue
  285. }
  286. if _, err := strconv.Atoi(f.Name()[3:]); err == nil {
  287. n++
  288. }
  289. }
  290. return
  291. }
  292. // fallback
  293. d, _ := ioutil.ReadFile("/proc/cpuinfo")
  294. for _, line := range strings.Split(string(d), "\n") {
  295. if strings.HasPrefix(line, "processor") {
  296. n++
  297. }
  298. }
  299. if n == 0 {
  300. n = 1
  301. }
  302. return
  303. }
  304. func setupconstraints() {
  305. node2cpu = make(map[int]Bitmask)
  306. cpu2node = make(map[int]int)
  307. for i := 0; i < numanodes.Len(); i++ {
  308. if !numanodes.Get(i) {
  309. continue
  310. }
  311. fname := fmt.Sprintf("/sys/devices/system/node/node%d/cpumap", i)
  312. d, err := ioutil.ReadFile(fname)
  313. if err != nil {
  314. continue
  315. }
  316. cpumask := NewBitmask(CPUCount())
  317. tokens := strings.Split(strings.TrimSpace(string(d)), ",")
  318. for j := 0; j < len(tokens); j++ {
  319. mask, _ := strconv.ParseUint(tokens[len(tokens)-1-j], 16, 64)
  320. nn := 64
  321. if runtime.GOARCH == "386" {
  322. nn = 32
  323. }
  324. for k := 0; k < nn; k++ {
  325. if (mask>>uint64(k))&0x01 != 0 {
  326. cpumask.Set(k+j*nn, true)
  327. }
  328. }
  329. }
  330. node2cpu[i] = cpumask
  331. for j := 0; j < cpumask.Len(); j++ {
  332. if cpumask.Get(j) {
  333. cpu2node[j] = i
  334. }
  335. }
  336. }
  337. }
  338. // NodeMemSize64 return the memory total size and free size of given node.
  339. func NodeMemSize64(node int) (total, free int64, err error) {
  340. var (
  341. d []byte
  342. fname = fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", node)
  343. )
  344. d, err = ioutil.ReadFile(fname)
  345. if err != nil {
  346. return
  347. }
  348. split := func(s, d string) string {
  349. return strings.TrimFunc(
  350. s[strings.Index(s, d)+len(d):], func(x rune) bool {
  351. return x < '0' || x > '9'
  352. })
  353. }
  354. for _, line := range strings.Split(string(d), "\n") {
  355. if !strings.HasSuffix(line, "kB") {
  356. continue
  357. }
  358. switch {
  359. case strings.Contains(line, "MemTotal"):
  360. total, err = strconv.ParseInt(split(line, "MemTotal"), 10, 64)
  361. if err != nil {
  362. return
  363. }
  364. total *= 1024
  365. case strings.Contains(line, "MemFree"):
  366. free, err = strconv.ParseInt(split(line, "MemFree:"), 10, 64)
  367. if err != nil {
  368. return
  369. }
  370. free *= 1024
  371. }
  372. }
  373. return
  374. }
// NUMAfastway records whether the CPU advertises the RDTSCP feature, as
// reported by cpuid.HasFeature when the package is initialised.
// NOTE(review): presumably this selects an RDTSCP-based fast path for
// GetCPUAndNode — confirm against its implementation, which is not in this
// file.
var NUMAfastway = cpuid.HasFeature(cpuid.RDTSCP)
// getcpu is declared here without a body, so its implementation has to be
// supplied outside this Go file. At the Go level it takes no arguments and
// returns nothing.
// NOTE(review): presumably an assembly routine used by GetCPUAndNode —
// confirm where it is defined and how it hands back its result.
func getcpu()
// GetCPUAndNode returns the CPU and the NUMA node, in that order, on which
// the current caller is running. The function is declared here without a
// body; its implementation is supplied outside this Go file.
func GetCPUAndNode() (cpu, node int)