linux-container.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. ;;; GNU Guix --- Functional package management for GNU
  2. ;;; Copyright © 2015 David Thompson <davet@gnu.org>
  3. ;;; Copyright © 2017 Ludovic Courtès <ludo@gnu.org>
  4. ;;;
  5. ;;; This file is part of GNU Guix.
  6. ;;;
  7. ;;; GNU Guix is free software; you can redistribute it and/or modify it
  8. ;;; under the terms of the GNU General Public License as published by
  9. ;;; the Free Software Foundation; either version 3 of the License, or (at
  10. ;;; your option) any later version.
  11. ;;;
  12. ;;; GNU Guix is distributed in the hope that it will be useful, but
  13. ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. ;;; GNU General Public License for more details.
  16. ;;;
  17. ;;; You should have received a copy of the GNU General Public License
  18. ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
  19. (define-module (gnu build linux-container)
  20. #:use-module (ice-9 format)
  21. #:use-module (ice-9 match)
  22. #:use-module (ice-9 rdelim)
  23. #:use-module (srfi srfi-98)
  24. #:use-module (guix utils)
  25. #:use-module (guix build utils)
  26. #:use-module (guix build syscalls)
  27. #:use-module (gnu system file-systems) ;<file-system>
  28. #:use-module ((gnu build file-systems) #:select (mount-file-system))
  29. #:export (user-namespace-supported?
  30. unprivileged-user-namespace-supported?
  31. setgroups-supported?
  32. %namespaces
  33. run-container
  34. call-with-container
  35. container-excursion
  36. container-excursion*))
  37. (define (user-namespace-supported?)
  38. "Return #t if user namespaces are supported on this system."
  39. (file-exists? "/proc/self/ns/user"))
  40. (define (unprivileged-user-namespace-supported?)
  41. "Return #t if user namespaces can be created by unprivileged users."
  42. (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
  43. (if (file-exists? userns-file)
  44. (eqv? #\1 (call-with-input-file userns-file read-char))
  45. #t)))
  46. (define (setgroups-supported?)
  47. "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
  48. exists."
  49. (file-exists? "/proc/self/setgroups"))
  50. (define %namespaces
  51. '(mnt pid ipc uts user net))
  52. (define (call-with-clean-exit thunk)
  53. "Apply THUNK, but exit with a status code of 1 if it fails."
  54. (dynamic-wind
  55. (const #t)
  56. (lambda ()
  57. (thunk)
  58. (primitive-exit 0))
  59. (lambda ()
  60. (primitive-exit 1))))
  61. (define (purify-environment)
  62. "Unset all environment variables."
  63. (for-each unsetenv
  64. (match (get-environment-variables)
  65. (((names . _) ...) names))))
  66. ;; The container setup procedure closely resembles that of the Docker
  67. ;; specification:
  68. ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
  69. (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
  70. "Mount the essential file systems and the those in MOUNTS, a list of
  71. <file-system> objects, relative to ROOT; then make ROOT the new root directory
  72. for the process."
  73. (define (scope dir)
  74. (string-append root dir))
  75. (define (touch file-name)
  76. (call-with-output-file file-name (const #t)))
  77. (define (bind-mount src dest)
  78. (mount src dest "none" MS_BIND))
  79. ;; Like mount, but creates the mount point if it doesn't exist.
  80. (define* (mount* source target type #:optional (flags 0) options
  81. #:key (update-mtab? #f))
  82. (mkdir-p target)
  83. (mount source target type flags options #:update-mtab? update-mtab?))
  84. ;; The container's file system is completely ephemeral, sans directories
  85. ;; bind-mounted from the host.
  86. (mount "none" root "tmpfs")
  87. ;; A proc mount requires a new pid namespace.
  88. (when mount-/proc?
  89. (mount* "none" (scope "/proc") "proc"
  90. (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
  91. ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
  92. ;; the current network namespace.
  93. (when mount-/sys?
  94. (mount* "none" (scope "/sys") "sysfs"
  95. (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
  96. (mount* "none" (scope "/dev") "tmpfs"
  97. (logior MS_NOEXEC MS_STRICTATIME)
  98. "mode=755")
  99. ;; Create essential device nodes via bind-mounting them from the
  100. ;; host, because a process within a user namespace cannot create
  101. ;; device nodes.
  102. (for-each (lambda (device)
  103. (when (file-exists? device)
  104. ;; Create the mount point file.
  105. (touch (scope device))
  106. (bind-mount device (scope device))))
  107. '("/dev/null"
  108. "/dev/zero"
  109. "/dev/full"
  110. "/dev/random"
  111. "/dev/urandom"
  112. "/dev/tty"
  113. "/dev/ptmx"
  114. "/dev/fuse"))
  115. ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
  116. ;; associated with standard input when there is one.
  117. (let* ((in (current-input-port))
  118. (tty (catch 'system-error
  119. (lambda ()
  120. ;; This call throws if IN does not correspond to a tty.
  121. ;; This is more reliable than 'isatty?'.
  122. (ttyname in))
  123. (const #f)))
  124. (console (scope "/dev/console")))
  125. (when tty
  126. (touch console)
  127. (chmod console #o600)
  128. (bind-mount tty console)))
  129. ;; Setup standard input/output/error.
  130. (symlink "/proc/self/fd" (scope "/dev/fd"))
  131. (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
  132. (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
  133. (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
  134. ;; Mount user-specified file systems.
  135. (for-each (lambda (file-system)
  136. (mount-file-system (file-system->spec file-system)
  137. #:root root))
  138. mounts)
  139. ;; Jail the process inside the container's root file system.
  140. (let ((put-old (string-append root "/real-root")))
  141. (mkdir put-old)
  142. (pivot-root root put-old)
  143. (chdir "/")
  144. (umount "real-root" MNT_DETACH)
  145. (rmdir "real-root")))
  146. (define (initialize-user-namespace pid host-uids)
  147. "Configure the user namespace for PID. HOST-UIDS specifies the number of
  148. host user identifiers to map into the user namespace."
  149. (define proc-dir
  150. (string-append "/proc/" (number->string pid)))
  151. (define (scope file)
  152. (string-append proc-dir file))
  153. (let ((uid (getuid))
  154. (gid (getgid)))
  155. ;; Only root can write to the gid map without first disabling the
  156. ;; setgroups syscall.
  157. (unless (and (zero? uid) (zero? gid))
  158. (call-with-output-file (scope "/setgroups")
  159. (lambda (port)
  160. (display "deny" port))))
  161. ;; Map the user/group that created the container to the root user
  162. ;; within the container.
  163. (call-with-output-file (scope "/uid_map")
  164. (lambda (port)
  165. (format port "0 ~d ~d" uid host-uids)))
  166. (call-with-output-file (scope "/gid_map")
  167. (lambda (port)
  168. (format port "0 ~d ~d" gid host-uids)))))
  169. (define (namespaces->bit-mask namespaces)
  170. "Return the number suitable for the 'flags' argument of 'clone' that
  171. corresponds to the symbols in NAMESPACES."
  172. ;; Use the same flags as fork(3) in addition to the namespace flags.
  173. (apply logior SIGCHLD
  174. (map (match-lambda
  175. ('mnt CLONE_NEWNS)
  176. ('uts CLONE_NEWUTS)
  177. ('ipc CLONE_NEWIPC)
  178. ('user CLONE_NEWUSER)
  179. ('pid CLONE_NEWPID)
  180. ('net CLONE_NEWNET))
  181. namespaces)))
  182. (define (run-container root mounts namespaces host-uids thunk)
  183. "Run THUNK in a new container process and return its PID. ROOT specifies
  184. the root directory for the container. MOUNTS is a list of <file-system>
  185. objects that specify file systems to mount inside the container. NAMESPACES
  186. is a list of symbols that correspond to the possible Linux namespaces: mnt,
  187. ipc, uts, user, and net. HOST-UIDS specifies the number of
  188. host user identifiers to map into the user namespace."
  189. ;; The parent process must initialize the user namespace for the child
  190. ;; before it can boot. To negotiate this, a pipe is used such that the
  191. ;; child process blocks until the parent writes to it.
  192. (match (socketpair PF_UNIX SOCK_STREAM 0)
  193. ((child . parent)
  194. (let ((flags (namespaces->bit-mask namespaces)))
  195. (match (clone flags)
  196. (0
  197. (call-with-clean-exit
  198. (lambda ()
  199. (close-port parent)
  200. ;; Wait for parent to set things up.
  201. (match (read child)
  202. ('ready
  203. (purify-environment)
  204. (when (memq 'mnt namespaces)
  205. (catch #t
  206. (lambda ()
  207. (mount-file-systems root mounts
  208. #:mount-/proc? (memq 'pid namespaces)
  209. #:mount-/sys? (memq 'net
  210. namespaces)))
  211. (lambda args
  212. ;; Forward the exception to the parent process.
  213. ;; FIXME: SRFI-35 conditions and non-trivial objects
  214. ;; cannot be 'read' so they shouldn't be written as is.
  215. (write args child)
  216. (primitive-exit 3))))
  217. ;; TODO: Manage capabilities.
  218. (write 'ready child)
  219. (close-port child)
  220. (thunk))
  221. (_ ;parent died or something
  222. (primitive-exit 2))))))
  223. (pid
  224. (close-port child)
  225. (when (memq 'user namespaces)
  226. (initialize-user-namespace pid host-uids))
  227. ;; TODO: Initialize cgroups.
  228. (write 'ready parent)
  229. (newline parent)
  230. ;; Check whether the child process' setup phase succeeded.
  231. (let ((message (read parent)))
  232. (close-port parent)
  233. (match message
  234. ('ready ;success
  235. pid)
  236. (((? symbol? key) args ...) ;exception
  237. (apply throw key args))
  238. (_ ;unexpected termination
  239. #f)))))))))
  240. (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
  241. (host-uids 1))
  242. "Run THUNK in a new container process and return its exit status.
  243. MOUNTS is a list of <file-system> objects that specify file systems to mount
  244. inside the container. NAMESPACES is a list of symbols corresponding to
  245. the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
  246. default, all namespaces are used. HOST-UIDS is the number of host user
  247. identifiers to map into the container's user namespace, if there is one. By
  248. default, only a single uid/gid, that of the current user, is mapped into the
  249. container. The host user that creates the container is the root user (uid/gid
  250. 0) within the container. Only root can map more than a single uid/gid.
  251. Note that if THUNK needs to load any additional Guile modules, the relevant
  252. module files must be present in one of the mappings in MOUNTS and the Guile
  253. load path must be adjusted as needed."
  254. (call-with-temporary-directory
  255. (lambda (root)
  256. (let ((pid (run-container root mounts namespaces host-uids thunk)))
  257. ;; Catch SIGINT and kill the container process.
  258. (sigaction SIGINT
  259. (lambda (signum)
  260. (false-if-exception
  261. (kill pid SIGKILL))))
  262. (match (waitpid pid)
  263. ((_ . status) status))))))
  264. (define (container-excursion pid thunk)
  265. "Run THUNK as a child process within the namespaces of process PID and
  266. return the exit status."
  267. (define (namespace-file pid namespace)
  268. (string-append "/proc/" (number->string pid) "/ns/" namespace))
  269. (match (primitive-fork)
  270. (0
  271. (call-with-clean-exit
  272. (lambda ()
  273. (for-each (lambda (ns)
  274. (let ((source (namespace-file (getpid) ns))
  275. (target (namespace-file pid ns)))
  276. ;; Joining the namespace that the process already
  277. ;; belongs to would throw an error so avoid that.
  278. ;; XXX: This /proc interface leads to TOCTTOU.
  279. (unless (string=? (readlink source) (readlink target))
  280. (call-with-input-file source
  281. (lambda (current-ns-port)
  282. (call-with-input-file target
  283. (lambda (new-ns-port)
  284. (setns (fileno new-ns-port) 0))))))))
  285. ;; It's important that the user namespace is joined first,
  286. ;; so that the user will have the privileges to join the
  287. ;; other namespaces. Furthermore, it's important that the
  288. ;; mount namespace is joined last, otherwise the /proc mount
  289. ;; point would no longer be accessible.
  290. '("user" "ipc" "uts" "net" "pid" "mnt"))
  291. (purify-environment)
  292. (chdir "/")
  293. (thunk))))
  294. (pid
  295. (match (waitpid pid)
  296. ((_ . status)
  297. (status:exit-val status))))))
  298. (define (container-excursion* pid thunk)
  299. "Like 'container-excursion', but return the return value of THUNK."
  300. (match (pipe)
  301. ((in . out)
  302. (match (container-excursion pid
  303. (lambda ()
  304. (close-port in)
  305. (write (thunk) out)))
  306. (0
  307. (close-port out)
  308. (let ((result (read in)))
  309. (close-port in)
  310. result))
  311. (_ ;maybe PID died already
  312. (close-port out)
  313. (close-port in)
  314. #f)))))