deduplication.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. ;;; GNU Guix --- Functional package management for GNU
  2. ;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
  3. ;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
  4. ;;;
  5. ;;; This file is part of GNU Guix.
  6. ;;;
  7. ;;; GNU Guix is free software; you can redistribute it and/or modify it
  8. ;;; under the terms of the GNU General Public License as published by
  9. ;;; the Free Software Foundation; either version 3 of the License, or (at
  10. ;;; your option) any later version.
  11. ;;;
  12. ;;; GNU Guix is distributed in the hope that it will be useful, but
  13. ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. ;;; GNU General Public License for more details.
  16. ;;;
  17. ;;; You should have received a copy of the GNU General Public License
  18. ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
  19. ;;; This houses stuff we do to files when they arrive at the store - resetting
  20. ;;; timestamps, deduplicating, etc.
  21. (define-module (guix store deduplication)
  22. #:use-module (gcrypt hash)
  23. #:use-module ((guix build utils) #:hide (dump-port))
  24. #:use-module (guix build syscalls)
  25. #:use-module (guix base32)
  26. #:use-module (srfi srfi-11)
  27. #:use-module (srfi srfi-34)
  28. #:use-module (srfi srfi-35)
  29. #:use-module (rnrs bytevectors)
  30. #:use-module (rnrs io ports)
  31. #:use-module (ice-9 ftw)
  32. #:use-module (ice-9 match)
  33. #:use-module (guix serialization)
  34. #:export (nar-sha256
  35. deduplicate
  36. dump-file/deduplicate
  37. copy-file/deduplicate))
  38. ;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
  39. ;; parameter.
  40. (define* (dump-port in out
  41. #:optional len
  42. #:key (buffer-size 16384))
  43. "Read LEN bytes from IN (or as much as possible if LEN is #f) and write it
  44. to OUT, using chunks of BUFFER-SIZE bytes."
  45. (define buffer
  46. (make-bytevector buffer-size))
  47. (let loop ((total 0)
  48. (bytes (get-bytevector-n! in buffer 0
  49. (if len
  50. (min len buffer-size)
  51. buffer-size))))
  52. (or (eof-object? bytes)
  53. (and len (= total len))
  54. (let ((total (+ total bytes)))
  55. (put-bytevector out buffer 0 bytes)
  56. (loop total
  57. (get-bytevector-n! in buffer 0
  58. (if len
  59. (min (- len total) buffer-size)
  60. buffer-size)))))))
  61. (define (nar-sha256 file)
  62. "Gives the sha256 hash of a file and the size of the file in nar form."
  63. (let-values (((port get-hash) (open-sha256-port)))
  64. (write-file file port)
  65. (force-output port)
  66. (let ((hash (get-hash))
  67. (size (port-position port)))
  68. (close-port port)
  69. (values hash size))))
  70. (define (tempname-in directory)
  71. "Gives an unused temporary name under DIRECTORY. Not guaranteed to still be
  72. unused by the time you create anything with that name, but a good shot."
  73. (let ((const-part (string-append directory "/.tmp-link-"
  74. (number->string (getpid)))))
  75. (let try ((guess-part
  76. (number->string (random most-positive-fixnum) 16)))
  77. (if (file-exists? (string-append const-part "-" guess-part))
  78. (try (number->string (random most-positive-fixnum) 16))
  79. (string-append const-part "-" guess-part)))))
  80. (define* (get-temp-link target #:optional (link-prefix (dirname target)))
  81. "Like mkstemp!, but instead of creating a new file and giving you the name,
  82. it creates a new hardlink to TARGET and gives you the name. Since
  83. cross-file-system hardlinks don't work, the temp link must be created on the
  84. same file system - where in that file system it is can be controlled by
  85. LINK-PREFIX."
  86. (let try ((tempname (tempname-in link-prefix)))
  87. (catch 'system-error
  88. (lambda ()
  89. (link target tempname)
  90. tempname)
  91. (lambda args
  92. (if (= (system-error-errno args) EEXIST)
  93. (try (tempname-in link-prefix))
  94. (apply throw args))))))
  95. (define (call-with-writable-file file store thunk)
  96. (if (string=? file store)
  97. (thunk) ;don't meddle with the store's permissions
  98. (let ((stat (lstat file)))
  99. (dynamic-wind
  100. (lambda ()
  101. (make-file-writable file))
  102. thunk
  103. (lambda ()
  104. (set-file-time file stat)
  105. (chmod file (stat:mode stat)))))))
  106. (define-syntax-rule (with-writable-file file store exp ...)
  107. "Make FILE writable for the dynamic extent of EXP..., except if FILE is the
  108. store."
  109. (call-with-writable-file file store (lambda () exp ...)))
  110. ;; There are 3 main kinds of errors we can get from hardlinking: "Too many
  111. ;; things link to this" (EMLINK), "this link already exists" (EEXIST), and
  112. ;; "can't fit more stuff in this directory" (ENOSPC).
  113. (define* (replace-with-link target to-replace
  114. #:key (swap-directory (dirname target))
  115. (store (%store-directory)))
  116. "Atomically replace the file TO-REPLACE with a link to TARGET. Use
  117. SWAP-DIRECTORY as the directory to store temporary hard links. Upon ENOSPC
  118. and EMLINK, TO-REPLACE is left unchanged.
  119. Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
  120. (define temp-link
  121. (catch 'system-error
  122. (lambda ()
  123. (get-temp-link target swap-directory))
  124. (lambda args
  125. ;; We get ENOSPC when we can't fit an additional entry in
  126. ;; SWAP-DIRECTORY. If it's EMLINK, then TARGET has reached its
  127. ;; maximum number of links.
  128. (if (memv (system-error-errno args) `(,ENOSPC ,EMLINK))
  129. #f
  130. (apply throw args)))))
  131. ;; If we couldn't create TEMP-LINK, that's OK: just don't do the
  132. ;; replacement, which means TO-REPLACE won't be deduplicated.
  133. (when temp-link
  134. (with-writable-file (dirname to-replace) store
  135. (catch 'system-error
  136. (lambda ()
  137. (rename-file temp-link to-replace))
  138. (lambda args
  139. (delete-file temp-link)
  140. (unless (= EMLINK (system-error-errno args))
  141. (apply throw args)))))))
  142. (define %deduplication-minimum-size
  143. ;; Size below which files are not deduplicated. This avoids adding too many
  144. ;; entries to '.links', which would slow down 'removeUnusedLinks' while
  145. ;; saving little space. Keep in sync with optimize-store.cc.
  146. 8192)
  147. (define* (deduplicate path hash #:key (store (%store-directory)))
  148. "Check if a store item with sha256 hash HASH already exists. If so,
  149. replace PATH with a hardlink to the already-existing one. If not, register
  150. PATH so that future duplicates can hardlink to it. PATH is assumed to be
  151. under STORE."
  152. ;; Lightweight promises.
  153. (define-syntax-rule (delay exp)
  154. (let ((value #f))
  155. (lambda ()
  156. (unless value
  157. (set! value exp))
  158. value)))
  159. (define-syntax-rule (force promise)
  160. (promise))
  161. (define links-directory
  162. (string-append store "/.links"))
  163. (let loop ((path path)
  164. (type (stat:type (lstat path)))
  165. (hash hash))
  166. (if (eq? 'directory type)
  167. ;; Can't hardlink directories, so hardlink their atoms.
  168. (for-each (match-lambda
  169. ((file . properties)
  170. (unless (member file '("." ".."))
  171. (let* ((file (string-append path "/" file))
  172. (st (delay (lstat file)))
  173. (type (match (assoc-ref properties 'type)
  174. ((or 'unknown #f)
  175. (stat:type (force st)))
  176. (type type))))
  177. (when (or (eq? 'directory type)
  178. (and (eq? 'regular type)
  179. (>= (stat:size (force st))
  180. %deduplication-minimum-size)))
  181. (loop file type
  182. (and (not (eq? 'directory type))
  183. (nar-sha256 file))))))))
  184. (scandir* path))
  185. (let ((link-file (string-append links-directory "/"
  186. (bytevector->nix-base32-string hash))))
  187. (if (file-exists? link-file)
  188. (replace-with-link link-file path
  189. #:swap-directory links-directory
  190. #:store store)
  191. (catch 'system-error
  192. (lambda ()
  193. (link path link-file))
  194. (lambda args
  195. (let ((errno (system-error-errno args)))
  196. (cond ((= errno EEXIST)
  197. ;; Someone else put an entry for PATH in
  198. ;; LINKS-DIRECTORY before we could. Let's use it.
  199. (replace-with-link path link-file
  200. #:swap-directory
  201. links-directory
  202. #:store store))
  203. ((= errno ENOENT)
  204. ;; This most likely means that LINKS-DIRECTORY does
  205. ;; not exist. Attempt to create it and try again.
  206. (mkdir-p links-directory)
  207. (loop path type hash))
  208. ((= errno ENOSPC)
  209. ;; There's not enough room in the directory index for
  210. ;; more entries in .links, but that's fine: we can
  211. ;; just stop.
  212. #f)
  213. ((= errno EMLINK)
  214. ;; PATH has reached the maximum number of links, but
  215. ;; that's OK: we just can't deduplicate it more.
  216. #f)
  217. (else (apply throw args)))))))))))
  218. (define (tee input len output)
  219. "Return a port that reads up to LEN bytes from INPUT and writes them to
  220. OUTPUT as it goes."
  221. (define bytes-read 0)
  222. (define (fail)
  223. ;; Reached EOF before we had read LEN bytes from INPUT.
  224. (raise (condition
  225. (&nar-error (port input)
  226. (file (port-filename output))))))
  227. (define (read! bv start count)
  228. ;; Read at most LEN bytes in total.
  229. (let ((count (min count (- len bytes-read))))
  230. (let loop ((ret (get-bytevector-n! input bv start count)))
  231. (cond ((eof-object? ret)
  232. (if (= bytes-read len)
  233. 0 ; EOF
  234. (fail)))
  235. ((and (zero? ret) (> count 0))
  236. ;; Do not return zero since zero means EOF, so try again.
  237. (loop (get-bytevector-n! input bv start count)))
  238. (else
  239. (put-bytevector output bv start ret)
  240. (set! bytes-read (+ bytes-read ret))
  241. ret)))))
  242. (make-custom-binary-input-port "tee input port" read! #f #f #f))
  243. (define* (dump-file/deduplicate file input size type
  244. #:key (store (%store-directory)))
  245. "Write SIZE bytes read from INPUT to FILE. TYPE is a symbol, either
  246. 'regular or 'executable.
  247. This procedure is suitable as a #:dump-file argument to 'restore-file'. When
  248. used that way, it deduplicates files on the fly as they are restored, thereby
  249. removing the need for a deduplication pass that would re-read all the files
  250. down the road."
  251. (define (dump-and-compute-hash)
  252. (call-with-output-file file
  253. (lambda (output)
  254. (let-values (((hash-port get-hash)
  255. (open-hash-port (hash-algorithm sha256))))
  256. (write-file-tree file hash-port
  257. #:file-type+size (lambda (_) (values type size))
  258. #:file-port
  259. (const (tee input size output)))
  260. (close-port hash-port)
  261. (get-hash)))))
  262. (if (>= size %deduplication-minimum-size)
  263. (deduplicate file (dump-and-compute-hash) #:store store)
  264. (call-with-output-file file
  265. (lambda (output)
  266. (dump-port input output size)))))
  267. (define* (copy-file/deduplicate source target
  268. #:key (store (%store-directory)))
  269. "Like 'copy-file', but additionally deduplicate TARGET in STORE."
  270. (call-with-input-file source
  271. (lambda (input)
  272. (let ((stat (stat input)))
  273. (dump-file/deduplicate target input (stat:size stat)
  274. (if (zero? (logand (stat:mode stat)
  275. #o100))
  276. 'regular
  277. 'executable)
  278. #:store store)))))