123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- ;;; GNU Guix --- Functional package management for GNU
- ;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
- ;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
- ;;;
- ;;; This file is part of GNU Guix.
- ;;;
- ;;; GNU Guix is free software; you can redistribute it and/or modify it
- ;;; under the terms of the GNU General Public License as published by
- ;;; the Free Software Foundation; either version 3 of the License, or (at
- ;;; your option) any later version.
- ;;;
- ;;; GNU Guix is distributed in the hope that it will be useful, but
- ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
- ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;;; GNU General Public License for more details.
- ;;;
- ;;; You should have received a copy of the GNU General Public License
- ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
- ;;; This houses stuff we do to files when they arrive at the store - resetting
- ;;; timestamps, deduplicating, etc.
- (define-module (guix store deduplication)
- #:use-module (gcrypt hash)
- #:use-module ((guix build utils) #:hide (dump-port))
- #:use-module (guix build syscalls)
- #:use-module (guix base32)
- #:use-module (srfi srfi-11)
- #:use-module (srfi srfi-34)
- #:use-module (srfi srfi-35)
- #:use-module (rnrs bytevectors)
- #:use-module (rnrs io ports)
- #:use-module (ice-9 ftw)
- #:use-module (ice-9 match)
- #:use-module (guix serialization)
- #:export (nar-sha256
- deduplicate
- dump-file/deduplicate
- copy-file/deduplicate))
;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
;; parameter.
(define* (dump-port in out
                    #:optional len
                    #:key (buffer-size 16384))
  "Copy bytes from the binary port IN to the binary port OUT, going through an
intermediate buffer of BUFFER-SIZE bytes.  When LEN is a number, stop once LEN
bytes have been copied; when LEN is #f, copy until IN reaches end-of-file.
Reaching end-of-file before LEN bytes were copied is not reported: copying
simply stops.  Return #t."
  (define chunk
    (make-bytevector buffer-size))

  (define (read-chunk! copied)
    ;; Fill CHUNK from IN, never asking for more than what is left of LEN
    ;; given that COPIED bytes were already transferred.
    (get-bytevector-n! in chunk 0
                       (if len
                           (min (- len copied) buffer-size)
                           buffer-size)))

  (let copy ((copied 0)
             (count (read-chunk! 0)))
    (cond ((eof-object? count) #t)              ;IN is exhausted
          ((and len (= copied len)) #t)         ;LEN bytes were transferred
          (else
           (put-bytevector out chunk 0 count)
           (let ((copied (+ copied count)))
             (copy copied (read-chunk! copied)))))))
(define (nar-sha256 file)
  "Serialize FILE with 'write-file' into a SHA-256 hashing port and return two
values: the resulting hash, and the position reached by that port once
everything was written, i.e. the size of FILE in nar form."
  (call-with-values open-sha256-port
    (lambda (port get-hash)
      (write-file file port)
      ;; Flush pending output before asking for the digest and the position.
      (force-output port)
      (let* ((digest   (get-hash))
             (nar-size (port-position port)))
        (close-port port)
        (values digest nar-size)))))
(define (tempname-in directory)
  "Return a file name under DIRECTORY, of the form
\".tmp-link-PID-RANDOM\", that did not exist at the time it was checked.
Nothing is created, so another process may still claim the name before the
caller does; callers must be prepared for that."
  (define prefix
    ;; Part of the name that is the same for every attempt.
    (string-append directory "/.tmp-link-"
                   (number->string (getpid)) "-"))

  (define (candidate)
    ;; A fresh name with a random hexadecimal suffix.
    (string-append prefix
                   (number->string (random most-positive-fixnum) 16)))

  (let retry ((name (candidate)))
    (if (file-exists? name)
        (retry (candidate))
        name)))
(define* (get-temp-link target #:optional (link-prefix (dirname target)))
  "Create a hard link to TARGET under a temporary name and return that name;
this is the hard-link counterpart of 'mkstemp!'.  Hard links cannot cross file
systems, so the link has to live on the same file system as TARGET:
LINK-PREFIX is the directory, on that file system, where it is created."
  (define (attempt)
    (let ((candidate (tempname-in link-prefix)))
      (catch 'system-error
        (lambda ()
          (link target candidate)
          candidate)
        (lambda args
          ;; EEXIST means someone grabbed CANDIDATE in the meantime: pick
          ;; another name.  Any other error is for the caller to handle.
          (if (= EEXIST (system-error-errno args))
              (attempt)
              (apply throw args))))))

  (attempt))
(define (call-with-writable-file file store thunk)
  "Call THUNK with FILE made writable, and return its result.  When THUNK's
dynamic extent is left, FILE's timestamps and mode are reset to what 'lstat'
reported beforehand.  When FILE is STORE itself, THUNK is simply called and
permissions are left alone."
  (cond ((string=? file store)
         ;; Don't meddle with the store's permissions.
         (thunk))
        (else
         (let ((st (lstat file)))
           (dynamic-wind
             (lambda ()
               (make-file-writable file))
             thunk
             (lambda ()
               ;; Timestamps first, then the original mode.
               (set-file-time file st)
               (chmod file (stat:mode st))))))))
(define-syntax with-writable-file
  (syntax-rules ()
    "Make FILE writable for the dynamic extent of EXP..., except if FILE is the
store."
    ((_ file store exp ...)
     ;; Wrap the body in a thunk and let the procedure do the actual work.
     (call-with-writable-file file store (lambda () exp ...)))))
;; There are 3 main kinds of errors we can get from hardlinking: "Too many
;; things link to this" (EMLINK), "this link already exists" (EEXIST), and
;; "can't fit more stuff in this directory" (ENOSPC).
(define* (replace-with-link target to-replace
                            #:key (swap-directory (dirname target))
                            (store (%store-directory)))
  "Atomically replace the file TO-REPLACE with a hard link to TARGET.  A
temporary link to TARGET is first created in SWAP-DIRECTORY and then renamed
onto TO-REPLACE; the directory containing TO-REPLACE is made writable for the
duration of the rename, unless it is STORE.  Upon ENOSPC and EMLINK,
TO-REPLACE is left unchanged and no error is raised.

Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
  (define (make-temp-link)
    ;; Return the name of a temporary link to TARGET, or #f when it could not
    ;; be created for a benign reason.
    (catch 'system-error
      (lambda ()
        (get-temp-link target swap-directory))
      (lambda args
        ;; ENOSPC: no room for one more entry in SWAP-DIRECTORY.
        ;; EMLINK: TARGET has reached its maximum number of links.
        (if (memv (system-error-errno args) (list ENOSPC EMLINK))
            #f
            (apply throw args)))))

  (let ((temp-link (make-temp-link)))
    ;; Without a temporary link there is nothing to do: TO-REPLACE just
    ;; won't be deduplicated.
    (when temp-link
      (with-writable-file (dirname to-replace) store
        (catch 'system-error
          (lambda ()
            (rename-file temp-link to-replace))
          (lambda args
            ;; The rename failed: do not leave the temporary link behind.
            (delete-file temp-link)
            (unless (= EMLINK (system-error-errno args))
              (apply throw args))))))))
(define %deduplication-minimum-size
  ;; Threshold, in bytes (8 KiB), under which files are left alone.  Tiny
  ;; files would only bloat '.links', making 'removeUnusedLinks' slower for
  ;; a negligible gain in space.  Keep in sync with optimize-store.cc.
  (* 8 1024))
(define* (deduplicate path hash #:key (store (%store-directory)))
  "Check if a store item with sha256 hash HASH already exists.  If so,
replace PATH with a hardlink to the already-existing one.  If not, register
PATH so that future duplicates can hardlink to it.  PATH is assumed to be
under STORE.

When PATH is a directory, HASH is ignored and the directory is traversed
recursively: sub-directories are descended into, and regular files of at least
'%deduplication-minimum-size' bytes are hashed with 'nar-sha256' and
deduplicated; all other entries are skipped.  ENOSPC and EMLINK while
registering PATH are not errors: PATH is simply left as is.  Other
'system-error' exceptions are propagated."
  ;; Lightweight promises.  Note that a #f result is not memoized; that is
  ;; fine here since they only ever wrap 'lstat' calls.
  (define-syntax-rule (delay exp)
    (let ((value #f))
      (lambda ()
        (unless value
          (set! value exp))
        value)))
  (define-syntax-rule (force promise)
    (promise))

  (define links-directory
    (string-append store "/.links"))

  (let loop ((path path)
             (type (stat:type (lstat path)))
             (hash hash))
    (if (eq? 'directory type)
        ;; Can't hardlink directories, so hardlink their atoms.
        (for-each (match-lambda
                    ((file . properties)
                     (unless (member file '("." ".."))
                       (let* ((file (string-append path "/" file))
                              ;; Only call 'lstat' when the directory entry
                              ;; does not already tell us what we need.
                              (st   (delay (lstat file)))
                              (type (match (assoc-ref properties 'type)
                                      ((or 'unknown #f)
                                       (stat:type (force st)))
                                      (type type))))
                         (when (or (eq? 'directory type)
                                   (and (eq? 'regular type)
                                        (>= (stat:size (force st))
                                            %deduplication-minimum-size)))
                           (loop file type
                                 ;; Directories have no hash of their own.
                                 (and (not (eq? 'directory type))
                                      (nar-sha256 file))))))))
                  (scandir* path))
        (let ((link-file (string-append links-directory "/"
                                        (bytevector->nix-base32-string hash))))
          (if (file-exists? link-file)
              (replace-with-link link-file path
                                 #:swap-directory links-directory
                                 #:store store)
              (catch 'system-error
                (lambda ()
                  (link path link-file))
                (lambda args
                  (let ((errno (system-error-errno args)))
                    (cond ((= errno EEXIST)
                           ;; Someone else put an entry for PATH in
                           ;; LINKS-DIRECTORY before we could.  Let's use it:
                           ;; replace PATH with a link to that entry, exactly
                           ;; as when the entry was found up front.  (Doing it
                           ;; the other way around would overwrite the entry
                           ;; that other files may already be linked to.)
                           (replace-with-link link-file path
                                              #:swap-directory
                                              links-directory
                                              #:store store))
                          ((= errno ENOENT)
                           ;; This most likely means that LINKS-DIRECTORY does
                           ;; not exist.  If so, create it and try again.  If
                           ;; it does exist, the error has another cause
                           ;; (e.g., PATH vanished) and retrying would loop
                           ;; forever, so propagate it instead.
                           (if (file-exists? links-directory)
                               (apply throw args)
                               (begin
                                 (mkdir-p links-directory)
                                 (loop path type hash))))
                          ((= errno ENOSPC)
                           ;; There's not enough room in the directory index for
                           ;; more entries in .links, but that's fine: we can
                           ;; just stop.
                           #f)
                          ((= errno EMLINK)
                           ;; PATH has reached the maximum number of links, but
                           ;; that's OK: we just can't deduplicate it more.
                           #f)
                          (else (apply throw args)))))))))))
(define (tee input len output)
  "Return a binary input port that delivers the next LEN bytes of INPUT and,
as a side effect, writes every byte it delivers to OUTPUT.  The returned port
reports end-of-file once LEN bytes have gone through.  If INPUT reaches
end-of-file earlier, a '&nar-error' condition is raised."
  (define consumed 0)                    ;bytes taken from INPUT so far

  (define (premature-eof)
    ;; INPUT ended before LEN bytes could be read from it.
    (raise (condition
            (&nar-error (port input)
                        (file (port-filename output))))))

  (define (read! bv start count)
    ;; Never go past LEN bytes in total.
    (define wanted
      (min count (- len consumed)))

    (let retry ()
      (let ((n (get-bytevector-n! input bv start wanted)))
        (cond ((eof-object? n)
               (if (= consumed len)
                   0                     ;EOF
                   (premature-eof)))
              ((and (zero? n) (> wanted 0))
               ;; Returning zero would be taken for EOF, so read again.
               (retry))
              (else
               (put-bytevector output bv start n)
               (set! consumed (+ consumed n))
               n)))))

  (make-custom-binary-input-port "tee input port" read! #f #f #f))
(define* (dump-file/deduplicate file input size type
                                #:key (store (%store-directory)))
  "Create FILE with the next SIZE bytes of INPUT as its contents.  TYPE is a
symbol, either 'regular or 'executable.  When SIZE is at least
'%deduplication-minimum-size', the nar hash of FILE is computed while its
contents are being written, and FILE is then passed to 'deduplicate' in STORE;
smaller files are merely written.

This procedure is suitable as a #:dump-file argument to 'restore-file'.  When
used that way, it deduplicates files on the fly as they are restored, thereby
removing the need for a deduplication pass that would re-read all the files
down the road."
  (define (write-and-hash output)
    ;; Serialize FILE into a hashing port, taking its contents from INPUT
    ;; through a 'tee' port so that they land in OUTPUT at the same time.
    ;; Return the hash.
    (let-values (((hash-port get-hash)
                  (open-hash-port (hash-algorithm sha256))))
      (write-file-tree file hash-port
                       #:file-type+size (lambda (_) (values type size))
                       #:file-port
                       (const (tee input size output)))
      (close-port hash-port)
      (get-hash)))

  (define (copy-verbatim output)
    ;; Plain copy, no hashing.
    (dump-port input output size))

  (if (< size %deduplication-minimum-size)
      (call-with-output-file file copy-verbatim)
      ;; FILE is fully written and closed by the time it is deduplicated.
      (let ((hash (call-with-output-file file write-and-hash)))
        (deduplicate file hash #:store store))))
(define* (copy-file/deduplicate source target
                                #:key (store (%store-directory)))
  "Copy the contents of SOURCE to TARGET like 'copy-file' would, and
additionally deduplicate TARGET in STORE.  The work is delegated to
'dump-file/deduplicate', which is told that TARGET is 'executable when SOURCE
has its owner-execute bit set, and 'regular otherwise."
  (call-with-input-file source
    (lambda (input)
      (let* ((st   (stat input))
             ;; #o100 is the owner-execute permission bit.
             (type (if (logtest (stat:mode st) #o100)
                       'executable
                       'regular)))
        (dump-file/deduplicate target input (stat:size st) type
                               #:store store)))))
|