archive_dedup_pool.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. #!/usr/bin/env python
  2. # vim:set et ts=4 sw=4:
  3. """ De-duplicates files in the pool directory
  4. @contact: Debian FTP Master <ftpmaster@debian.org>
  5. @copyright: 2017 Bastian Blank <waldi@debian.org>
  6. @license: GNU General Public License version 2 or later
  7. """
  8. # This program is free software; you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation; either version 2 of the License, or
  11. # (at your option) any later version.
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program; if not, write to the Free Software
  18. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. ################################################################################
  20. from __future__ import print_function
  21. import apt_pkg
  22. import errno
  23. import os
  24. import stat
  25. import sys
  26. from daklib.dbconn import DBConn
  27. from daklib import daklog
  28. from daklib.config import Config
# Option subtree for this command; assigned by main() after the command
# line has been parsed.
Options = None
# Log handle for this command; assigned by main() and used by dedup_one().
Logger = None
  31. ################################################################################
  32. ################################################################################
  33. ################################################################################
  34. def usage(exit_code=0):
  35. print("""Usage: dak archive-dedup-pool [OPTION]...
  36. -h, --help show this help and exit.
  37. -V, --version display the version number and exit
  38. """)
  39. sys.exit(exit_code)
  40. ################################################################################
  41. def dedup_one(size, reference, *filenames):
  42. stat_reference = os.stat(reference)
  43. # safety net
  44. if stat_reference.st_size != size:
  45. raise RuntimeError('Size of {} does not match database: {} != {}'.format(
  46. reference, size, stat_reference.st_size))
  47. for filename in filenames:
  48. stat_filename = os.stat(filename)
  49. # if file is already a hard-linked, ignore
  50. if stat_reference == stat_filename:
  51. continue
  52. # safety net
  53. if stat_filename.st_size != size:
  54. raise RuntimeError('Size of {} does not match database: {} != {}'.format(
  55. filename, size, stat_filename.st_size))
  56. tempfile = filename + '.new'
  57. os.link(reference, tempfile)
  58. try:
  59. Logger.log(["deduplicate", filename, reference])
  60. os.rename(tempfile, filename)
  61. finally:
  62. try:
  63. os.unlink(tempfile)
  64. except OSError as e:
  65. if e.errno != errno.ENOENT:
  66. raise
  67. ################################################################################
  68. def dedup(session):
  69. results = session.execute("""
  70. SELECT DISTINCT *
  71. FROM (
  72. SELECT
  73. f.size,
  74. array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
  75. -- we aggregate all files with the same size, sha256sum and archive
  76. PARTITION BY f.size, f.sha256sum, a.id
  77. -- the oldest should be first
  78. ORDER by f.created
  79. -- we always want to see all rows
  80. ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
  81. )
  82. AS filenames
  83. FROM
  84. files AS f INNER JOIN
  85. files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
  86. component c ON fa.component_id = c.id INNER JOIN
  87. archive a ON fa.archive_id = a.id
  88. ) AS f
  89. -- we only care about entries with more than one filename
  90. WHERE array_length(filenames, 1) > 1
  91. """)
  92. for i in results:
  93. dedup_one(i['size'], *i['filenames'])
  94. ################################################################################
  95. def main():
  96. global Options, Logger
  97. cnf = Config()
  98. session = DBConn().session()
  99. Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")]
  100. apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)
  101. for i in ["help"]:
  102. key = "Archive-Dedup-Pool::Options::%s" % i
  103. if key not in cnf:
  104. cnf[key] = ""
  105. Options = cnf.subtree("Archive-Dedup-Pool::Options")
  106. if Options["Help"]:
  107. usage()
  108. Logger = daklog.Logger("archive-dedup-pool")
  109. dedup(session)
  110. Logger.close()
  111. ################################################################################
# Run the command only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()