packages.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. # Copyright 2013 The Distro Tracker Developers
  2. # See the COPYRIGHT file at the top-level directory of this distribution and
  3. # at http://deb.li/DTAuthors
  4. #
  5. # This file is part of Distro Tracker. It is subject to the license terms
  6. # in the LICENSE file found in the top-level directory of this
  7. # distribution and at http://deb.li/DTLicense. No part of Distro Tracker,
  8. # including this file, may be copied, modified, propagated, or distributed
  9. # except according to the terms contained in the LICENSE file.
  10. """Utilities for processing Debian package information."""
  11. from __future__ import unicode_literals
  12. from distro_tracker.core.utils.email_messages import (
  13. name_and_address_from_string as parse_address,
  14. names_and_addresses_from_string as parse_addresses
  15. )
  16. from django.conf import settings
  17. from django.utils.encoding import force_bytes
  18. from debian import deb822
  19. from distro_tracker.core.utils import extract_tar_archive
  20. import os
  21. import apt
  22. import shutil
  23. import apt_pkg
  24. import subprocess
  25. def package_hashdir(package_name):
  26. """
  27. Returns the name of the hash directory used to avoid having too
  28. many entries in a single directory. It's usually the first letter
  29. of the package except for lib* packages where it's the first 4
  30. letters.
  31. :param package_name: The package name.
  32. :type package_name: str
  33. :returns: Name of the hash directory.
  34. :rtype: str
  35. """
  36. if package_name is None:
  37. return None
  38. if package_name.startswith('lib'):
  39. return package_name[0:4]
  40. else:
  41. return package_name[0:1]
  42. def extract_vcs_information(stanza):
  43. """
  44. Extracts the VCS information from a package's Sources entry.
  45. :param stanza: The ``Sources`` entry from which to extract the VCS info.
  46. Maps ``Sources`` key names to values.
  47. :type stanza: dict
  48. :returns: VCS information regarding the package. Contains the following
  49. keys: type[, browser, url]
  50. :rtype: dict
  51. """
  52. vcs = {}
  53. for key, value in stanza.items():
  54. key = key.lower()
  55. if key == 'vcs-browser':
  56. vcs['browser'] = value
  57. elif key.startswith('vcs-'):
  58. vcs['type'] = key[4:]
  59. vcs['url'] = value
  60. return vcs
  61. def extract_dsc_file_name(stanza):
  62. """
  63. Extracts the name of the .dsc file from a package's Sources entry.
  64. :param stanza: The ``Sources`` entry from which to extract the VCS info.
  65. Maps ``Sources`` key names to values.
  66. :type stanza: dict
  67. """
  68. for field in ('checksums-sha256', 'checksums-sha1', 'files'):
  69. for entry in stanza.get(field, []):
  70. if entry.get('name', '').endswith('.dsc'):
  71. return entry['name']
  72. return None
  73. def extract_information_from_sources_entry(stanza):
  74. """
  75. Extracts information from a ``Sources`` file entry and returns it in the
  76. form of a dictionary.
  77. :param stanza: The raw entry's key-value pairs.
  78. :type stanza: Case-insensitive dict
  79. """
  80. binaries = [
  81. binary.strip()
  82. for binary in stanza['binary'].split(',')
  83. ]
  84. entry = {
  85. 'version': stanza['version'],
  86. 'homepage': stanza.get('homepage', ''),
  87. 'priority': stanza.get('priority', ''),
  88. 'section': stanza.get('section', ''),
  89. 'architectures': stanza['architecture'].split(),
  90. 'binary_packages': binaries,
  91. 'maintainer': parse_address(stanza['maintainer']),
  92. 'uploaders': parse_addresses(stanza.get('uploaders', '')),
  93. 'standards_version': stanza.get('standards-version', ''),
  94. 'vcs': extract_vcs_information(stanza),
  95. 'dsc_file_name': extract_dsc_file_name(stanza),
  96. 'directory': stanza.get('directory', ''),
  97. }
  98. return entry
  99. def extract_information_from_packages_entry(stanza):
  100. """
  101. Extracts information from a ``Packages`` file entry and returns it in the
  102. form of a dictionary.
  103. :param stanza: The raw entry's key-value pairs.
  104. :type stanza: Case-insensitive dict
  105. """
  106. entry = {
  107. 'version': stanza['version'],
  108. 'short_description': stanza.get('description', '')[:300],
  109. }
  110. return entry
  111. class SourcePackageRetrieveError(Exception):
  112. pass
  113. class AptCache(object):
  114. """
  115. A class for handling cached package information.
  116. """
  117. DEFAULT_MAX_SIZE = 1 * 1024 ** 3 # 1 GiB
  118. QUILT_FORMAT = '3.0 (quilt)'
  119. class AcquireProgress(apt.progress.base.AcquireProgress):
  120. """
  121. Instances of this class can be passed to :meth:`apt.cache.Cache.update`
  122. calls.
  123. It provides a way to track which files were changed and which were not
  124. by an update operation.
  125. """
  126. def __init__(self, *args, **kwargs):
  127. super(AptCache.AcquireProgress, self).__init__(*args, **kwargs)
  128. self.fetched = []
  129. self.hit = []
  130. def done(self, item):
  131. self.fetched.append(os.path.split(item.owner.destfile)[1])
  132. def ims_hit(self, item):
  133. self.hit.append(os.path.split(item.owner.destfile)[1])
  134. def pulse(self, owner):
  135. return True
  136. def __init__(self):
  137. # The root cache directory is a subdirectory in the
  138. # DISTRO_TRACKER_CACHE_DIRECTORY
  139. self.cache_root_dir = os.path.join(
  140. settings.DISTRO_TRACKER_CACHE_DIRECTORY,
  141. 'apt-cache'
  142. )
  143. self.sources_list_path = os.path.join(
  144. self.cache_root_dir, 'etc', 'sources.list')
  145. self.conf_file_path = os.path.join(self.cache_root_dir,
  146. 'etc', 'apt.conf')
  147. os.environ['APT_CONFIG'] = self.conf_file_path
  148. self.sources = []
  149. self.packages = []
  150. self.cache_max_size = getattr(
  151. settings, 'DISTRO_TRACKER_APT_CACHE_MAX_SIZE',
  152. self.DEFAULT_MAX_SIZE)
  153. #: The directory where source package files are cached
  154. self.source_cache_directory = os.path.join(self.cache_root_dir,
  155. 'packages')
  156. self._cache_size = None # Evaluate the cache size lazily
  157. self.configure_cache()
  158. @property
  159. def cache_size(self):
  160. if self._cache_size is None:
  161. self._cache_size = \
  162. self.get_directory_size(self.source_cache_directory)
  163. return self._cache_size
  164. def get_directory_size(self, directory_path):
  165. """
  166. Returns the total space taken by the given directory in bytes.
  167. :param directory_path: The path to the directory
  168. :type directory_path: string
  169. :rtype: int
  170. """
  171. # Convert the directory path to bytes to make sure all os calls deal
  172. # with bytes, not unicode objects.
  173. # This way any file names with invalid utf-8 names, are correctly
  174. # handled, without causing an error.
  175. directory_path = force_bytes(directory_path)
  176. total_size = 0
  177. for dirpath, dirnames, filenames in os.walk(directory_path):
  178. for file_name in filenames:
  179. file_path = os.path.join(dirpath, file_name)
  180. stat = os.lstat(file_path)
  181. total_size += stat.st_size
  182. return total_size
  183. def clear_cache(self):
  184. """
  185. Removes all cache information. This causes the next update to retrieve
  186. fresh repository files.
  187. """
  188. self._remove_dir(self.cache_root_dir)
  189. self.configure_cache()
  190. def update_sources_list(self):
  191. """
  192. Updates the ``sources.list`` file used to list repositories for which
  193. package information should be cached.
  194. """
  195. from distro_tracker.core.models import Repository
  196. directory = os.path.dirname(self.sources_list_path)
  197. if not os.path.exists(directory):
  198. os.makedirs(directory)
  199. with open(self.sources_list_path, 'w') as sources_list:
  200. for repository in Repository.objects.all():
  201. sources_list.write(repository.sources_list_entry + '\n')
  202. def update_apt_conf(self):
  203. """
  204. Updates the ``apt.conf`` file which gives general settings for the
  205. :class:`apt.cache.Cache`.
  206. In particular, this updates the list of all architectures which should
  207. be considered in package updates based on architectures that the
  208. repositories support.
  209. """
  210. from distro_tracker.core.models import Architecture
  211. with open(self.conf_file_path, 'w') as conf_file:
  212. conf_file.write('APT::Architectures { ')
  213. for architecture in Architecture.objects.all():
  214. conf_file.write('"{arch}"; '.format(arch=architecture))
  215. conf_file.write('};\n')
  216. conf_file.write('Acquire::CompressionTypes::Order:: "xz";\n')
  217. conf_file.write('Dir "{}/";\n'.format(self.cache_root_dir))
  218. conf_file.write('Dir::State "state/";\n')
  219. conf_file.write('Dir::State::status "dpkg-status";\n')
  220. conf_file.write('Dir::Etc "etc/";\n')
  221. conf_file.write('Dir::Etc::sourcelist "{src}";\n'.format(
  222. src=self.sources_list_path))
  223. conf_file.write('Dir::Etc::Trusted "{src}";\n'.format(
  224. src=settings.DISTRO_TRACKER_TRUSTED_GPG_MAIN_FILE))
  225. conf_file.write('Dir::Etc::TrustedParts "{src}";\n'.format(
  226. src=settings.DISTRO_TRACKER_TRUSTED_GPG_PARTS_DIR))
  227. def configure_cache(self):
  228. """
  229. Configures the cache based on the most current repository information.
  230. """
  231. self.update_sources_list()
  232. self.update_apt_conf()
  233. # Clean up the configuration we might have read during "import apt"
  234. for root_key in apt_pkg.config.list():
  235. apt_pkg.config.clear(root_key)
  236. # Load the proper configuration
  237. apt_pkg.init()
  238. # Ensure we have the required directories
  239. for apt_dir in [apt_pkg.config.find_dir('Dir::State::lists'),
  240. apt_pkg.config.find_dir('Dir::Etc::sourceparts'),
  241. apt_pkg.config.find_dir('Dir::Cache::archives')]:
  242. if not os.path.exists(apt_dir):
  243. os.makedirs(apt_dir)
  244. def _index_file_full_path(self, file_name):
  245. """
  246. Returns the absolute path for the given cached index file.
  247. :param file_name: The name of the cached index file.
  248. :type file_name: string
  249. :rtype: string
  250. """
  251. return os.path.join(
  252. apt_pkg.config.find_dir('Dir::State::lists'),
  253. file_name
  254. )
  255. def _match_index_file_to_repository(self, sources_file):
  256. """
  257. Returns the :class:`Repository <distro_tracker.core.models.Repository>`
  258. instance which matches the given cached ``Sources`` file.
  259. :rtype: :class:`Repository <distro_tracker.core.models.Repository>`
  260. """
  261. from distro_tracker.core.models import Repository
  262. sources_list = apt_pkg.SourceList()
  263. sources_list.read_main_list()
  264. component_url = None
  265. for entry in sources_list.list:
  266. for index_file in entry.index_files:
  267. if os.path.basename(sources_file) in index_file.describe:
  268. base_url, component, _ = index_file.describe.split(None, 2)
  269. base_url = base_url.rstrip('/')
  270. component_url = base_url + '/' + component
  271. break
  272. for repository in Repository.objects.all():
  273. if component_url in repository.component_urls:
  274. return repository
  275. def _get_all_cached_files(self):
  276. """
  277. Returns a list of all cached files.
  278. """
  279. lists_directory = apt_pkg.config.find_dir('Dir::State::lists')
  280. try:
  281. return [
  282. os.path.join(lists_directory, file_name)
  283. for file_name in os.listdir(lists_directory)
  284. if os.path.isfile(os.path.join(lists_directory, file_name))
  285. ]
  286. except OSError:
  287. # The directory structure does not exist => nothing is cached
  288. return []
  289. def get_cached_files(self, filter_function=None):
  290. """
  291. Returns cached files, optionally filtered by the given
  292. ``filter_function``
  293. :param filter_function: Takes a file name as the only parameter and
  294. returns a :class:`bool` indicating whether it should be included
  295. in the result.
  296. :type filter_function: callable
  297. :returns: A list of cached file names
  298. :rtype: list
  299. """
  300. if filter_function is None:
  301. # Include all files if the filter function is not provided
  302. def filter_function(x):
  303. return True
  304. return [
  305. file_name
  306. for file_name in self._get_all_cached_files()
  307. if filter_function(file_name)
  308. ]
  309. def get_sources_files_for_repository(self, repository):
  310. """
  311. Returns all ``Sources`` files which are cached for the given
  312. repository.
  313. For instance, ``Sources`` files for different suites are cached
  314. separately.
  315. :param repository: The repository for which to return all cached
  316. ``Sources`` files
  317. :type repository: :class:`Repository
  318. <distro_tracker.core.models.Repository>`
  319. :rtype: ``iterable`` of strings
  320. """
  321. return self.get_cached_files(
  322. lambda file_name: (
  323. file_name.endswith('Sources') and
  324. self._match_index_file_to_repository(file_name) == repository))
  325. def get_packages_files_for_repository(self, repository):
  326. """
  327. Returns all ``Packages`` files which are cached for the given
  328. repository.
  329. For instance, ``Packages`` files for different suites are cached
  330. separately.
  331. :param repository: The repository for which to return all cached
  332. ``Packages`` files
  333. :type repository: :class:`Repository
  334. <distro_tracker.core.models.Repository>`
  335. :rtype: ``iterable`` of strings
  336. """
  337. return self.get_cached_files(
  338. lambda file_name: (
  339. file_name.endswith('Packages') and
  340. self._match_index_file_to_repository(file_name) == repository))
  341. def update_repositories(self, force_download=False):
  342. """
  343. Initiates a cache update.
  344. :param force_download: If set to ``True`` causes the cache to be
  345. cleared before starting the update, thus making sure all index
  346. files are downloaded again.
  347. :returns: A two-tuple ``(updated_sources, updated_packages)``. Each of
  348. the tuple's members is a list of
  349. (:class:`Repository <distro_tracker.core.models.Repository>`,
  350. ``file_name``) pairs representing the repository which was updated
  351. and the file which contains the fresh information. The file is
  352. either a ``Sources`` or a ``Packages`` file, respectively.
  353. """
  354. if force_download:
  355. self.clear_cache()
  356. self.configure_cache()
  357. cache = apt.Cache(rootdir=self.cache_root_dir)
  358. progress = AptCache.AcquireProgress()
  359. cache.update(progress)
  360. updated_sources = []
  361. updated_packages = []
  362. for fetched_file in progress.fetched:
  363. if fetched_file.endswith('Sources'):
  364. dest = updated_sources
  365. elif fetched_file.endswith('Packages'):
  366. dest = updated_packages
  367. else:
  368. continue
  369. repository = self._match_index_file_to_repository(fetched_file)
  370. dest.append((
  371. repository, self._index_file_full_path(fetched_file)
  372. ))
  373. return updated_sources, updated_packages
  374. def _get_format(self, record):
  375. """
  376. Returns the Format field value of the given source package record.
  377. """
  378. record = deb822.Deb822(record)
  379. return record['format']
  380. def _extract_quilt_package_debian_tar(self, debian_tar_path, outdir):
  381. """
  382. Extracts the given tarball to the given output directory.
  383. """
  384. extract_tar_archive(debian_tar_path, outdir)
  385. def get_package_source_cache_directory(self, package_name):
  386. """
  387. Returns the path to the directory where a particular source package is
  388. cached.
  389. :param package_name: The name of the source package
  390. :type package_name: string
  391. :rtype: string
  392. """
  393. package_hash = (
  394. package_name[0]
  395. if not package_name.startswith('lib') else
  396. package_name[:4]
  397. )
  398. return os.path.join(
  399. self.source_cache_directory,
  400. package_hash,
  401. package_name)
  402. def get_source_version_cache_directory(self, package_name, version):
  403. """
  404. Returns the path to the directory where a particular source package
  405. version files are extracted.
  406. :param package_name: The name of the source package
  407. :type package_name: string
  408. :param version: The version of the source package
  409. :type version: string
  410. :rtype: string
  411. """
  412. package_dir = self.get_package_source_cache_directory(package_name)
  413. return os.path.join(package_dir, package_name + '-' + version)
  414. def _remove_dir(self, directory_path):
  415. """
  416. Removes the given directory, including any subdirectories and files.
  417. The method makes sure to correctly handle the situation where the
  418. directory contains files with names which are invalid utf-8.
  419. """
  420. # Convert the directory path to bytes to make sure all os calls deal
  421. # with bytes, not unicode objects.
  422. # This way any file names with invalid utf-8 names, are correctly
  423. # handled, without causing an error.
  424. directory_path = force_bytes(directory_path)
  425. if os.path.exists(directory_path):
  426. shutil.rmtree(directory_path)
  427. def clear_cached_sources(self):
  428. """
  429. Clears all cached package source files.
  430. """
  431. self._remove_dir(self.source_cache_directory)
  432. self._cache_size = self.get_directory_size(self.source_cache_directory)
  433. def _get_apt_source_records(self, source_name, version):
  434. """
  435. Returns a :class:`apt_pkg.SourceRecords` instance where the given
  436. source package is the current working record.
  437. """
  438. apt.Cache(rootdir=self.cache_root_dir) # must be pre-created
  439. source_records = apt_pkg.SourceRecords()
  440. source_records.restart()
  441. # Find the cached record matching this source package and version
  442. found = False
  443. while source_records.lookup(source_name):
  444. if source_records.version == version:
  445. found = True
  446. break
  447. if not found:
  448. # Package version does not exist in the cache
  449. raise SourcePackageRetrieveError(
  450. "Could not retrieve package {pkg} version {ver}:"
  451. " No such version found in the cache".format(
  452. pkg=source_name, ver=version))
  453. return source_records
  454. def _extract_dpkg_source(self, retrieved_files, outdir):
  455. """
  456. Uses dpkg-source to extract the source package.
  457. """
  458. dsc_file_path = next(
  459. file_path
  460. for file_path in retrieved_files
  461. if file_path.endswith('.dsc'))
  462. dsc_file_path = os.path.abspath(dsc_file_path)
  463. outdir = os.path.abspath(outdir)
  464. subprocess.check_output(["dpkg-source", "-x", dsc_file_path, outdir],
  465. stderr=subprocess.STDOUT)
  466. def _apt_acquire_package(self,
  467. source_records,
  468. dest_dir_path,
  469. debian_directory_only):
  470. """
  471. Using :class:`apt_pkg.Acquire`, retrieves the source files for the
  472. source package described by the current source_records record.
  473. :param source_records: The record describing the source package whose
  474. files should be retrieved.
  475. :type source_records: :class:`apt_pkg.Acquire`
  476. :param dest_dir_path: The path to the directory where the downloaded
  477. files should be saved.
  478. :type dest_dir_path: string
  479. :param debian_directory_only: A flag indicating whether only the debian
  480. directory should be downloaded.
  481. :returns: A list of absolute paths of all retrieved source files.
  482. :rtype: list of strings
  483. """
  484. package_format = self._get_format(source_records.record)
  485. # A reference to each AcquireFile instance must be kept
  486. files = []
  487. acquire = apt_pkg.Acquire(apt.progress.base.AcquireProgress())
  488. for md5, size, path, file_type in source_records.files:
  489. base = os.path.basename(path)
  490. dest_file_path = os.path.join(dest_dir_path, base)
  491. if debian_directory_only and package_format == self.QUILT_FORMAT:
  492. if file_type != 'diff':
  493. # Only retrieve the .debian.tar.* file for quilt packages
  494. # when only the debian directory is wanted
  495. continue
  496. files.append(apt_pkg.AcquireFile(
  497. acquire,
  498. source_records.index.archive_uri(path),
  499. md5,
  500. size,
  501. base,
  502. destfile=dest_file_path
  503. ))
  504. acquire.run()
  505. # Check if all items are correctly retrieved and build the list of file
  506. # paths.
  507. retrieved_paths = []
  508. for item in acquire.items:
  509. if item.status != item.STAT_DONE:
  510. raise SourcePackageRetrieveError(
  511. 'Could not retrieve file {file}: {error}'.format(
  512. file=item.destfile,
  513. error=item.error_text.decode('utf-8')))
  514. retrieved_paths.append(item.destfile)
  515. return retrieved_paths
  516. def retrieve_source(self, source_name, version,
  517. debian_directory_only=False):
  518. """
  519. Retrieve the source package files for the given source package version.
  520. :param source_name: The name of the source package
  521. :type source_name: string
  522. :param version: The version of the source package
  523. :type version: string
  524. :param debian_directory_only: Flag indicating if the method should try
  525. to retrieve only the debian directory of the source package. This
  526. is usually only possible when the package format is 3.0 (quilt).
  527. :type debian_directory_only: Boolean
  528. :returns: The path to the directory containing the extracted source
  529. package files.
  530. :rtype: string
  531. """
  532. if self.cache_size > self.cache_max_size:
  533. # If the maximum allowed cache size has been exceeded,
  534. # clear the cache
  535. self.clear_cached_sources()
  536. source_records = self._get_apt_source_records(source_name, version)
  537. dest_dir_path = self.get_package_source_cache_directory(source_name)
  538. if not os.path.exists(dest_dir_path):
  539. os.makedirs(dest_dir_path)
  540. # Remember the size of the directory in the beginning
  541. old_size = self.get_directory_size(dest_dir_path)
  542. # Download the source files
  543. retrieved_files = self._apt_acquire_package(
  544. source_records, dest_dir_path, debian_directory_only)
  545. # Extract the retrieved source files
  546. outdir = self.get_source_version_cache_directory(source_name, version)
  547. # dpkg-source expects this directory not to exist
  548. self._remove_dir(outdir)
  549. package_format = self._get_format(source_records.record)
  550. if debian_directory_only and package_format == self.QUILT_FORMAT:
  551. # dpkg-source cannot extract an incomplete package
  552. self._extract_quilt_package_debian_tar(retrieved_files[0], outdir)
  553. else:
  554. # Let dpkg-source handle the extraction in all other cases
  555. self._extract_dpkg_source(retrieved_files, outdir)
  556. # Update the current cache size based on the changes made by getting
  557. # this source package.
  558. new_size = self.get_directory_size(dest_dir_path)
  559. size_delta = new_size - old_size
  560. self._cache_size += size_delta
  561. return outdir