http.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. # Copyright 2013 The Distro Tracker Developers
  2. # See the COPYRIGHT file at the top-level directory of this distribution and
  3. # at http://deb.li/DTAuthors
  4. #
  5. # This file is part of Distro Tracker. It is subject to the license terms
  6. # in the LICENSE file found in the top-level directory of this
  7. # distribution and at http://deb.li/DTLicense. No part of Distro Tracker,
  8. # including this file, may be copied, modified, propagated, or distributed
  9. # except according to the terms contained in the LICENSE file.
  10. """
  11. Utilities for handling HTTP resource access.
  12. """
  13. from __future__ import unicode_literals
  14. from hashlib import md5
  15. from django.utils import timezone
  16. from django.utils.http import parse_http_date
  17. from django.conf import settings
  18. import os
  19. import time
  20. import json
  21. from requests.structures import CaseInsensitiveDict
  22. import requests
  23. def parse_cache_control_header(header):
  24. """
  25. Parses the given Cache-Control header's values.
  26. :returns: The key-value pairs found in the header.
  27. If some key did not have an associated value in the header, ``None``
  28. is used instead.
  29. :rtype: dict
  30. """
  31. parts = header.split(',')
  32. cache_control = {}
  33. for part in parts:
  34. part = part.strip()
  35. if '=' not in part:
  36. cache_control[part] = None
  37. continue
  38. key, value = part.split('=', 1)
  39. cache_control[key] = value
  40. return cache_control
  41. class HttpCache(object):
  42. """
  43. A class providing an interface to a cache of HTTP responses.
  44. """
  45. def __init__(self, cache_directory_path):
  46. self.cache_directory_path = cache_directory_path
  47. def __contains__(self, item):
  48. cache_file_name = self._content_cache_file_path(item)
  49. return os.path.exists(cache_file_name)
  50. def is_expired(self, url):
  51. """
  52. If the cached response for the given URL is expired based on
  53. Cache-Control or Expires headers, returns True.
  54. """
  55. if url not in self:
  56. return True
  57. headers = self.get_headers(url)
  58. # First check if the Cache-Control header has set a max-age
  59. if 'cache-control' in headers:
  60. cache_control = parse_cache_control_header(headers['cache-control'])
  61. if 'max-age' in cache_control:
  62. max_age = int(cache_control['max-age'])
  63. response_age = int(
  64. os.stat(self._header_cache_file_path(url)).st_mtime)
  65. current_timestamp = int(time.time())
  66. return current_timestamp - response_age >= max_age
  67. # Alternatively, try the Expires header
  68. if 'expires' in headers:
  69. expires_date = timezone.datetime.utcfromtimestamp(
  70. parse_http_date(headers['expires']))
  71. expires_date = timezone.make_aware(expires_date, timezone.utc)
  72. current_date = timezone.now()
  73. return current_date > expires_date
  74. # If there is no cache freshness date consider the item expired
  75. return True
  76. def get_content(self, url):
  77. """
  78. Returns the content of the cached response for the given URL.
  79. :rtype: :class:`bytes`
  80. """
  81. if url in self:
  82. with open(self._content_cache_file_path(url), 'rb') as content_file:
  83. return content_file.read()
  84. def get_headers(self, url):
  85. """
  86. Returns the HTTP headers of the cached response for the given URL.
  87. :rtype: dict
  88. """
  89. if url in self:
  90. with open(self._header_cache_file_path(url), 'r') as header_file:
  91. return CaseInsensitiveDict(json.load(header_file))
  92. else:
  93. return {}
  94. def remove(self, url):
  95. """
  96. Removes the cached response for the given URL.
  97. """
  98. if url in self:
  99. os.remove(self._content_cache_file_path(url))
  100. os.remove(self._header_cache_file_path(url))
  101. def update(self, url, force=False):
  102. """
  103. Performs an update of the cached resource. This means that it validates
  104. that its most current version is found in the cache by doing a
  105. conditional GET request.
  106. :param force: To force the method to perform a full GET request, set
  107. the parameter to ``True``
  108. :returns: The original HTTP response and a Boolean indicating whether
  109. the cached value was updated.
  110. :rtype: two-tuple of (:class:`requests.Response`, ``Boolean``)
  111. """
  112. cached_headers = self.get_headers(url)
  113. headers = {}
  114. if not force:
  115. if 'last-modified' in cached_headers:
  116. headers['If-Modified-Since'] = cached_headers['last-modified']
  117. if 'etag' in cached_headers:
  118. headers['If-None-Match'] = cached_headers['etag']
  119. else:
  120. # Ask all possible intermediate proxies to return a fresh response
  121. headers['Cache-Control'] = 'no-cache'
  122. response = requests.get(url, headers=headers, verify=False,
  123. allow_redirects=True)
  124. # Invalidate previously cached value if the response is not valid now
  125. if not response.ok:
  126. self.remove(url)
  127. elif response.status_code == 200:
  128. # Dump the content and headers only if a new response is generated
  129. with open(self._content_cache_file_path(url), 'wb') as content_file:
  130. content_file.write(response.content)
  131. with open(self._header_cache_file_path(url), 'w') as header_file:
  132. json.dump(dict(response.headers), header_file)
  133. return response, response.status_code != 304
  134. def _content_cache_file_path(self, url):
  135. return os.path.join(self.cache_directory_path, self._url_hash(url))
  136. def _header_cache_file_path(self, url):
  137. url_hash = self._url_hash(url)
  138. header_file_name = url_hash + '.headers'
  139. return os.path.join(self.cache_directory_path, header_file_name)
  140. def _url_hash(self, url):
  141. return md5(url.encode('utf-8')).hexdigest()
  142. def get_resource_content(url, cache=None):
  143. """
  144. A helper function which returns the content of the resource found at the
  145. given URL.
  146. If the resource is already cached in the ``cache`` object and the cached
  147. content has not expired, the function will not do any HTTP requests and
  148. will return the cached content.
  149. If the resource is stale or not cached at all, it is from the Web.
  150. :param url: The URL of the resource to be retrieved
  151. :param cache: A cache object which should be used to look up and store
  152. the cached resource. If it is not provided, an instance of
  153. :class:`HttpCache` with a
  154. ``DISTRO_TRACKER_CACHE_DIRECTORY`` cache directory
  155. is used.
  156. :type cache: :class:`HttpCache` or an object with an equivalent interface
  157. :returns: The bytes representation of the resource found at the given url
  158. :rtype: bytes
  159. """
  160. if cache is None:
  161. cache_directory_path = settings.DISTRO_TRACKER_CACHE_DIRECTORY
  162. cache = HttpCache(cache_directory_path)
  163. try:
  164. if cache.is_expired(url):
  165. cache.update(url)
  166. return cache.get_content(url)
  167. except:
  168. pass