#!/usr/bin/env python3
#
# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.
import argparse
import calendar
import datetime
import errno
import fcntl
import hashlib
import json
import os
import os.path
import re
import shutil
import ssl
import subprocess
import sys
import time
import urllib.request

TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')


class PathException(Exception): pass
class DownloadGitHubError(Exception): pass


class Path(object):
    """Context class for preparing and cleaning up directories.

    If ``preclean`` is ``True``, ``path`` will be removed on context enter.

    If ``isdir`` is ``True``, ``path`` will be created as a directory on
    context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """
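
    # A minimal usage sketch (illustrative only; the path is hypothetical):
    #
    #     with Path('/tmp/dl/pkg.untar', preclean=True) as d:
    #         ...                      # work inside d.path, created on enter
    #     # d.path is removed again here because ``keep`` defaults to False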

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p."""
        names = os.path.split(path)
        p = ''
        for name in names:
            p = os.path.join(p, name)
            Path._mkdir(p)

    @staticmethod
    def _rmdir_dir(dir_):
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
        return ``default``; otherwise, re-raise.
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == errno:
                return default
            else:
                raise
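
    # Illustrative behaviour of the wrappers above that are built on _os_func
    # (paths are hypothetical):
    #
    #     Path._mkdir('/tmp')            # no-op, EEXIST is swallowed
    #     Path._listdir('/no/such/dir')  # returns [] instead of raising
    #     Path._remove('/no/such/file')  # no-op, ENOENT is swallowed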

    @staticmethod
    def rm_all(path):
        """Same as rm -r."""
        if os.path.islink(path):
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract tarball at ``path`` into directory ``into``.

        Return the name of the single top-level subdir if there is exactly
        one; otherwise raise PathException.
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        else:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
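
    # Note: a GitHub API tarball normally unpacks into a single top-level
    # directory (typically named like <owner>-<repo>-<short commit id>; the
    # exact naming is an assumption here), which is why exactly one subdir is
    # expected and its name is returned for the caller to rename.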

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``path`` into tarball ``into``."""
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
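
    # As a sketch, with into='/tmp/dl/foo.tar.xz', subdir='foo' and
    # ts=1526285654 (hypothetical values), the command built above is roughly:
    #
    #     XZ_OPT=-7e tar --numeric-owner --owner=0 --group=0 --sort=name \
    #         --mode=a-s -C <path> -cf /tmp/dl/foo.tar.xz foo \
    #         --mtime=@1526285654 -J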


class GitHubCommitTsCache(object):
    __cachef = 'github.commit.ts.cache'
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``."""
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        cache = cache[:self.__cachen]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)
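
    # Cache file layout sketch: one "<key> <commit-ts> <updated-ts>" entry per
    # line, keyed by the GitHub API url path, e.g. (hypothetical values):
    #
    #     /repos/owner/repo/git/commits/0123abcd 1519910074 1526285654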


class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with a fragile internet connection.

    However, this method has limitations:

     - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
       This affects fetching the commit date for reproducible tarballs.
       Download through the archive link is not affected.

     - GitHub archives do not contain source code for submodules.

     - GitHub archives seem to respect .gitattributes and ignore paths with
       export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback
    to the clone-then-pack method.

    As for the third issue, to make sure that this method only produces
    tarballs identical to those of the fallback method, the expected hash
    value is required to be supplied.  That means the first tarball will need
    to be prepared by the clone-then-pack method.
    """
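
    # Typical invocation sketch, with hypothetical values (the flags are the
    # ones defined in main() below):
    #
    #     dl_github_archive.py --dl-dir=dl \
    #         --url=https://github.com/owner/repo.git \
    #         --subdir=repo-1.0 --version=<40-char commit id> \
    #         --source=repo-1.0.tar.xz --hash=<expected sha256sum>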

    __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball."""
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')
        self.xhash = xhash

    def _hash_check(self, f):
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data.  API[1] is
        # more terse while API[2] provides more verbose info such as the
        # commit diff etc.  That's the main reason why API[1] is preferred:
        # the response size is predictable.
        #
        # However, API[1] only accepts a complete commit sha1sum as the
        # parameter, while API[2] is more liberal, also accepting partial
        # commit ids, tags, etc.
        #
        # [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
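        #
        # As an illustration (owner/repo hypothetical), the two url paths
        # built below look like:
        #   [1] /repos/owner/repo/git/commits/<40-char commit id>
        #   [2] /repos/owner/repo/commits/<ref>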
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                reasons += '\n' + (" {}: {}".format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        return DownloadGitHubError('{}: {}'.format(self.source, msg))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    parser.add_argument('--url', help='Download URL')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
    parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"')
    args = parser.parse_args()
    try:
        method = DownloadGitHubTarball(args)
        method.download()
    except Exception as ex:
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)


if __name__ == '__main__':
    main()