nytimes.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. import hmac
  2. import hashlib
  3. import base64
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. determine_ext,
  7. float_or_none,
  8. int_or_none,
  9. js_to_json,
  10. mimetype2ext,
  11. parse_iso8601,
  12. remove_start,
  13. )
  14. class NYTimesBaseIE(InfoExtractor):
  15. _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
  16. def _extract_video_from_id(self, video_id):
  17. # Authorization generation algorithm is reverse engineered from `signer` in
  18. # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
  19. path = '/svc/video/api/v3/video/' + video_id
  20. hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
  21. video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
  22. 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
  23. 'X-NYTV': 'vhs',
  24. }, fatal=False)
  25. if not video_data:
  26. video_data = self._download_json(
  27. 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
  28. video_id, 'Downloading video JSON')
  29. title = video_data['headline']
  30. def get_file_size(file_size):
  31. if isinstance(file_size, int):
  32. return file_size
  33. elif isinstance(file_size, dict):
  34. return int(file_size.get('value', 0))
  35. else:
  36. return None
  37. urls = []
  38. formats = []
  39. subtitles = {}
  40. for video in video_data.get('renditions', []):
  41. video_url = video.get('url')
  42. format_id = video.get('type')
  43. if not video_url or format_id == 'thumbs' or video_url in urls:
  44. continue
  45. urls.append(video_url)
  46. ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
  47. if ext == 'm3u8':
  48. m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
  49. video_url, video_id, 'mp4', 'm3u8_native',
  50. m3u8_id=format_id or 'hls', fatal=False)
  51. formats.extend(m3u8_fmts)
  52. subtitles = self._merge_subtitles(subtitles, m3u8_subs)
  53. elif ext == 'mpd':
  54. continue
  55. # formats.extend(self._extract_mpd_formats(
  56. # video_url, video_id, format_id or 'dash', fatal=False))
  57. else:
  58. formats.append({
  59. 'url': video_url,
  60. 'format_id': format_id,
  61. 'vcodec': video.get('videoencoding') or video.get('video_codec'),
  62. 'width': int_or_none(video.get('width')),
  63. 'height': int_or_none(video.get('height')),
  64. 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
  65. 'tbr': int_or_none(video.get('bitrate'), 1000) or None,
  66. 'ext': ext,
  67. })
  68. thumbnails = []
  69. for image in video_data.get('images', []):
  70. image_url = image.get('url')
  71. if not image_url:
  72. continue
  73. thumbnails.append({
  74. 'url': 'http://www.nytimes.com/' + image_url,
  75. 'width': int_or_none(image.get('width')),
  76. 'height': int_or_none(image.get('height')),
  77. })
  78. publication_date = video_data.get('publication_date')
  79. timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
  80. return {
  81. 'id': video_id,
  82. 'title': title,
  83. 'description': video_data.get('summary'),
  84. 'timestamp': timestamp,
  85. 'uploader': video_data.get('byline'),
  86. 'duration': float_or_none(video_data.get('duration'), 1000),
  87. 'formats': formats,
  88. 'subtitles': subtitles,
  89. 'thumbnails': thumbnails,
  90. }
  91. class NYTimesIE(NYTimesBaseIE):
  92. _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
  93. _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
  94. _TESTS = [{
  95. 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
  96. 'md5': 'd665342765db043f7e225cff19df0f2d',
  97. 'info_dict': {
  98. 'id': '100000002847155',
  99. 'ext': 'mov',
  100. 'title': 'Verbatim: What Is a Photocopier?',
  101. 'description': 'md5:93603dada88ddbda9395632fdc5da260',
  102. 'timestamp': 1398631707,
  103. 'upload_date': '20140427',
  104. 'uploader': 'Brett Weiner',
  105. 'duration': 419,
  106. }
  107. }, {
  108. 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
  109. 'only_matching': True,
  110. }]
  111. def _real_extract(self, url):
  112. video_id = self._match_id(url)
  113. return self._extract_video_from_id(video_id)
  114. class NYTimesArticleIE(NYTimesBaseIE):
  115. _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
  116. _TESTS = [{
  117. 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
  118. 'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
  119. 'info_dict': {
  120. 'id': '100000003628438',
  121. 'ext': 'mov',
  122. 'title': 'New Minimum Wage: $70,000 a Year',
  123. 'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
  124. 'timestamp': 1429033037,
  125. 'upload_date': '20150414',
  126. 'uploader': 'Matthew Williams',
  127. }
  128. }, {
  129. 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
  130. 'md5': 'e0d52040cafb07662acf3c9132db3575',
  131. 'info_dict': {
  132. 'id': '100000004709062',
  133. 'title': 'The Run-Up: ‘He Was Like an Octopus’',
  134. 'ext': 'mp3',
  135. 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
  136. 'series': 'The Run-Up',
  137. 'episode': '‘He Was Like an Octopus’',
  138. 'episode_number': 20,
  139. 'duration': 2130,
  140. }
  141. }, {
  142. 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
  143. 'info_dict': {
  144. 'id': '100000004709479',
  145. 'title': 'The Rise of Hitler',
  146. 'ext': 'mp3',
  147. 'description': 'md5:bce877fd9e3444990cb141875fab0028',
  148. 'creator': 'Pamela Paul',
  149. 'duration': 3475,
  150. },
  151. 'params': {
  152. 'skip_download': True,
  153. },
  154. }, {
  155. 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
  156. 'only_matching': True,
  157. }]
  158. def _extract_podcast_from_json(self, json, page_id, webpage):
  159. podcast_audio = self._parse_json(
  160. json, page_id, transform_source=js_to_json)
  161. audio_data = podcast_audio['data']
  162. track = audio_data['track']
  163. episode_title = track['title']
  164. video_url = track['source']
  165. description = track.get('description') or self._html_search_meta(
  166. ['og:description', 'twitter:description'], webpage)
  167. podcast_title = audio_data.get('podcast', {}).get('title')
  168. title = ('%s: %s' % (podcast_title, episode_title)
  169. if podcast_title else episode_title)
  170. episode = audio_data.get('podcast', {}).get('episode') or ''
  171. episode_number = int_or_none(self._search_regex(
  172. r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))
  173. return {
  174. 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
  175. 'url': video_url,
  176. 'title': title,
  177. 'description': description,
  178. 'creator': track.get('credit'),
  179. 'series': podcast_title,
  180. 'episode': episode_title,
  181. 'episode_number': episode_number,
  182. 'duration': int_or_none(track.get('duration')),
  183. }
  184. def _real_extract(self, url):
  185. page_id = self._match_id(url)
  186. webpage = self._download_webpage(url, page_id)
  187. video_id = self._search_regex(
  188. r'data-videoid=["\'](\d+)', webpage, 'video id',
  189. default=None, fatal=False)
  190. if video_id is not None:
  191. return self._extract_video_from_id(video_id)
  192. podcast_data = self._search_regex(
  193. (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
  194. r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
  195. webpage, 'podcast data')
  196. return self._extract_podcast_from_json(podcast_data, page_id, webpage)
  197. class NYTimesCookingIE(NYTimesBaseIE):
  198. _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
  199. _TESTS = [{
  200. 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
  201. 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
  202. 'info_dict': {
  203. 'id': '100000004756089',
  204. 'ext': 'mov',
  205. 'timestamp': 1479383008,
  206. 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
  207. 'title': 'Cranberry Tart',
  208. 'upload_date': '20161117',
  209. 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
  210. },
  211. }, {
  212. 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
  213. 'md5': '4b2e8c70530a89b8d905a2b572316eb8',
  214. 'info_dict': {
  215. 'id': '100000003951728',
  216. 'ext': 'mov',
  217. 'timestamp': 1445509539,
  218. 'description': 'Turkey guide',
  219. 'upload_date': '20151022',
  220. 'title': 'Turkey',
  221. }
  222. }]
  223. def _real_extract(self, url):
  224. page_id = self._match_id(url)
  225. webpage = self._download_webpage(url, page_id)
  226. video_id = self._search_regex(
  227. r'data-video-id=["\'](\d+)', webpage, 'video id')
  228. return self._extract_video_from_id(video_id)