cda.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. import base64
  2. import codecs
  3. import datetime
  4. import hashlib
  5. import hmac
  6. import json
  7. import re
  8. from .common import InfoExtractor
  9. from ..compat import compat_ord, compat_urllib_parse_unquote
  10. from ..utils import (
  11. ExtractorError,
  12. float_or_none,
  13. int_or_none,
  14. merge_dicts,
  15. multipart_encode,
  16. parse_duration,
  17. random_birthday,
  18. traverse_obj,
  19. try_call,
  20. try_get,
  21. urljoin,
  22. )
  23. class CDAIE(InfoExtractor):
  24. _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
  25. _NETRC_MACHINE = 'cdapl'
  26. _BASE_URL = 'http://www.cda.pl/'
  27. _BASE_API_URL = 'https://api.cda.pl'
  28. _API_HEADERS = {
  29. 'Accept': 'application/vnd.cda.public+json',
  30. 'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)',
  31. }
  32. # hardcoded in the app
  33. _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
  34. _BEARER_CACHE = 'cda-bearer'
  35. _TESTS = [{
  36. 'url': 'http://www.cda.pl/video/5749950c',
  37. 'md5': '6f844bf51b15f31fae165365707ae970',
  38. 'info_dict': {
  39. 'id': '5749950c',
  40. 'ext': 'mp4',
  41. 'height': 720,
  42. 'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
  43. 'description': 'md5:269ccd135d550da90d1662651fcb9772',
  44. 'thumbnail': r're:^https?://.*\.jpg$',
  45. 'average_rating': float,
  46. 'duration': 39,
  47. 'age_limit': 0,
  48. 'upload_date': '20160221',
  49. 'timestamp': 1456078244,
  50. }
  51. }, {
  52. 'url': 'http://www.cda.pl/video/57413289',
  53. 'md5': 'a88828770a8310fc00be6c95faf7f4d5',
  54. 'info_dict': {
  55. 'id': '57413289',
  56. 'ext': 'mp4',
  57. 'title': 'Lądowanie na lotnisku na Maderze',
  58. 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
  59. 'thumbnail': r're:^https?://.*\.jpg$',
  60. 'uploader': 'crash404',
  61. 'view_count': int,
  62. 'average_rating': float,
  63. 'duration': 137,
  64. 'age_limit': 0,
  65. }
  66. }, {
  67. # Age-restricted
  68. 'url': 'http://www.cda.pl/video/1273454c4',
  69. 'info_dict': {
  70. 'id': '1273454c4',
  71. 'ext': 'mp4',
  72. 'title': 'Bronson (2008) napisy HD 1080p',
  73. 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
  74. 'height': 1080,
  75. 'uploader': 'boniek61',
  76. 'thumbnail': r're:^https?://.*\.jpg$',
  77. 'duration': 5554,
  78. 'age_limit': 18,
  79. 'view_count': int,
  80. 'average_rating': float,
  81. },
  82. }, {
  83. 'url': 'http://ebd.cda.pl/0x0/5749950c',
  84. 'only_matching': True,
  85. }]
  86. def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
  87. form_data = random_birthday('rok', 'miesiac', 'dzien')
  88. form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
  89. data, content_type = multipart_encode(form_data)
  90. return self._download_webpage(
  91. urljoin(url, '/a/validatebirth'), video_id, *args,
  92. data=data, headers={
  93. 'Referer': url,
  94. 'Content-Type': content_type,
  95. }, **kwargs)
  96. def _perform_login(self, username, password):
  97. cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
  98. if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
  99. self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
  100. return
  101. password_hash = base64.urlsafe_b64encode(hmac.new(
  102. b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
  103. ''.join(f'{bytes((bt & 255, )).hex():0>2}'
  104. for bt in hashlib.md5(password.encode()).digest()).encode(),
  105. hashlib.sha256).digest()).decode().replace('=', '')
  106. token_res = self._download_json(
  107. f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
  108. headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
  109. query={
  110. 'grant_type': 'password',
  111. 'login': username,
  112. 'password': password_hash,
  113. })
  114. self.cache.store(self._BEARER_CACHE, username, {
  115. 'token': token_res['access_token'],
  116. 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(),
  117. })
  118. self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
  119. def _real_extract(self, url):
  120. video_id = self._match_id(url)
  121. if 'Authorization' in self._API_HEADERS:
  122. return self._api_extract(video_id)
  123. else:
  124. return self._web_extract(video_id, url)
  125. def _api_extract(self, video_id):
  126. meta = self._download_json(
  127. f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
  128. if meta.get('premium') and not meta.get('premium_free'):
  129. self.report_drm(video_id)
  130. uploader = traverse_obj(meta, 'author', 'login')
  131. formats = [{
  132. 'url': quality['file'],
  133. 'format': quality.get('title'),
  134. 'resolution': quality.get('name'),
  135. 'height': try_call(lambda: int(quality['name'][:-1])),
  136. 'filesize': quality.get('length'),
  137. } for quality in meta['qualities'] if quality.get('file')]
  138. return {
  139. 'id': video_id,
  140. 'title': meta.get('title'),
  141. 'description': meta.get('description'),
  142. 'uploader': None if uploader == 'anonim' else uploader,
  143. 'average_rating': float_or_none(meta.get('rating')),
  144. 'thumbnail': meta.get('thumb'),
  145. 'formats': formats,
  146. 'duration': meta.get('duration'),
  147. 'age_limit': 18 if meta.get('for_adults') else 0,
  148. 'view_count': meta.get('views'),
  149. }
  150. def _web_extract(self, video_id, url):
  151. self._set_cookie('cda.pl', 'cda.player', 'html5')
  152. webpage = self._download_webpage(
  153. self._BASE_URL + '/video/' + video_id, video_id)
  154. if 'Ten film jest dostępny dla użytkowników premium' in webpage:
  155. raise ExtractorError('This video is only available for premium users.', expected=True)
  156. if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
  157. self.raise_geo_restricted()
  158. need_confirm_age = False
  159. if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
  160. webpage, 'birthday validate form', default=None):
  161. webpage = self._download_age_confirm_page(
  162. url, video_id, note='Confirming age')
  163. need_confirm_age = True
  164. formats = []
  165. uploader = self._search_regex(r'''(?x)
  166. <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
  167. (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
  168. <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
  169. ''', webpage, 'uploader', default=None, group='uploader')
  170. view_count = self._search_regex(
  171. r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
  172. 'view_count', default=None)
  173. average_rating = self._search_regex(
  174. (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
  175. r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
  176. group='rating_value')
  177. info_dict = {
  178. 'id': video_id,
  179. 'title': self._og_search_title(webpage),
  180. 'description': self._og_search_description(webpage),
  181. 'uploader': uploader,
  182. 'view_count': int_or_none(view_count),
  183. 'average_rating': float_or_none(average_rating),
  184. 'thumbnail': self._og_search_thumbnail(webpage),
  185. 'formats': formats,
  186. 'duration': None,
  187. 'age_limit': 18 if need_confirm_age else 0,
  188. }
  189. info = self._search_json_ld(webpage, video_id, default={})
  190. # Source: https://www.cda.pl/js/player.js?t=1606154898
  191. def decrypt_file(a):
  192. for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
  193. a = a.replace(p, '')
  194. a = compat_urllib_parse_unquote(a)
  195. b = []
  196. for c in a:
  197. f = compat_ord(c)
  198. b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
  199. a = ''.join(b)
  200. a = a.replace('.cda.mp4', '')
  201. for p in ('.2cda.pl', '.3cda.pl'):
  202. a = a.replace(p, '.cda.pl')
  203. if '/upstream' in a:
  204. a = a.replace('/upstream', '.mp4/upstream')
  205. return 'https://' + a
  206. return 'https://' + a + '.mp4'
  207. def extract_format(page, version):
  208. json_str = self._html_search_regex(
  209. r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
  210. '%s player_json' % version, fatal=False, group='player_data')
  211. if not json_str:
  212. return
  213. player_data = self._parse_json(
  214. json_str, '%s player_data' % version, fatal=False)
  215. if not player_data:
  216. return
  217. video = player_data.get('video')
  218. if not video or 'file' not in video:
  219. self.report_warning('Unable to extract %s version information' % version)
  220. return
  221. if video['file'].startswith('uggc'):
  222. video['file'] = codecs.decode(video['file'], 'rot_13')
  223. if video['file'].endswith('adc.mp4'):
  224. video['file'] = video['file'].replace('adc.mp4', '.mp4')
  225. elif not video['file'].startswith('http'):
  226. video['file'] = decrypt_file(video['file'])
  227. video_quality = video.get('quality')
  228. qualities = video.get('qualities', {})
  229. video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
  230. info_dict['formats'].append({
  231. 'url': video['file'],
  232. 'format_id': video_quality,
  233. 'height': int_or_none(video_quality[:-1]),
  234. })
  235. for quality, cda_quality in qualities.items():
  236. if quality == video_quality:
  237. continue
  238. data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
  239. 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
  240. data = json.dumps(data).encode('utf-8')
  241. video_url = self._download_json(
  242. f'https://www.cda.pl/video/{video_id}', video_id, headers={
  243. 'Content-Type': 'application/json',
  244. 'X-Requested-With': 'XMLHttpRequest'
  245. }, data=data, note=f'Fetching {quality} url',
  246. errnote=f'Failed to fetch {quality} url', fatal=False)
  247. if try_get(video_url, lambda x: x['result']['status']) == 'ok':
  248. video_url = try_get(video_url, lambda x: x['result']['resp'])
  249. info_dict['formats'].append({
  250. 'url': video_url,
  251. 'format_id': quality,
  252. 'height': int_or_none(quality[:-1])
  253. })
  254. if not info_dict['duration']:
  255. info_dict['duration'] = parse_duration(video.get('duration'))
  256. extract_format(webpage, 'default')
  257. for href, resolution in re.findall(
  258. r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
  259. webpage):
  260. if need_confirm_age:
  261. handler = self._download_age_confirm_page
  262. else:
  263. handler = self._download_webpage
  264. webpage = handler(
  265. urljoin(self._BASE_URL, href), video_id,
  266. 'Downloading %s version information' % resolution, fatal=False)
  267. if not webpage:
  268. # Manually report warning because empty page is returned when
  269. # invalid version is requested.
  270. self.report_warning('Unable to download %s version information' % resolution)
  271. continue
  272. extract_format(webpage, resolution)
  273. return merge_dicts(info_dict, info)