soundcloud.py 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931
  1. import itertools
  2. import re
  3. import json
  4. # import random
  5. from .common import (
  6. InfoExtractor,
  7. SearchInfoExtractor
  8. )
  9. from ..compat import (
  10. compat_HTTPError,
  11. compat_str,
  12. )
  13. from ..utils import (
  14. error_to_compat_str,
  15. ExtractorError,
  16. float_or_none,
  17. HEADRequest,
  18. int_or_none,
  19. KNOWN_EXTENSIONS,
  20. mimetype2ext,
  21. parse_qs,
  22. str_or_none,
  23. try_get,
  24. unified_timestamp,
  25. update_url_query,
  26. url_or_none,
  27. urlhandle_detect_ext,
  28. sanitized_Request,
  29. )
  30. class SoundcloudEmbedIE(InfoExtractor):
  31. _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
  32. _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1']
  33. _TEST = {
  34. # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
  35. 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
  36. 'only_matching': True,
  37. }
  38. def _real_extract(self, url):
  39. query = parse_qs(url)
  40. api_url = query['url'][0]
  41. secret_token = query.get('secret_token')
  42. if secret_token:
  43. api_url = update_url_query(api_url, {'secret_token': secret_token[0]})
  44. return self.url_result(api_url)
  45. class SoundcloudBaseIE(InfoExtractor):
  46. _NETRC_MACHINE = 'soundcloud'
  47. _API_V2_BASE = 'https://api-v2.soundcloud.com/'
  48. _BASE_URL = 'https://soundcloud.com/'
  49. _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
  50. _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
  51. _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
  52. _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
  53. _access_token = None
  54. _HEADERS = {}
  55. _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
  56. _ARTWORK_MAP = {
  57. 'mini': 16,
  58. 'tiny': 20,
  59. 'small': 32,
  60. 'badge': 47,
  61. 't67x67': 67,
  62. 'large': 100,
  63. 't300x300': 300,
  64. 'crop': 400,
  65. 't500x500': 500,
  66. 'original': 0,
  67. }
  68. def _store_client_id(self, client_id):
  69. self.cache.store('soundcloud', 'client_id', client_id)
  70. def _update_client_id(self):
  71. webpage = self._download_webpage('https://soundcloud.com/', None)
  72. for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
  73. script = self._download_webpage(src, None, fatal=False)
  74. if script:
  75. client_id = self._search_regex(
  76. r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
  77. script, 'client id', default=None)
  78. if client_id:
  79. self._CLIENT_ID = client_id
  80. self._store_client_id(client_id)
  81. return
  82. raise ExtractorError('Unable to extract client id')
  83. def _download_json(self, *args, **kwargs):
  84. non_fatal = kwargs.get('fatal') is False
  85. if non_fatal:
  86. del kwargs['fatal']
  87. query = kwargs.get('query', {}).copy()
  88. for _ in range(2):
  89. query['client_id'] = self._CLIENT_ID
  90. kwargs['query'] = query
  91. try:
  92. return super()._download_json(*args, **kwargs)
  93. except ExtractorError as e:
  94. if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
  95. self._store_client_id(None)
  96. self._update_client_id()
  97. continue
  98. elif non_fatal:
  99. self.report_warning(error_to_compat_str(e))
  100. return False
  101. raise
  102. def _initialize_pre_login(self):
  103. self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
  104. def _perform_login(self, username, password):
  105. if username != 'oauth':
  106. self.report_warning(
  107. 'Login using username and password is not currently supported. '
  108. 'Use "--username oauth --password <oauth_token>" to login using an oauth token')
  109. self._access_token = password
  110. query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
  111. payload = {'session': {'access_token': self._access_token}}
  112. token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
  113. response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
  114. if response is not False:
  115. self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
  116. self.report_login()
  117. else:
  118. self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
  119. r'''
  120. def genDevId():
  121. def genNumBlock():
  122. return ''.join([str(random.randrange(10)) for i in range(6)])
  123. return '-'.join([genNumBlock() for i in range(4)])
  124. payload = {
  125. 'client_id': self._CLIENT_ID,
  126. 'recaptcha_pubkey': 'null',
  127. 'recaptcha_response': 'null',
  128. 'credentials': {
  129. 'identifier': username,
  130. 'password': password
  131. },
  132. 'signature': self.sign(username, password, self._CLIENT_ID),
  133. 'device_id': genDevId(),
  134. 'user_agent': self._USER_AGENT
  135. }
  136. query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
  137. login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
  138. response = self._download_json(login, None)
  139. self._access_token = response.get('session').get('access_token')
  140. if not self._access_token:
  141. self.report_warning('Unable to get access token, login may has failed')
  142. else:
  143. self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
  144. '''
  145. # signature generation
  146. def sign(self, user, pw, clid):
  147. a = 33
  148. i = 1
  149. s = 440123
  150. w = 117
  151. u = 1800000
  152. l = 1042
  153. b = 37
  154. k = 37
  155. c = 5
  156. n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY
  157. y = '8' # _REV
  158. r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT
  159. e = user # _USERNAME
  160. t = clid # _CLIENT_ID
  161. d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
  162. p = n + y + d + r + e + t + d + n
  163. h = p
  164. m = 8011470
  165. f = 0
  166. for f in range(f, len(h)):
  167. m = (m >> 1) + ((1 & m) << 23)
  168. m += ord(h[f])
  169. m &= 16777215
  170. # c is not even needed
  171. out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
  172. return out
  173. def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False):
  174. track_id = compat_str(info['id'])
  175. title = info['title']
  176. format_urls = set()
  177. formats = []
  178. query = {'client_id': self._CLIENT_ID}
  179. if secret_token:
  180. query['secret_token'] = secret_token
  181. if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
  182. download_url = update_url_query(
  183. self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
  184. redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
  185. if redirect_url:
  186. urlh = self._request_webpage(
  187. HEADRequest(redirect_url), track_id, fatal=False)
  188. if urlh:
  189. format_url = urlh.geturl()
  190. format_urls.add(format_url)
  191. formats.append({
  192. 'format_id': 'download',
  193. 'ext': urlhandle_detect_ext(urlh) or 'mp3',
  194. 'filesize': int_or_none(urlh.headers.get('Content-Length')),
  195. 'url': format_url,
  196. 'quality': 10,
  197. })
  198. def invalid_url(url):
  199. return not url or url in format_urls
  200. def add_format(f, protocol, is_preview=False):
  201. mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
  202. if mobj:
  203. for k, v in mobj.groupdict().items():
  204. if not f.get(k):
  205. f[k] = v
  206. format_id_list = []
  207. if protocol:
  208. format_id_list.append(protocol)
  209. ext = f.get('ext')
  210. if ext == 'aac':
  211. f['abr'] = '256'
  212. for k in ('ext', 'abr'):
  213. v = f.get(k)
  214. if v:
  215. format_id_list.append(v)
  216. preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url'])
  217. if preview:
  218. format_id_list.append('preview')
  219. abr = f.get('abr')
  220. if abr:
  221. f['abr'] = int(abr)
  222. if protocol == 'hls':
  223. protocol = 'm3u8' if ext == 'aac' else 'm3u8_native'
  224. else:
  225. protocol = 'http'
  226. f.update({
  227. 'format_id': '_'.join(format_id_list),
  228. 'protocol': protocol,
  229. 'preference': -10 if preview else None,
  230. })
  231. formats.append(f)
  232. # New API
  233. transcodings = try_get(
  234. info, lambda x: x['media']['transcodings'], list) or []
  235. for t in transcodings:
  236. if not isinstance(t, dict):
  237. continue
  238. format_url = url_or_none(t.get('url'))
  239. if not format_url:
  240. continue
  241. stream = None if extract_flat else self._download_json(
  242. format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
  243. if not isinstance(stream, dict):
  244. continue
  245. stream_url = url_or_none(stream.get('url'))
  246. if invalid_url(stream_url):
  247. continue
  248. format_urls.add(stream_url)
  249. stream_format = t.get('format') or {}
  250. protocol = stream_format.get('protocol')
  251. if protocol != 'hls' and '/hls' in format_url:
  252. protocol = 'hls'
  253. ext = None
  254. preset = str_or_none(t.get('preset'))
  255. if preset:
  256. ext = preset.split('_')[0]
  257. if ext not in KNOWN_EXTENSIONS:
  258. ext = mimetype2ext(stream_format.get('mime_type'))
  259. add_format({
  260. 'url': stream_url,
  261. 'ext': ext,
  262. }, 'http' if protocol == 'progressive' else protocol,
  263. t.get('snipped') or '/preview/' in format_url)
  264. for f in formats:
  265. f['vcodec'] = 'none'
  266. if not formats and info.get('policy') == 'BLOCK':
  267. self.raise_geo_restricted(metadata_available=True)
  268. user = info.get('user') or {}
  269. thumbnails = []
  270. artwork_url = info.get('artwork_url')
  271. thumbnail = artwork_url or user.get('avatar_url')
  272. if isinstance(thumbnail, compat_str):
  273. if re.search(self._IMAGE_REPL_RE, thumbnail):
  274. for image_id, size in self._ARTWORK_MAP.items():
  275. i = {
  276. 'id': image_id,
  277. 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
  278. }
  279. if image_id == 'tiny' and not artwork_url:
  280. size = 18
  281. elif image_id == 'original':
  282. i['preference'] = 10
  283. if size:
  284. i.update({
  285. 'width': size,
  286. 'height': size,
  287. })
  288. thumbnails.append(i)
  289. else:
  290. thumbnails = [{'url': thumbnail}]
  291. def extract_count(key):
  292. return int_or_none(info.get('%s_count' % key))
  293. return {
  294. 'id': track_id,
  295. 'uploader': user.get('username'),
  296. 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
  297. 'uploader_url': user.get('permalink_url'),
  298. 'timestamp': unified_timestamp(info.get('created_at')),
  299. 'title': title,
  300. 'description': info.get('description'),
  301. 'thumbnails': thumbnails,
  302. 'duration': float_or_none(info.get('duration'), 1000),
  303. 'webpage_url': info.get('permalink_url'),
  304. 'license': info.get('license'),
  305. 'view_count': extract_count('playback'),
  306. 'like_count': extract_count('favoritings') or extract_count('likes'),
  307. 'comment_count': extract_count('comment'),
  308. 'repost_count': extract_count('reposts'),
  309. 'genre': info.get('genre'),
  310. 'formats': formats if not extract_flat else None
  311. }
  312. @classmethod
  313. def _resolv_url(cls, url):
  314. return cls._API_V2_BASE + 'resolve?url=' + url
  315. class SoundcloudIE(SoundcloudBaseIE):
  316. """Information extractor for soundcloud.com
  317. To access the media, the uid of the song and a stream token
  318. must be extracted from the page source and the script must make
  319. a request to media.soundcloud.com/crossdomain.xml. Then
  320. the media can be grabbed by requesting from an url composed
  321. of the stream token and uid
  322. """
  323. _VALID_URL = r'''(?x)^(?:https?://)?
  324. (?:(?:(?:www\.|m\.)?soundcloud\.com/
  325. (?!stations/track)
  326. (?P<uploader>[\w\d-]+)/
  327. (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
  328. (?P<title>[\w\d-]+)
  329. (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
  330. (?:[?].*)?$)
  331. |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
  332. (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
  333. )
  334. '''
  335. IE_NAME = 'soundcloud'
  336. _TESTS = [
  337. {
  338. 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
  339. 'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
  340. 'info_dict': {
  341. 'id': '62986583',
  342. 'ext': 'mp3',
  343. 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
  344. 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
  345. 'uploader': 'E.T. ExTerrestrial Music',
  346. 'uploader_id': '1571244',
  347. 'timestamp': 1349920598,
  348. 'upload_date': '20121011',
  349. 'duration': 143.216,
  350. 'license': 'all-rights-reserved',
  351. 'view_count': int,
  352. 'like_count': int,
  353. 'comment_count': int,
  354. 'repost_count': int,
  355. }
  356. },
  357. # geo-restricted
  358. {
  359. 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
  360. 'info_dict': {
  361. 'id': '47127627',
  362. 'ext': 'mp3',
  363. 'title': 'Goldrushed',
  364. 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
  365. 'uploader': 'The Royal Concept',
  366. 'uploader_id': '9615865',
  367. 'timestamp': 1337635207,
  368. 'upload_date': '20120521',
  369. 'duration': 227.155,
  370. 'license': 'all-rights-reserved',
  371. 'view_count': int,
  372. 'like_count': int,
  373. 'comment_count': int,
  374. 'repost_count': int,
  375. },
  376. },
  377. # private link
  378. {
  379. 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
  380. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  381. 'info_dict': {
  382. 'id': '123998367',
  383. 'ext': 'mp3',
  384. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  385. 'description': 'test chars: \"\'/\\ä↭',
  386. 'uploader': 'jaimeMF',
  387. 'uploader_id': '69767071',
  388. 'timestamp': 1386604920,
  389. 'upload_date': '20131209',
  390. 'duration': 9.927,
  391. 'license': 'all-rights-reserved',
  392. 'view_count': int,
  393. 'like_count': int,
  394. 'comment_count': int,
  395. 'repost_count': int,
  396. },
  397. },
  398. # private link (alt format)
  399. {
  400. 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
  401. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  402. 'info_dict': {
  403. 'id': '123998367',
  404. 'ext': 'mp3',
  405. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  406. 'description': 'test chars: \"\'/\\ä↭',
  407. 'uploader': 'jaimeMF',
  408. 'uploader_id': '69767071',
  409. 'timestamp': 1386604920,
  410. 'upload_date': '20131209',
  411. 'duration': 9.927,
  412. 'license': 'all-rights-reserved',
  413. 'view_count': int,
  414. 'like_count': int,
  415. 'comment_count': int,
  416. 'repost_count': int,
  417. },
  418. },
  419. # downloadable song
  420. {
  421. 'url': 'https://soundcloud.com/the80m/the-following',
  422. 'md5': '9ffcddb08c87d74fb5808a3c183a1d04',
  423. 'info_dict': {
  424. 'id': '343609555',
  425. 'ext': 'wav',
  426. },
  427. },
  428. # private link, downloadable format
  429. {
  430. 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
  431. 'md5': '64a60b16e617d41d0bef032b7f55441e',
  432. 'info_dict': {
  433. 'id': '340344461',
  434. 'ext': 'wav',
  435. 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
  436. 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
  437. 'uploader': 'Ori Uplift Music',
  438. 'uploader_id': '12563093',
  439. 'timestamp': 1504206263,
  440. 'upload_date': '20170831',
  441. 'duration': 7449.096,
  442. 'license': 'all-rights-reserved',
  443. 'view_count': int,
  444. 'like_count': int,
  445. 'comment_count': int,
  446. 'repost_count': int,
  447. },
  448. },
  449. # no album art, use avatar pic for thumbnail
  450. {
  451. 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
  452. 'md5': '59c7872bc44e5d99b7211891664760c2',
  453. 'info_dict': {
  454. 'id': '309699954',
  455. 'ext': 'mp3',
  456. 'title': 'Sideways (Prod. Mad Real)',
  457. 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
  458. 'uploader': 'garyvee',
  459. 'uploader_id': '2366352',
  460. 'timestamp': 1488152409,
  461. 'upload_date': '20170226',
  462. 'duration': 207.012,
  463. 'thumbnail': r're:https?://.*\.jpg',
  464. 'license': 'all-rights-reserved',
  465. 'view_count': int,
  466. 'like_count': int,
  467. 'comment_count': int,
  468. 'repost_count': int,
  469. },
  470. 'params': {
  471. 'skip_download': True,
  472. },
  473. },
  474. {
  475. 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
  476. 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
  477. 'info_dict': {
  478. 'id': '583011102',
  479. 'ext': 'mp3',
  480. 'title': 'Mezzo Valzer',
  481. 'description': 'md5:4138d582f81866a530317bae316e8b61',
  482. 'uploader': 'Micronie',
  483. 'uploader_id': '3352531',
  484. 'timestamp': 1551394171,
  485. 'upload_date': '20190228',
  486. 'duration': 180.157,
  487. 'thumbnail': r're:https?://.*\.jpg',
  488. 'license': 'all-rights-reserved',
  489. 'view_count': int,
  490. 'like_count': int,
  491. 'comment_count': int,
  492. 'repost_count': int,
  493. },
  494. },
  495. {
  496. # AAC HQ format available (account with active subscription needed)
  497. 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
  498. 'only_matching': True,
  499. },
  500. {
  501. # Go+ (account with active subscription needed)
  502. 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do',
  503. 'only_matching': True,
  504. },
  505. ]
  506. def _real_extract(self, url):
  507. mobj = self._match_valid_url(url)
  508. track_id = mobj.group('track_id')
  509. query = {}
  510. if track_id:
  511. info_json_url = self._API_V2_BASE + 'tracks/' + track_id
  512. full_title = track_id
  513. token = mobj.group('secret_token')
  514. if token:
  515. query['secret_token'] = token
  516. else:
  517. full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
  518. token = mobj.group('token')
  519. if token:
  520. resolve_title += '/%s' % token
  521. info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
  522. info = self._download_json(
  523. info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
  524. return self._extract_info_dict(info, full_title, token)
  525. class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
  526. def _extract_set(self, playlist, token=None):
  527. playlist_id = compat_str(playlist['id'])
  528. tracks = playlist.get('tracks') or []
  529. if not all([t.get('permalink_url') for t in tracks]) and token:
  530. tracks = self._download_json(
  531. self._API_V2_BASE + 'tracks', playlist_id,
  532. 'Downloading tracks', query={
  533. 'ids': ','.join([compat_str(t['id']) for t in tracks]),
  534. 'playlistId': playlist_id,
  535. 'playlistSecretToken': token,
  536. }, headers=self._HEADERS)
  537. entries = []
  538. for track in tracks:
  539. track_id = str_or_none(track.get('id'))
  540. url = track.get('permalink_url')
  541. if not url:
  542. if not track_id:
  543. continue
  544. url = self._API_V2_BASE + 'tracks/' + track_id
  545. if token:
  546. url += '?secret_token=' + token
  547. entries.append(self.url_result(
  548. url, SoundcloudIE.ie_key(), track_id))
  549. return self.playlist_result(
  550. entries, playlist_id,
  551. playlist.get('title'),
  552. playlist.get('description'))
  553. class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
  554. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?'
  555. IE_NAME = 'soundcloud:set'
  556. _TESTS = [{
  557. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
  558. 'info_dict': {
  559. 'id': '2284613',
  560. 'title': 'The Royal Concept EP',
  561. 'description': 'md5:71d07087c7a449e8941a70a29e34671e',
  562. },
  563. 'playlist_mincount': 5,
  564. }, {
  565. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
  566. 'only_matching': True,
  567. }, {
  568. 'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic',
  569. 'only_matching': True,
  570. }, {
  571. 'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de',
  572. 'only_matching': True,
  573. }, {
  574. 'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr',
  575. 'only_matching': True,
  576. }]
  577. def _real_extract(self, url):
  578. mobj = self._match_valid_url(url)
  579. full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
  580. token = mobj.group('token')
  581. if token:
  582. full_title += '/' + token
  583. info = self._download_json(self._resolv_url(
  584. self._BASE_URL + full_title), full_title, headers=self._HEADERS)
  585. if 'errors' in info:
  586. msgs = (compat_str(err['error_message']) for err in info['errors'])
  587. raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
  588. return self._extract_set(info, token)
  589. class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
  590. def _extract_playlist(self, base_url, playlist_id, playlist_title):
  591. return {
  592. '_type': 'playlist',
  593. 'id': playlist_id,
  594. 'title': playlist_title,
  595. 'entries': self._entries(base_url, playlist_id),
  596. }
  597. def _entries(self, url, playlist_id):
  598. # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
  599. # https://developers.soundcloud.com/blog/offset-pagination-deprecated
  600. query = {
  601. 'limit': 200,
  602. 'linked_partitioning': '1',
  603. 'offset': 0,
  604. }
  605. for i in itertools.count():
  606. for retry in self.RetryManager():
  607. try:
  608. response = self._download_json(
  609. url, playlist_id, query=query, headers=self._HEADERS,
  610. note=f'Downloading track page {i + 1}')
  611. break
  612. except ExtractorError as e:
  613. # Downloading page may result in intermittent 502 HTTP error
  614. # See https://github.com/hypervideo/hypervideo/issues/872
  615. if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502:
  616. raise
  617. retry.error = e
  618. continue
  619. def resolve_entry(*candidates):
  620. for cand in candidates:
  621. if not isinstance(cand, dict):
  622. continue
  623. permalink_url = url_or_none(cand.get('permalink_url'))
  624. if permalink_url:
  625. return self.url_result(
  626. permalink_url,
  627. SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
  628. str_or_none(cand.get('id')), cand.get('title'))
  629. for e in response['collection'] or []:
  630. yield resolve_entry(e, e.get('track'), e.get('playlist'))
  631. url = response.get('next_href')
  632. if not url:
  633. break
  634. query.pop('offset', None)
  635. class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
  636. _VALID_URL = r'''(?x)
  637. https?://
  638. (?:(?:www|m)\.)?soundcloud\.com/
  639. (?P<user>[^/]+)
  640. (?:/
  641. (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
  642. )?
  643. /?(?:[?#].*)?$
  644. '''
  645. IE_NAME = 'soundcloud:user'
  646. _TESTS = [{
  647. 'url': 'https://soundcloud.com/soft-cell-official',
  648. 'info_dict': {
  649. 'id': '207965082',
  650. 'title': 'Soft Cell (All)',
  651. },
  652. 'playlist_mincount': 28,
  653. }, {
  654. 'url': 'https://soundcloud.com/soft-cell-official/tracks',
  655. 'info_dict': {
  656. 'id': '207965082',
  657. 'title': 'Soft Cell (Tracks)',
  658. },
  659. 'playlist_mincount': 27,
  660. }, {
  661. 'url': 'https://soundcloud.com/soft-cell-official/albums',
  662. 'info_dict': {
  663. 'id': '207965082',
  664. 'title': 'Soft Cell (Albums)',
  665. },
  666. 'playlist_mincount': 1,
  667. }, {
  668. 'url': 'https://soundcloud.com/jcv246/sets',
  669. 'info_dict': {
  670. 'id': '12982173',
  671. 'title': 'Jordi / cv (Sets)',
  672. },
  673. 'playlist_mincount': 2,
  674. }, {
  675. 'url': 'https://soundcloud.com/jcv246/reposts',
  676. 'info_dict': {
  677. 'id': '12982173',
  678. 'title': 'Jordi / cv (Reposts)',
  679. },
  680. 'playlist_mincount': 6,
  681. }, {
  682. 'url': 'https://soundcloud.com/clalberg/likes',
  683. 'info_dict': {
  684. 'id': '11817582',
  685. 'title': 'clalberg (Likes)',
  686. },
  687. 'playlist_mincount': 5,
  688. }, {
  689. 'url': 'https://soundcloud.com/grynpyret/spotlight',
  690. 'info_dict': {
  691. 'id': '7098329',
  692. 'title': 'Grynpyret (Spotlight)',
  693. },
  694. 'playlist_mincount': 1,
  695. }]
  696. _BASE_URL_MAP = {
  697. 'all': 'stream/users/%s',
  698. 'tracks': 'users/%s/tracks',
  699. 'albums': 'users/%s/albums',
  700. 'sets': 'users/%s/playlists',
  701. 'reposts': 'stream/users/%s/reposts',
  702. 'likes': 'users/%s/likes',
  703. 'spotlight': 'users/%s/spotlight',
  704. }
  705. def _real_extract(self, url):
  706. mobj = self._match_valid_url(url)
  707. uploader = mobj.group('user')
  708. user = self._download_json(
  709. self._resolv_url(self._BASE_URL + uploader),
  710. uploader, 'Downloading user info', headers=self._HEADERS)
  711. resource = mobj.group('rsrc') or 'all'
  712. return self._extract_playlist(
  713. self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
  714. str_or_none(user.get('id')),
  715. '%s (%s)' % (user['username'], resource.capitalize()))
  716. class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
  717. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
  718. IE_NAME = 'soundcloud:trackstation'
  719. _TESTS = [{
  720. 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
  721. 'info_dict': {
  722. 'id': '286017854',
  723. 'title': 'Track station: your text',
  724. },
  725. 'playlist_mincount': 47,
  726. }]
  727. def _real_extract(self, url):
  728. track_name = self._match_id(url)
  729. track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
  730. track_id = self._search_regex(
  731. r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
  732. return self._extract_playlist(
  733. self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
  734. track_id, 'Track station: %s' % track['title'])
  735. class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
  736. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)'
  737. IE_NAME = 'soundcloud:related'
  738. _TESTS = [{
  739. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended',
  740. 'info_dict': {
  741. 'id': '1084577272',
  742. 'title': 'Sexapil - Pingers 5 (Recommended)',
  743. },
  744. 'playlist_mincount': 50,
  745. }, {
  746. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums',
  747. 'info_dict': {
  748. 'id': '1084577272',
  749. 'title': 'Sexapil - Pingers 5 (Albums)',
  750. },
  751. 'playlist_mincount': 1,
  752. }, {
  753. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets',
  754. 'info_dict': {
  755. 'id': '1084577272',
  756. 'title': 'Sexapil - Pingers 5 (Sets)',
  757. },
  758. 'playlist_mincount': 4,
  759. }]
  760. _BASE_URL_MAP = {
  761. 'albums': 'tracks/%s/albums',
  762. 'sets': 'tracks/%s/playlists_without_albums',
  763. 'recommended': 'tracks/%s/related',
  764. }
  765. def _real_extract(self, url):
  766. slug, relation = self._match_valid_url(url).group('slug', 'relation')
  767. track = self._download_json(
  768. self._resolv_url(self._BASE_URL + slug),
  769. slug, 'Downloading track info', headers=self._HEADERS)
  770. if track.get('errors'):
  771. raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join(
  772. str(err['error_message']) for err in track['errors']), expected=True)
  773. return self._extract_playlist(
  774. self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']),
  775. '%s (%s)' % (track.get('title') or slug, relation.capitalize()))
  776. class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
  777. _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
  778. IE_NAME = 'soundcloud:playlist'
  779. _TESTS = [{
  780. 'url': 'https://api.soundcloud.com/playlists/4110309',
  781. 'info_dict': {
  782. 'id': '4110309',
  783. 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
  784. 'description': 're:.*?TILT Brass - Bowery Poetry Club',
  785. },
  786. 'playlist_count': 6,
  787. }]
  788. def _real_extract(self, url):
  789. mobj = self._match_valid_url(url)
  790. playlist_id = mobj.group('id')
  791. query = {}
  792. token = mobj.group('token')
  793. if token:
  794. query['secret_token'] = token
  795. data = self._download_json(
  796. self._API_V2_BASE + 'playlists/' + playlist_id,
  797. playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
  798. return self._extract_set(data, token)
  799. class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor):
  800. IE_NAME = 'soundcloud:search'
  801. IE_DESC = 'Soundcloud search'
  802. _SEARCH_KEY = 'scsearch'
  803. _TESTS = [{
  804. 'url': 'scsearch15:post-avant jazzcore',
  805. 'info_dict': {
  806. 'id': 'post-avant jazzcore',
  807. 'title': 'post-avant jazzcore',
  808. },
  809. 'playlist_count': 15,
  810. }]
  811. _MAX_RESULTS_PER_PAGE = 200
  812. _DEFAULT_RESULTS_PER_PAGE = 50
  813. def _get_collection(self, endpoint, collection_id, **query):
  814. limit = min(
  815. query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
  816. self._MAX_RESULTS_PER_PAGE)
  817. query.update({
  818. 'limit': limit,
  819. 'linked_partitioning': 1,
  820. 'offset': 0,
  821. })
  822. next_url = update_url_query(self._API_V2_BASE + endpoint, query)
  823. for i in itertools.count(1):
  824. response = self._download_json(
  825. next_url, collection_id, f'Downloading page {i}',
  826. 'Unable to download API page', headers=self._HEADERS)
  827. for item in response.get('collection') or []:
  828. if item:
  829. yield self.url_result(
  830. item['uri'], SoundcloudIE.ie_key(), **self._extract_info_dict(item, extract_flat=True))
  831. next_url = response.get('next_href')
  832. if not next_url:
  833. break
  834. def _get_n_results(self, query, n):
  835. return self.playlist_result(itertools.islice(
  836. self._get_collection('search/tracks', query, limit=n, q=query),
  837. 0, None if n == float('inf') else n), query, query)