cjsw.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. determine_ext,
  4. unescapeHTML,
  5. )
  6. class CJSWIE(InfoExtractor):
  7. _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)'
  8. _TESTS = [{
  9. 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
  10. 'md5': 'cee14d40f1e9433632c56e3d14977120',
  11. 'info_dict': {
  12. 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
  13. 'ext': 'mp3',
  14. 'title': 'Freshly Squeezed – Episode June 20, 2017',
  15. 'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
  16. 'series': 'Freshly Squeezed',
  17. 'episode_id': '20170620',
  18. },
  19. }, {
  20. # no description
  21. 'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
  22. 'only_matching': True,
  23. }]
  24. def _real_extract(self, url):
  25. mobj = self._match_valid_url(url)
  26. program, episode_id = mobj.group('program', 'id')
  27. audio_id = '%s/%s' % (program, episode_id)
  28. webpage = self._download_webpage(url, episode_id)
  29. title = unescapeHTML(self._search_regex(
  30. (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)',
  31. r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'),
  32. webpage, 'title', group='title'))
  33. audio_url = self._search_regex(
  34. r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
  35. webpage, 'audio url', group='url')
  36. audio_id = self._search_regex(
  37. r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
  38. audio_url, 'audio id', default=audio_id)
  39. formats = [{
  40. 'url': audio_url,
  41. 'ext': determine_ext(audio_url, 'mp3'),
  42. 'vcodec': 'none',
  43. }]
  44. description = self._html_search_regex(
  45. r'<p>(?P<description>.+?)</p>', webpage, 'description',
  46. default=None)
  47. series = self._search_regex(
  48. r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
  49. 'series', default=program, group='name')
  50. return {
  51. 'id': audio_id,
  52. 'title': title,
  53. 'description': description,
  54. 'formats': formats,
  55. 'series': series,
  56. 'episode_id': episode_id,
  57. }