# callin.py 4.9 KB
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. traverse_obj,
  4. float_or_none,
  5. int_or_none
  6. )
  7. class CallinIE(InfoExtractor):
  8. _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
  9. _TESTS = [{
  10. 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
  11. 'info_dict': {
  12. 'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
  13. 'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
  14. 'ext': 'ts',
  15. 'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
  16. 'thumbnail': 're:https://.+\\.png',
  17. 'description': 'First episode',
  18. 'uploader': 'Wesley Yang',
  19. 'timestamp': 1639404128.65,
  20. 'upload_date': '20211213',
  21. 'uploader_id': 'wesyang',
  22. 'uploader_url': 'http://wesleyyang.substack.com',
  23. 'channel': 'Conversations in Year Zero',
  24. 'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
  25. 'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
  26. 'duration': 9951.936,
  27. 'view_count': int,
  28. 'categories': ['News & Politics', 'History', 'Technology'],
  29. 'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
  30. 'series': 'Conversations in Year Zero',
  31. 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
  32. 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
  33. 'episode_number': 1,
  34. 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
  35. }
  36. }]
  37. def try_get_user_name(self, d):
  38. names = [d.get(n) for n in ('first', 'last')]
  39. if None in names:
  40. return next((n for n in names if n), default=None)
  41. return ' '.join(names)
  42. def _real_extract(self, url):
  43. display_id = self._match_id(url)
  44. webpage = self._download_webpage(url, display_id)
  45. next_data = self._search_nextjs_data(webpage, display_id)
  46. episode = next_data['props']['pageProps']['episode']
  47. id = episode['id']
  48. title = episode.get('title') or self._generic_title('', webpage)
  49. url = episode['m3u8']
  50. formats = self._extract_m3u8_formats(url, display_id, ext='ts')
  51. show = traverse_obj(episode, ('show', 'title'))
  52. show_id = traverse_obj(episode, ('show', 'id'))
  53. show_json = None
  54. app_slug = (self._html_search_regex(
  55. '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
  56. webpage, 'app slug', fatal=False) or next_data.get('buildId'))
  57. show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
  58. if app_slug and show_slug and '/' in show_slug:
  59. show_slug = show_slug.rsplit('/', 1)[1]
  60. show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
  61. show_json = self._download_json(show_json_url, display_id, fatal=False)
  62. host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
  63. or traverse_obj(episode, ('speakers', 0)))
  64. host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
  65. host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None
  66. cast = list(filter(None, [
  67. self.try_get_user_name(u) for u in
  68. traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
  69. ]))
  70. episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
  71. episode_number = next(
  72. (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id),
  73. None)
  74. return {
  75. 'id': id,
  76. 'display_id': display_id,
  77. 'title': title,
  78. 'formats': formats,
  79. 'thumbnail': traverse_obj(episode, ('show', 'photo')),
  80. 'description': episode.get('description'),
  81. 'uploader': self.try_get_user_name(host) if host else None,
  82. 'timestamp': episode.get('publishedAt'),
  83. 'uploader_id': host_nick,
  84. 'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
  85. 'channel': show,
  86. 'channel_id': show_id,
  87. 'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
  88. 'duration': float_or_none(episode.get('runtime')),
  89. 'view_count': int_or_none(episode.get('plays')),
  90. 'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
  91. 'cast': cast if cast else None,
  92. 'series': show,
  93. 'series_id': show_id,
  94. 'episode': title,
  95. 'episode_number': episode_number,
  96. 'episode_id': id
  97. }