podchaser.py 3.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import functools
  2. import json
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. OnDemandPagedList,
  6. float_or_none,
  7. str_or_none,
  8. str_to_int,
  9. traverse_obj,
  10. unified_timestamp,
  11. )
  12. class PodchaserIE(InfoExtractor):
  13. _VALID_URL = r'https?://(?:www\.)?podchaser\.com/podcasts/[\w-]+-(?P<podcast_id>\d+)(?:/episodes/[\w-]+-(?P<id>\d+))?'
  14. _PAGE_SIZE = 100
  15. _TESTS = [{
  16. 'url': 'https://www.podchaser.com/podcasts/cum-town-36924/episodes/ep-285-freeze-me-off-104365585',
  17. 'info_dict': {
  18. 'id': '104365585',
  19. 'title': 'Ep. 285 – freeze me off',
  20. 'description': 'cam ahn',
  21. 'thumbnail': r're:^https?://.*\.jpg$',
  22. 'ext': 'mp3',
  23. 'categories': ['Comedy'],
  24. 'tags': ['comedy', 'dark humor'],
  25. 'series': 'Cum Town',
  26. 'duration': 3708,
  27. 'timestamp': 1636531259,
  28. 'upload_date': '20211110',
  29. 'rating': 4.0
  30. }
  31. }, {
  32. 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853',
  33. 'info_dict': {
  34. 'id': '28853',
  35. 'title': 'The Bone Zone',
  36. 'description': 'Podcast by The Bone Zone',
  37. },
  38. 'playlist_count': 275
  39. }, {
  40. 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes',
  41. 'info_dict': {
  42. 'id': '699349',
  43. 'title': 'Sean Carroll\'s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas',
  44. 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1'
  45. },
  46. 'playlist_mincount': 225
  47. }]
  48. @staticmethod
  49. def _parse_episode(episode, podcast):
  50. return {
  51. 'id': str(episode.get('id')),
  52. 'title': episode.get('title'),
  53. 'description': episode.get('description'),
  54. 'url': episode.get('audio_url'),
  55. 'thumbnail': episode.get('image_url'),
  56. 'duration': str_to_int(episode.get('length')),
  57. 'timestamp': unified_timestamp(episode.get('air_date')),
  58. 'rating': float_or_none(episode.get('rating')),
  59. 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))),
  60. 'tags': traverse_obj(podcast, ('tags', ..., 'text')),
  61. 'series': podcast.get('title'),
  62. }
  63. def _call_api(self, path, *args, **kwargs):
  64. return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs)
  65. def _fetch_page(self, podcast_id, podcast, page):
  66. json_response = self._call_api(
  67. 'list/episode', podcast_id,
  68. headers={'Content-Type': 'application/json;charset=utf-8'},
  69. data=json.dumps({
  70. 'start': page * self._PAGE_SIZE,
  71. 'count': self._PAGE_SIZE,
  72. 'sort_order': 'SORT_ORDER_RECENT',
  73. 'filters': {
  74. 'podcast_id': podcast_id
  75. },
  76. 'options': {}
  77. }).encode())
  78. for episode in json_response['entities']:
  79. yield self._parse_episode(episode, podcast)
  80. def _real_extract(self, url):
  81. podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id')
  82. podcast = self._call_api(f'podcasts/{podcast_id}', episode_id or podcast_id)
  83. if not episode_id:
  84. return self.playlist_result(
  85. OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE),
  86. str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description'))
  87. episode = self._call_api(f'episodes/{episode_id}', episode_id)
  88. return self._parse_episode(episode, podcast)