bbc.py 70 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644
  1. import functools
  2. import itertools
  3. import json
  4. import re
  5. import urllib.error
  6. import xml.etree.ElementTree
  7. from .common import InfoExtractor
  8. from ..compat import compat_HTTPError, compat_str, compat_urlparse
  9. from ..utils import (
  10. ExtractorError,
  11. OnDemandPagedList,
  12. clean_html,
  13. dict_get,
  14. float_or_none,
  15. get_element_by_class,
  16. int_or_none,
  17. js_to_json,
  18. parse_duration,
  19. parse_iso8601,
  20. parse_qs,
  21. strip_or_none,
  22. try_get,
  23. unescapeHTML,
  24. unified_timestamp,
  25. url_or_none,
  26. urlencode_postdata,
  27. urljoin,
  28. )
  29. class BBCCoUkIE(InfoExtractor):
  30. IE_NAME = 'bbc.co.uk'
  31. IE_DESC = 'BBC iPlayer'
  32. _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
  33. _VALID_URL = r'''(?x)
  34. https?://
  35. (?:www\.)?bbc\.co\.uk/
  36. (?:
  37. programmes/(?!articles/)|
  38. iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
  39. music/(?:clips|audiovideo/popular)[/#]|
  40. radio/player/|
  41. sounds/play/|
  42. events/[^/]+/play/[^/]+/
  43. )
  44. (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
  45. ''' % _ID_REGEX
  46. _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
  47. _LOGIN_URL = 'https://account.bbc.com/signin'
  48. _NETRC_MACHINE = 'bbc'
  49. _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
  50. _MEDIA_SETS = [
  51. # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
  52. # with geolocation in some cases even when the programme is not geo-restricted at all (e.g.
  53. # http://www.bbc.co.uk/programmes/b06bp7lf). It may also fail with selectionunavailable.
  54. 'iptv-all',
  55. 'pc',
  56. ]
  57. _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  58. _TESTS = [
  59. {
  60. 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  61. 'info_dict': {
  62. 'id': 'b039d07m',
  63. 'ext': 'flv',
  64. 'title': 'Kaleidoscope, Leonard Cohen',
  65. 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  66. },
  67. 'params': {
  68. # rtmp download
  69. 'skip_download': True,
  70. }
  71. },
  72. {
  73. 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  74. 'info_dict': {
  75. 'id': 'b00yng1d',
  76. 'ext': 'flv',
  77. 'title': 'The Man in Black: Series 3: The Printed Name',
  78. 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  79. 'duration': 1800,
  80. },
  81. 'params': {
  82. # rtmp download
  83. 'skip_download': True,
  84. },
  85. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  86. },
  87. {
  88. 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  89. 'info_dict': {
  90. 'id': 'b00yng1d',
  91. 'ext': 'flv',
  92. 'title': 'The Voice UK: Series 3: Blind Auditions 5',
  93. 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
  94. 'duration': 5100,
  95. },
  96. 'params': {
  97. # rtmp download
  98. 'skip_download': True,
  99. },
  100. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  101. },
  102. {
  103. 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
  104. 'info_dict': {
  105. 'id': 'b03k3pb7',
  106. 'ext': 'flv',
  107. 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
  108. 'description': '2. Invasion',
  109. 'duration': 3600,
  110. },
  111. 'params': {
  112. # rtmp download
  113. 'skip_download': True,
  114. },
  115. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  116. }, {
  117. 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
  118. 'info_dict': {
  119. 'id': 'b04v209v',
  120. 'ext': 'flv',
  121. 'title': 'Pete Tong, The Essential New Tune Special',
  122. 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
  123. 'duration': 10800,
  124. },
  125. 'params': {
  126. # rtmp download
  127. 'skip_download': True,
  128. },
  129. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  130. }, {
  131. 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
  132. 'note': 'Audio',
  133. 'info_dict': {
  134. 'id': 'p022h44j',
  135. 'ext': 'flv',
  136. 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
  137. 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
  138. 'duration': 227,
  139. },
  140. 'params': {
  141. # rtmp download
  142. 'skip_download': True,
  143. }
  144. }, {
  145. 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
  146. 'note': 'Video',
  147. 'info_dict': {
  148. 'id': 'p025c103',
  149. 'ext': 'flv',
  150. 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
  151. 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
  152. 'duration': 226,
  153. },
  154. 'params': {
  155. # rtmp download
  156. 'skip_download': True,
  157. }
  158. }, {
  159. 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
  160. 'info_dict': {
  161. 'id': 'p02n76xf',
  162. 'ext': 'flv',
  163. 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
  164. 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
  165. 'duration': 3540,
  166. },
  167. 'params': {
  168. # rtmp download
  169. 'skip_download': True,
  170. },
  171. 'skip': 'geolocation',
  172. }, {
  173. 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
  174. 'info_dict': {
  175. 'id': 'b05zmgw1',
  176. 'ext': 'flv',
  177. 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
  178. 'title': 'Royal Academy Summer Exhibition',
  179. 'duration': 3540,
  180. },
  181. 'params': {
  182. # rtmp download
  183. 'skip_download': True,
  184. },
  185. 'skip': 'geolocation',
  186. }, {
  187. # iptv-all mediaset fails with geolocation however there is no geo restriction
  188. # for this programme at all
  189. 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
  190. 'info_dict': {
  191. 'id': 'b06rkms3',
  192. 'ext': 'flv',
  193. 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
  194. 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
  195. },
  196. 'params': {
  197. # rtmp download
  198. 'skip_download': True,
  199. },
  200. 'skip': 'Now it\'s really geo-restricted',
  201. }, {
  202. # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
  203. 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
  204. 'info_dict': {
  205. 'id': 'p028bfkj',
  206. 'ext': 'flv',
  207. 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  208. 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  209. },
  210. 'params': {
  211. # rtmp download
  212. 'skip_download': True,
  213. },
  214. }, {
  215. 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
  216. 'note': 'Audio',
  217. 'info_dict': {
  218. 'id': 'm0007jz9',
  219. 'ext': 'mp4',
  220. 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
  221. 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
  222. 'duration': 9840,
  223. },
  224. 'params': {
  225. # rtmp download
  226. 'skip_download': True,
  227. }
  228. }, {
  229. 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
  230. 'only_matching': True,
  231. }, {
  232. 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
  233. 'only_matching': True,
  234. }, {
  235. 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
  236. 'only_matching': True,
  237. }, {
  238. 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
  239. 'only_matching': True,
  240. }, {
  241. 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
  242. 'only_matching': True,
  243. }, {
  244. 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
  245. 'only_matching': True,
  246. }, {
  247. 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
  248. 'only_matching': True,
  249. }, {
  250. 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
  251. 'only_matching': True,
  252. }]
  253. def _perform_login(self, username, password):
  254. login_page = self._download_webpage(
  255. self._LOGIN_URL, None, 'Downloading signin page')
  256. login_form = self._hidden_inputs(login_page)
  257. login_form.update({
  258. 'username': username,
  259. 'password': password,
  260. })
  261. post_url = urljoin(self._LOGIN_URL, self._search_regex(
  262. r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
  263. 'post url', default=self._LOGIN_URL, group='url'))
  264. response, urlh = self._download_webpage_handle(
  265. post_url, None, 'Logging in', data=urlencode_postdata(login_form),
  266. headers={'Referer': self._LOGIN_URL})
  267. if self._LOGIN_URL in urlh.geturl():
  268. error = clean_html(get_element_by_class('form-message', response))
  269. if error:
  270. raise ExtractorError(
  271. 'Unable to login: %s' % error, expected=True)
  272. raise ExtractorError('Unable to log in')
  273. class MediaSelectionError(Exception):
  274. def __init__(self, id):
  275. self.id = id
  276. def _extract_asx_playlist(self, connection, programme_id):
  277. asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
  278. return [ref.get('href') for ref in asx.findall('./Entry/ref')]
  279. def _extract_items(self, playlist):
  280. return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
  281. def _extract_medias(self, media_selection):
  282. error = media_selection.get('result')
  283. if error:
  284. raise BBCCoUkIE.MediaSelectionError(error)
  285. return media_selection.get('media') or []
  286. def _extract_connections(self, media):
  287. return media.get('connection') or []
  288. def _get_subtitles(self, media, programme_id):
  289. subtitles = {}
  290. for connection in self._extract_connections(media):
  291. cc_url = url_or_none(connection.get('href'))
  292. if not cc_url:
  293. continue
  294. captions = self._download_xml(
  295. cc_url, programme_id, 'Downloading captions', fatal=False)
  296. if not isinstance(captions, xml.etree.ElementTree.Element):
  297. continue
  298. subtitles['en'] = [
  299. {
  300. 'url': connection.get('href'),
  301. 'ext': 'ttml',
  302. },
  303. ]
  304. break
  305. return subtitles
  306. def _raise_extractor_error(self, media_selection_error):
  307. raise ExtractorError(
  308. '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
  309. expected=True)
  310. def _download_media_selector(self, programme_id):
  311. last_exception = None
  312. for media_set in self._MEDIA_SETS:
  313. try:
  314. return self._download_media_selector_url(
  315. self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
  316. except BBCCoUkIE.MediaSelectionError as e:
  317. if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
  318. last_exception = e
  319. continue
  320. self._raise_extractor_error(e)
  321. self._raise_extractor_error(last_exception)
  322. def _download_media_selector_url(self, url, programme_id=None):
  323. media_selection = self._download_json(
  324. url, programme_id, 'Downloading media selection JSON',
  325. expected_status=(403, 404))
  326. return self._process_media_selector(media_selection, programme_id)
  327. def _process_media_selector(self, media_selection, programme_id):
  328. formats = []
  329. subtitles = None
  330. urls = []
  331. for media in self._extract_medias(media_selection):
  332. kind = media.get('kind')
  333. if kind in ('video', 'audio'):
  334. bitrate = int_or_none(media.get('bitrate'))
  335. encoding = media.get('encoding')
  336. width = int_or_none(media.get('width'))
  337. height = int_or_none(media.get('height'))
  338. file_size = int_or_none(media.get('media_file_size'))
  339. for connection in self._extract_connections(media):
  340. href = connection.get('href')
  341. if href in urls:
  342. continue
  343. if href:
  344. urls.append(href)
  345. conn_kind = connection.get('kind')
  346. protocol = connection.get('protocol')
  347. supplier = connection.get('supplier')
  348. transfer_format = connection.get('transferFormat')
  349. format_id = supplier or conn_kind or protocol
  350. # ASX playlist
  351. if supplier == 'asx':
  352. for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
  353. formats.append({
  354. 'url': ref,
  355. 'format_id': 'ref%s_%s' % (i, format_id),
  356. })
  357. elif transfer_format == 'dash':
  358. formats.extend(self._extract_mpd_formats(
  359. href, programme_id, mpd_id=format_id, fatal=False))
  360. elif transfer_format == 'hls':
  361. # TODO: let expected_status be passed into _extract_xxx_formats() instead
  362. try:
  363. fmts = self._extract_m3u8_formats(
  364. href, programme_id, ext='mp4', entry_protocol='m3u8_native',
  365. m3u8_id=format_id, fatal=False)
  366. except ExtractorError as e:
  367. if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
  368. and e.exc_info[1].code in (403, 404)):
  369. raise
  370. fmts = []
  371. formats.extend(fmts)
  372. elif transfer_format == 'hds':
  373. formats.extend(self._extract_f4m_formats(
  374. href, programme_id, f4m_id=format_id, fatal=False))
  375. else:
  376. if not supplier and bitrate:
  377. format_id += '-%d' % bitrate
  378. fmt = {
  379. 'format_id': format_id,
  380. 'filesize': file_size,
  381. }
  382. if kind == 'video':
  383. fmt.update({
  384. 'width': width,
  385. 'height': height,
  386. 'tbr': bitrate,
  387. 'vcodec': encoding,
  388. })
  389. else:
  390. fmt.update({
  391. 'abr': bitrate,
  392. 'acodec': encoding,
  393. 'vcodec': 'none',
  394. })
  395. if protocol in ('http', 'https'):
  396. # Direct link
  397. fmt.update({
  398. 'url': href,
  399. })
  400. elif protocol == 'rtmp':
  401. application = connection.get('application', 'ondemand')
  402. auth_string = connection.get('authString')
  403. identifier = connection.get('identifier')
  404. server = connection.get('server')
  405. fmt.update({
  406. 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
  407. 'play_path': identifier,
  408. 'app': '%s?%s' % (application, auth_string),
  409. 'page_url': 'http://www.bbc.co.uk',
  410. 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
  411. 'rtmp_live': False,
  412. 'ext': 'flv',
  413. })
  414. else:
  415. continue
  416. formats.append(fmt)
  417. elif kind == 'captions':
  418. subtitles = self.extract_subtitles(media, programme_id)
  419. return formats, subtitles
  420. def _download_playlist(self, playlist_id):
  421. try:
  422. playlist = self._download_json(
  423. 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
  424. playlist_id, 'Downloading playlist JSON')
  425. formats = []
  426. subtitles = {}
  427. for version in playlist.get('allAvailableVersions', []):
  428. smp_config = version['smpConfig']
  429. title = smp_config['title']
  430. description = smp_config['summary']
  431. for item in smp_config['items']:
  432. kind = item['kind']
  433. if kind not in ('programme', 'radioProgramme'):
  434. continue
  435. programme_id = item.get('vpid')
  436. duration = int_or_none(item.get('duration'))
  437. version_formats, version_subtitles = self._download_media_selector(programme_id)
  438. types = version['types']
  439. for f in version_formats:
  440. f['format_note'] = ', '.join(types)
  441. if any('AudioDescribed' in x for x in types):
  442. f['language_preference'] = -10
  443. formats += version_formats
  444. for tag, subformats in (version_subtitles or {}).items():
  445. subtitles.setdefault(tag, []).extend(subformats)
  446. return programme_id, title, description, duration, formats, subtitles
  447. except ExtractorError as ee:
  448. if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
  449. raise
  450. # fallback to legacy playlist
  451. return self._process_legacy_playlist(playlist_id)
  452. def _process_legacy_playlist_url(self, url, display_id):
  453. playlist = self._download_legacy_playlist_url(url, display_id)
  454. return self._extract_from_legacy_playlist(playlist, display_id)
  455. def _process_legacy_playlist(self, playlist_id):
  456. return self._process_legacy_playlist_url(
  457. 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
  458. def _download_legacy_playlist_url(self, url, playlist_id=None):
  459. return self._download_xml(
  460. url, playlist_id, 'Downloading legacy playlist XML')
  461. def _extract_from_legacy_playlist(self, playlist, playlist_id):
  462. no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
  463. if no_items is not None:
  464. reason = no_items.get('reason')
  465. if reason == 'preAvailability':
  466. msg = 'Episode %s is not yet available' % playlist_id
  467. elif reason == 'postAvailability':
  468. msg = 'Episode %s is no longer available' % playlist_id
  469. elif reason == 'noMedia':
  470. msg = 'Episode %s is not currently available' % playlist_id
  471. else:
  472. msg = 'Episode %s is not available: %s' % (playlist_id, reason)
  473. raise ExtractorError(msg, expected=True)
  474. for item in self._extract_items(playlist):
  475. kind = item.get('kind')
  476. if kind not in ('programme', 'radioProgramme'):
  477. continue
  478. title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
  479. description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
  480. description = description_el.text if description_el is not None else None
  481. def get_programme_id(item):
  482. def get_from_attributes(item):
  483. for p in ('identifier', 'group'):
  484. value = item.get(p)
  485. if value and re.match(r'^[pb][\da-z]{7}$', value):
  486. return value
  487. get_from_attributes(item)
  488. mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
  489. if mediator is not None:
  490. return get_from_attributes(mediator)
  491. programme_id = get_programme_id(item)
  492. duration = int_or_none(item.get('duration'))
  493. if programme_id:
  494. formats, subtitles = self._download_media_selector(programme_id)
  495. else:
  496. formats, subtitles = self._process_media_selector(item, playlist_id)
  497. programme_id = playlist_id
  498. return programme_id, title, description, duration, formats, subtitles
  499. def _real_extract(self, url):
  500. group_id = self._match_id(url)
  501. webpage = self._download_webpage(url, group_id, 'Downloading video page')
  502. error = self._search_regex(
  503. r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
  504. webpage, 'error', default=None)
  505. if error:
  506. raise ExtractorError(error, expected=True)
  507. programme_id = None
  508. duration = None
  509. tviplayer = self._search_regex(
  510. r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
  511. webpage, 'player', default=None)
  512. if tviplayer:
  513. player = self._parse_json(tviplayer, group_id).get('player', {})
  514. duration = int_or_none(player.get('duration'))
  515. programme_id = player.get('vpid')
  516. if not programme_id:
  517. programme_id = self._search_regex(
  518. r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
  519. if programme_id:
  520. formats, subtitles = self._download_media_selector(programme_id)
  521. title = self._og_search_title(webpage, default=None) or self._html_search_regex(
  522. (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
  523. r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
  524. description = self._search_regex(
  525. (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
  526. r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
  527. webpage, 'description', default=None)
  528. if not description:
  529. description = self._html_search_meta('description', webpage)
  530. else:
  531. programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  532. return {
  533. 'id': programme_id,
  534. 'title': title,
  535. 'description': description,
  536. 'thumbnail': self._og_search_thumbnail(webpage, default=None),
  537. 'duration': duration,
  538. 'formats': formats,
  539. 'subtitles': subtitles,
  540. }
  541. class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
  542. IE_NAME = 'bbc'
  543. IE_DESC = 'BBC'
  544. _VALID_URL = r'''(?x)
  545. https?://(?:www\.)?(?:
  546. bbc\.(?:com|co\.uk)|
  547. bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
  548. bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
  549. )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
  550. _MEDIA_SETS = [
  551. 'pc',
  552. 'mobile-tablet-main',
  553. ]
  554. _TESTS = [{
  555. # article with multiple videos embedded with data-playable containing vpids
  556. 'url': 'http://www.bbc.com/news/world-europe-32668511',
  557. 'info_dict': {
  558. 'id': 'world-europe-32668511',
  559. 'title': 'Russia stages massive WW2 parade',
  560. 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
  561. },
  562. 'playlist_count': 2,
  563. }, {
  564. # article with multiple videos embedded with data-playable (more videos)
  565. 'url': 'http://www.bbc.com/news/business-28299555',
  566. 'info_dict': {
  567. 'id': 'business-28299555',
  568. 'title': 'Farnborough Airshow: Video highlights',
  569. 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
  570. },
  571. 'playlist_count': 9,
  572. 'skip': 'Save time',
  573. }, {
  574. # article with multiple videos embedded with `new SMP()`
  575. # broken
  576. 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
  577. 'info_dict': {
  578. 'id': '3662a707-0af9-3149-963f-47bea720b460',
  579. 'title': 'BUGGER',
  580. },
  581. 'playlist_count': 18,
  582. }, {
  583. # single video embedded with data-playable containing vpid
  584. 'url': 'http://www.bbc.com/news/world-europe-32041533',
  585. 'info_dict': {
  586. 'id': 'p02mprgb',
  587. 'ext': 'mp4',
  588. 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
  589. 'description': 'md5:2868290467291b37feda7863f7a83f54',
  590. 'duration': 47,
  591. 'timestamp': 1427219242,
  592. 'upload_date': '20150324',
  593. },
  594. 'params': {
  595. # rtmp download
  596. 'skip_download': True,
  597. }
  598. }, {
  599. # article with single video embedded with data-playable containing XML playlist
  600. # with direct video links as progressiveDownloadUrl (for now these are extracted)
  601. # and playlist with f4m and m3u8 as streamingUrl
  602. 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
  603. 'info_dict': {
  604. 'id': '150615_telabyad_kentin_cogu',
  605. 'ext': 'mp4',
  606. 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
  607. 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
  608. 'timestamp': 1434397334,
  609. 'upload_date': '20150615',
  610. },
  611. 'params': {
  612. 'skip_download': True,
  613. }
  614. }, {
  615. # single video embedded with data-playable containing XML playlists (regional section)
  616. 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
  617. 'info_dict': {
  618. 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
  619. 'ext': 'mp4',
  620. 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
  621. 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
  622. 'timestamp': 1434713142,
  623. 'upload_date': '20150619',
  624. },
  625. 'params': {
  626. 'skip_download': True,
  627. }
  628. }, {
  629. # single video from video playlist embedded with vxp-playlist-data JSON
  630. 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
  631. 'info_dict': {
  632. 'id': 'p02w6qjc',
  633. 'ext': 'mp4',
  634. 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  635. 'duration': 56,
  636. 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  637. },
  638. 'params': {
  639. 'skip_download': True,
  640. }
  641. }, {
  642. # single video story with digitalData
  643. 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
  644. 'info_dict': {
  645. 'id': 'p02q6gc4',
  646. 'ext': 'flv',
  647. 'title': 'Sri Lanka’s spicy secret',
  648. 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
  649. 'timestamp': 1437674293,
  650. 'upload_date': '20150723',
  651. },
  652. 'params': {
  653. # rtmp download
  654. 'skip_download': True,
  655. }
  656. }, {
  657. # single video story without digitalData
  658. 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
  659. 'info_dict': {
  660. 'id': 'p018zqqg',
  661. 'ext': 'mp4',
  662. 'title': 'Hyundai Santa Fe Sport: Rock star',
  663. 'description': 'md5:b042a26142c4154a6e472933cf20793d',
  664. 'timestamp': 1415867444,
  665. 'upload_date': '20141113',
  666. },
  667. 'params': {
  668. # rtmp download
  669. 'skip_download': True,
  670. }
  671. }, {
  672. # single video embedded with Morph
  673. 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
  674. 'info_dict': {
  675. 'id': 'p041vhd0',
  676. 'ext': 'mp4',
  677. 'title': "Nigeria v Japan - Men's First Round",
  678. 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
  679. 'duration': 7980,
  680. 'uploader': 'BBC Sport',
  681. 'uploader_id': 'bbc_sport',
  682. },
  683. 'params': {
  684. # m3u8 download
  685. 'skip_download': True,
  686. },
  687. 'skip': 'Georestricted to UK',
  688. }, {
  689. # single video with playlist.sxml URL in playlist param
  690. 'url': 'http://www.bbc.com/sport/0/football/33653409',
  691. 'info_dict': {
  692. 'id': 'p02xycnp',
  693. 'ext': 'mp4',
  694. 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
  695. 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
  696. 'duration': 140,
  697. },
  698. 'params': {
  699. # rtmp download
  700. 'skip_download': True,
  701. }
  702. }, {
  703. # article with multiple videos embedded with playlist.sxml in playlist param
  704. 'url': 'http://www.bbc.com/sport/0/football/34475836',
  705. 'info_dict': {
  706. 'id': '34475836',
  707. 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
  708. 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
  709. },
  710. 'playlist_count': 3,
  711. }, {
  712. # school report article with single video
  713. 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
  714. 'info_dict': {
  715. 'id': '35744779',
  716. 'title': 'School which breaks down barriers in Jerusalem',
  717. },
  718. 'playlist_count': 1,
  719. }, {
  720. # single video with playlist URL from weather section
  721. 'url': 'http://www.bbc.com/weather/features/33601775',
  722. 'only_matching': True,
  723. }, {
  724. # custom redirection to www.bbc.com
  725. # also, video with window.__INITIAL_DATA__
  726. 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
  727. 'info_dict': {
  728. 'id': 'p02xzws1',
  729. 'ext': 'mp4',
  730. 'title': "Pluto may have 'nitrogen glaciers'",
  731. 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
  732. 'thumbnail': r're:https?://.+/.+\.jpg',
  733. 'timestamp': 1437785037,
  734. 'upload_date': '20150725',
  735. },
  736. }, {
  737. # video with window.__INITIAL_DATA__ and value as JSON string
  738. 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
  739. 'info_dict': {
  740. 'id': 'p0b71qth',
  741. 'ext': 'mp4',
  742. 'title': 'Why France is making this woman a national hero',
  743. 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
  744. 'thumbnail': r're:https?://.+/.+\.jpg',
  745. 'timestamp': 1638230731,
  746. 'upload_date': '20211130',
  747. },
  748. }, {
  749. # single video article embedded with data-media-vpid
  750. 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
  751. 'only_matching': True,
  752. }, {
  753. # bbcthreeConfig
  754. 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
  755. 'info_dict': {
  756. 'id': 'p06556y7',
  757. 'ext': 'mp4',
  758. 'title': 'Things Not To Say to people that live on council estates',
  759. 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
  760. 'duration': 360,
  761. 'thumbnail': r're:https?://.+/.+\.jpg',
  762. },
  763. }, {
  764. # window.__PRELOADED_STATE__
  765. 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
  766. 'info_dict': {
  767. 'id': 'b0b9z4vz',
  768. 'ext': 'mp4',
  769. 'title': 'Prom 6: An American in Paris and Turangalila',
  770. 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
  771. 'uploader': 'Radio 3',
  772. 'uploader_id': 'bbc_radio_three',
  773. },
  774. }, {
  775. 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
  776. 'info_dict': {
  777. 'id': 'p06w9tws',
  778. 'ext': 'mp4',
  779. 'title': 'md5:2fabf12a726603193a2879a055f72514',
  780. 'description': 'Learn English words and phrases from this story',
  781. },
  782. 'add_ie': [BBCCoUkIE.ie_key()],
  783. }, {
  784. # BBC Reel
  785. 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
  786. 'info_dict': {
  787. 'id': 'p07c6sb9',
  788. 'ext': 'mp4',
  789. 'title': 'How positive thinking is harming your happiness',
  790. 'alt_title': 'The downsides of positive thinking',
  791. 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
  792. 'duration': 235,
  793. 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
  794. 'upload_date': '20190604',
  795. 'categories': ['Psychology'],
  796. },
  797. }, { # onion routes
  798. 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
  799. 'only_matching': True,
  800. }, {
  801. 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
  802. 'only_matching': True,
  803. }]
  804. @classmethod
  805. def suitable(cls, url):
  806. EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
  807. return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
  808. else super(BBCIE, cls).suitable(url))
  809. def _extract_from_media_meta(self, media_meta, video_id):
  810. # Direct links to media in media metadata (e.g.
  811. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  812. # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
  813. source_files = media_meta.get('sourceFiles')
  814. if source_files:
  815. return [{
  816. 'url': f['url'],
  817. 'format_id': format_id,
  818. 'ext': f.get('encoding'),
  819. 'tbr': float_or_none(f.get('bitrate'), 1000),
  820. 'filesize': int_or_none(f.get('filesize')),
  821. } for format_id, f in source_files.items() if f.get('url')], []
  822. programme_id = media_meta.get('externalId')
  823. if programme_id:
  824. return self._download_media_selector(programme_id)
  825. # Process playlist.sxml as legacy playlist
  826. href = media_meta.get('href')
  827. if href:
  828. playlist = self._download_legacy_playlist_url(href)
  829. _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
  830. return formats, subtitles
  831. return [], []
  832. def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
  833. programme_id, title, description, duration, formats, subtitles = \
  834. self._process_legacy_playlist_url(url, playlist_id)
  835. return {
  836. 'id': programme_id,
  837. 'title': title,
  838. 'description': description,
  839. 'duration': duration,
  840. 'timestamp': timestamp,
  841. 'formats': formats,
  842. 'subtitles': subtitles,
  843. }
  844. def _real_extract(self, url):
  845. playlist_id = self._match_id(url)
  846. webpage = self._download_webpage(url, playlist_id)
  847. json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
  848. timestamp = json_ld_info.get('timestamp')
  849. playlist_title = json_ld_info.get('title') or re.sub(
  850. r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
  851. playlist_description = json_ld_info.get(
  852. 'description') or self._og_search_description(webpage, default=None)
  853. if not timestamp:
  854. timestamp = parse_iso8601(self._search_regex(
  855. [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
  856. r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
  857. r'"datePublished":\s*"([^"]+)'],
  858. webpage, 'date', default=None))
  859. entries = []
  860. # article with multiple videos embedded with playlist.sxml (e.g.
  861. # http://www.bbc.com/sport/0/football/34475836)
  862. playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
  863. playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
  864. if playlists:
  865. entries = [
  866. self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
  867. for playlist_url in playlists]
  868. # news article with multiple videos embedded with data-playable
  869. data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
  870. if data_playables:
  871. for _, data_playable_json in data_playables:
  872. data_playable = self._parse_json(
  873. unescapeHTML(data_playable_json), playlist_id, fatal=False)
  874. if not data_playable:
  875. continue
  876. settings = data_playable.get('settings', {})
  877. if settings:
  878. # data-playable with video vpid in settings.playlistObject.items (e.g.
  879. # http://www.bbc.com/news/world-us-canada-34473351)
  880. playlist_object = settings.get('playlistObject', {})
  881. if playlist_object:
  882. items = playlist_object.get('items')
  883. if items and isinstance(items, list):
  884. title = playlist_object['title']
  885. description = playlist_object.get('summary')
  886. duration = int_or_none(items[0].get('duration'))
  887. programme_id = items[0].get('vpid')
  888. formats, subtitles = self._download_media_selector(programme_id)
  889. entries.append({
  890. 'id': programme_id,
  891. 'title': title,
  892. 'description': description,
  893. 'timestamp': timestamp,
  894. 'duration': duration,
  895. 'formats': formats,
  896. 'subtitles': subtitles,
  897. })
  898. else:
  899. # data-playable without vpid but with a playlist.sxml URLs
  900. # in otherSettings.playlist (e.g.
  901. # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
  902. playlist = data_playable.get('otherSettings', {}).get('playlist', {})
  903. if playlist:
  904. entry = None
  905. for key in ('streaming', 'progressiveDownload'):
  906. playlist_url = playlist.get('%sUrl' % key)
  907. if not playlist_url:
  908. continue
  909. try:
  910. info = self._extract_from_playlist_sxml(
  911. playlist_url, playlist_id, timestamp)
  912. if not entry:
  913. entry = info
  914. else:
  915. entry['title'] = info['title']
  916. entry['formats'].extend(info['formats'])
  917. except ExtractorError as e:
  918. # Some playlist URL may fail with 500, at the same time
  919. # the other one may work fine (e.g.
  920. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  921. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
  922. continue
  923. raise
  924. if entry:
  925. entries.append(entry)
  926. if entries:
  927. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  928. # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
  929. group_id = self._search_regex(
  930. r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
  931. webpage, 'group id', default=None)
  932. if group_id:
  933. return self.url_result(
  934. 'https://www.bbc.co.uk/programmes/%s' % group_id,
  935. ie=BBCCoUkIE.ie_key())
  936. # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
  937. programme_id = self._search_regex(
  938. [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
  939. r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
  940. r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
  941. webpage, 'vpid', default=None)
  942. if programme_id:
  943. formats, subtitles = self._download_media_selector(programme_id)
  944. # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
  945. digital_data = self._parse_json(
  946. self._search_regex(
  947. r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
  948. programme_id, fatal=False)
  949. page_info = digital_data.get('page', {}).get('pageInfo', {})
  950. title = page_info.get('pageName') or self._og_search_title(webpage)
  951. description = page_info.get('description') or self._og_search_description(webpage)
  952. timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
  953. return {
  954. 'id': programme_id,
  955. 'title': title,
  956. 'description': description,
  957. 'timestamp': timestamp,
  958. 'formats': formats,
  959. 'subtitles': subtitles,
  960. }
  961. # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
  962. initial_data = self._parse_json(self._html_search_regex(
  963. r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
  964. webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
  965. if initial_data:
  966. init_data = try_get(
  967. initial_data, lambda x: x['initData']['items'][0], dict) or {}
  968. smp_data = init_data.get('smpData') or {}
  969. clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
  970. version_id = clip_data.get('versionID')
  971. if version_id:
  972. title = smp_data['title']
  973. formats, subtitles = self._download_media_selector(version_id)
  974. image_url = smp_data.get('holdingImageURL')
  975. display_date = init_data.get('displayDate')
  976. topic_title = init_data.get('topicTitle')
  977. return {
  978. 'id': version_id,
  979. 'title': title,
  980. 'formats': formats,
  981. 'alt_title': init_data.get('shortTitle'),
  982. 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
  983. 'description': smp_data.get('summary') or init_data.get('shortSummary'),
  984. 'upload_date': display_date.replace('-', '') if display_date else None,
  985. 'subtitles': subtitles,
  986. 'duration': int_or_none(clip_data.get('duration')),
  987. 'categories': [topic_title] if topic_title else None,
  988. }
  989. # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
  990. # There are several setPayload calls may be present but the video
  991. # seems to be always related to the first one
  992. morph_payload = self._parse_json(
  993. self._search_regex(
  994. r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
  995. webpage, 'morph payload', default='{}'),
  996. playlist_id, fatal=False)
  997. if morph_payload:
  998. components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
  999. for component in components:
  1000. if not isinstance(component, dict):
  1001. continue
  1002. lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
  1003. if not lead_media:
  1004. continue
  1005. identifiers = lead_media.get('identifiers')
  1006. if not identifiers or not isinstance(identifiers, dict):
  1007. continue
  1008. programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
  1009. if not programme_id:
  1010. continue
  1011. title = lead_media.get('title') or self._og_search_title(webpage)
  1012. formats, subtitles = self._download_media_selector(programme_id)
  1013. description = lead_media.get('summary')
  1014. uploader = lead_media.get('masterBrand')
  1015. uploader_id = lead_media.get('mid')
  1016. duration = None
  1017. duration_d = lead_media.get('duration')
  1018. if isinstance(duration_d, dict):
  1019. duration = parse_duration(dict_get(
  1020. duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
  1021. return {
  1022. 'id': programme_id,
  1023. 'title': title,
  1024. 'description': description,
  1025. 'duration': duration,
  1026. 'uploader': uploader,
  1027. 'uploader_id': uploader_id,
  1028. 'formats': formats,
  1029. 'subtitles': subtitles,
  1030. }
  1031. preload_state = self._parse_json(self._search_regex(
  1032. r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
  1033. 'preload state', default='{}'), playlist_id, fatal=False)
  1034. if preload_state:
  1035. current_programme = preload_state.get('programmes', {}).get('current') or {}
  1036. programme_id = current_programme.get('id')
  1037. if current_programme and programme_id and current_programme.get('type') == 'playable_item':
  1038. title = current_programme.get('titles', {}).get('tertiary') or playlist_title
  1039. formats, subtitles = self._download_media_selector(programme_id)
  1040. synopses = current_programme.get('synopses') or {}
  1041. network = current_programme.get('network') or {}
  1042. duration = int_or_none(
  1043. current_programme.get('duration', {}).get('value'))
  1044. thumbnail = None
  1045. image_url = current_programme.get('image_url')
  1046. if image_url:
  1047. thumbnail = image_url.replace('{recipe}', 'raw')
  1048. return {
  1049. 'id': programme_id,
  1050. 'title': title,
  1051. 'description': dict_get(synopses, ('long', 'medium', 'short')),
  1052. 'thumbnail': thumbnail,
  1053. 'duration': duration,
  1054. 'uploader': network.get('short_title'),
  1055. 'uploader_id': network.get('id'),
  1056. 'formats': formats,
  1057. 'subtitles': subtitles,
  1058. }
  1059. bbc3_config = self._parse_json(
  1060. self._search_regex(
  1061. r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
  1062. 'bbcthree config', default='{}'),
  1063. playlist_id, transform_source=js_to_json, fatal=False) or {}
  1064. payload = bbc3_config.get('payload') or {}
  1065. if payload:
  1066. clip = payload.get('currentClip') or {}
  1067. clip_vpid = clip.get('vpid')
  1068. clip_title = clip.get('title')
  1069. if clip_vpid and clip_title:
  1070. formats, subtitles = self._download_media_selector(clip_vpid)
  1071. return {
  1072. 'id': clip_vpid,
  1073. 'title': clip_title,
  1074. 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
  1075. 'description': clip.get('description'),
  1076. 'duration': parse_duration(clip.get('duration')),
  1077. 'formats': formats,
  1078. 'subtitles': subtitles,
  1079. }
  1080. bbc3_playlist = try_get(
  1081. payload, lambda x: x['content']['bbcMedia']['playlist'],
  1082. dict)
  1083. if bbc3_playlist:
  1084. playlist_title = bbc3_playlist.get('title') or playlist_title
  1085. thumbnail = bbc3_playlist.get('holdingImageURL')
  1086. entries = []
  1087. for bbc3_item in bbc3_playlist['items']:
  1088. programme_id = bbc3_item.get('versionID')
  1089. if not programme_id:
  1090. continue
  1091. formats, subtitles = self._download_media_selector(programme_id)
  1092. entries.append({
  1093. 'id': programme_id,
  1094. 'title': playlist_title,
  1095. 'thumbnail': thumbnail,
  1096. 'timestamp': timestamp,
  1097. 'formats': formats,
  1098. 'subtitles': subtitles,
  1099. })
  1100. return self.playlist_result(
  1101. entries, playlist_id, playlist_title, playlist_description)
  1102. initial_data = self._search_regex(
  1103. r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
  1104. 'quoted preload state', default=None)
  1105. if initial_data is None:
  1106. initial_data = self._search_regex(
  1107. r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
  1108. 'preload state', default={})
  1109. else:
  1110. initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
  1111. initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
  1112. if initial_data:
  1113. def parse_media(media):
  1114. if not media:
  1115. return
  1116. for item in (try_get(media, lambda x: x['media']['items'], list) or []):
  1117. item_id = item.get('id')
  1118. item_title = item.get('title')
  1119. if not (item_id and item_title):
  1120. continue
  1121. formats, subtitles = self._download_media_selector(item_id)
  1122. item_desc = None
  1123. blocks = try_get(media, lambda x: x['summary']['blocks'], list)
  1124. if blocks:
  1125. summary = []
  1126. for block in blocks:
  1127. text = try_get(block, lambda x: x['model']['text'], compat_str)
  1128. if text:
  1129. summary.append(text)
  1130. if summary:
  1131. item_desc = '\n\n'.join(summary)
  1132. item_time = None
  1133. for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
  1134. if try_get(meta, lambda x: x['label']) == 'Published':
  1135. item_time = unified_timestamp(meta.get('timestamp'))
  1136. break
  1137. entries.append({
  1138. 'id': item_id,
  1139. 'title': item_title,
  1140. 'thumbnail': item.get('holdingImageUrl'),
  1141. 'formats': formats,
  1142. 'subtitles': subtitles,
  1143. 'timestamp': item_time,
  1144. 'description': strip_or_none(item_desc),
  1145. })
  1146. for resp in (initial_data.get('data') or {}).values():
  1147. name = resp.get('name')
  1148. if name == 'media-experience':
  1149. parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
  1150. elif name == 'article':
  1151. for block in (try_get(resp,
  1152. (lambda x: x['data']['blocks'],
  1153. lambda x: x['data']['content']['model']['blocks'],),
  1154. list) or []):
  1155. if block.get('type') not in ['media', 'video']:
  1156. continue
  1157. parse_media(block.get('model'))
  1158. return self.playlist_result(
  1159. entries, playlist_id, playlist_title, playlist_description)
  1160. def extract_all(pattern):
  1161. return list(filter(None, map(
  1162. lambda s: self._parse_json(s, playlist_id, fatal=False),
  1163. re.findall(pattern, webpage))))
  1164. # Multiple video article (e.g.
  1165. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
  1166. EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
  1167. entries = []
  1168. for match in extract_all(r'new\s+SMP\(({.+?})\)'):
  1169. embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
  1170. if embed_url and re.match(EMBED_URL, embed_url):
  1171. entries.append(embed_url)
  1172. entries.extend(re.findall(
  1173. r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
  1174. if entries:
  1175. return self.playlist_result(
  1176. [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
  1177. playlist_id, playlist_title, playlist_description)
  1178. # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
  1179. medias = extract_all(r"data-media-meta='({[^']+})'")
  1180. if not medias:
  1181. # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
  1182. media_asset = self._search_regex(
  1183. r'mediaAssetPage\.init\(\s*({.+?}), "/',
  1184. webpage, 'media asset', default=None)
  1185. if media_asset:
  1186. media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
  1187. medias = []
  1188. for video in media_asset_page.get('videos', {}).values():
  1189. medias.extend(video.values())
  1190. if not medias:
  1191. # Multiple video playlist with single `now playing` entry (e.g.
  1192. # http://www.bbc.com/news/video_and_audio/must_see/33767813)
  1193. vxp_playlist = self._parse_json(
  1194. self._search_regex(
  1195. r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
  1196. webpage, 'playlist data'),
  1197. playlist_id)
  1198. playlist_medias = []
  1199. for item in vxp_playlist:
  1200. media = item.get('media')
  1201. if not media:
  1202. continue
  1203. playlist_medias.append(media)
  1204. # Download single video if found media with asset id matching the video id from URL
  1205. if item.get('advert', {}).get('assetId') == playlist_id:
  1206. medias = [media]
  1207. break
  1208. # Fallback to the whole playlist
  1209. if not medias:
  1210. medias = playlist_medias
  1211. entries = []
  1212. for num, media_meta in enumerate(medias, start=1):
  1213. formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
  1214. if not formats and not self.get_param('ignore_no_formats'):
  1215. continue
  1216. video_id = media_meta.get('externalId')
  1217. if not video_id:
  1218. video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
  1219. title = media_meta.get('caption')
  1220. if not title:
  1221. title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
  1222. duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
  1223. images = []
  1224. for image in media_meta.get('images', {}).values():
  1225. images.extend(image.values())
  1226. if 'image' in media_meta:
  1227. images.append(media_meta['image'])
  1228. thumbnails = [{
  1229. 'url': image.get('href'),
  1230. 'width': int_or_none(image.get('width')),
  1231. 'height': int_or_none(image.get('height')),
  1232. } for image in images]
  1233. entries.append({
  1234. 'id': video_id,
  1235. 'title': title,
  1236. 'thumbnails': thumbnails,
  1237. 'duration': duration,
  1238. 'timestamp': timestamp,
  1239. 'formats': formats,
  1240. 'subtitles': subtitles,
  1241. })
  1242. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  1243. class BBCCoUkArticleIE(InfoExtractor):
  1244. _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
  1245. IE_NAME = 'bbc.co.uk:article'
  1246. IE_DESC = 'BBC articles'
  1247. _TEST = {
  1248. 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
  1249. 'info_dict': {
  1250. 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
  1251. 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
  1252. 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
  1253. },
  1254. 'playlist_count': 4,
  1255. 'add_ie': ['BBCCoUk'],
  1256. }
  1257. def _real_extract(self, url):
  1258. playlist_id = self._match_id(url)
  1259. webpage = self._download_webpage(url, playlist_id)
  1260. title = self._og_search_title(webpage)
  1261. description = self._og_search_description(webpage).strip()
  1262. entries = [self.url_result(programme_url) for programme_url in re.findall(
  1263. r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
  1264. return self.playlist_result(entries, playlist_id, title, description)
  1265. class BBCCoUkPlaylistBaseIE(InfoExtractor):
  1266. def _entries(self, webpage, url, playlist_id):
  1267. single_page = 'page' in compat_urlparse.parse_qs(
  1268. compat_urlparse.urlparse(url).query)
  1269. for page_num in itertools.count(2):
  1270. for video_id in re.findall(
  1271. self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
  1272. yield self.url_result(
  1273. self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
  1274. if single_page:
  1275. return
  1276. next_page = self._search_regex(
  1277. r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
  1278. webpage, 'next page url', default=None, group='url')
  1279. if not next_page:
  1280. break
  1281. webpage = self._download_webpage(
  1282. compat_urlparse.urljoin(url, next_page), playlist_id,
  1283. 'Downloading page %d' % page_num, page_num)
  1284. def _real_extract(self, url):
  1285. playlist_id = self._match_id(url)
  1286. webpage = self._download_webpage(url, playlist_id)
  1287. title, description = self._extract_title_and_description(webpage)
  1288. return self.playlist_result(
  1289. self._entries(webpage, url, playlist_id),
  1290. playlist_id, title, description)
  1291. class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
  1292. _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
  1293. @staticmethod
  1294. def _get_default(episode, key, default_key='default'):
  1295. return try_get(episode, lambda x: x[key][default_key])
  1296. def _get_description(self, data):
  1297. synopsis = data.get(self._DESCRIPTION_KEY) or {}
  1298. return dict_get(synopsis, ('large', 'medium', 'small'))
  1299. def _fetch_page(self, programme_id, per_page, series_id, page):
  1300. elements = self._get_elements(self._call_api(
  1301. programme_id, per_page, page + 1, series_id))
  1302. for element in elements:
  1303. episode = self._get_episode(element)
  1304. episode_id = episode.get('id')
  1305. if not episode_id:
  1306. continue
  1307. thumbnail = None
  1308. image = self._get_episode_image(episode)
  1309. if image:
  1310. thumbnail = image.replace('{recipe}', 'raw')
  1311. category = self._get_default(episode, 'labels', 'category')
  1312. yield {
  1313. '_type': 'url',
  1314. 'id': episode_id,
  1315. 'title': self._get_episode_field(episode, 'subtitle'),
  1316. 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
  1317. 'thumbnail': thumbnail,
  1318. 'description': self._get_description(episode),
  1319. 'categories': [category] if category else None,
  1320. 'series': self._get_episode_field(episode, 'title'),
  1321. 'ie_key': BBCCoUkIE.ie_key(),
  1322. }
  1323. def _real_extract(self, url):
  1324. pid = self._match_id(url)
  1325. qs = parse_qs(url)
  1326. series_id = qs.get('seriesId', [None])[0]
  1327. page = qs.get('page', [None])[0]
  1328. per_page = 36 if page else self._PAGE_SIZE
  1329. fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
  1330. entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
  1331. playlist_data = self._get_playlist_data(self._call_api(pid, 1))
  1332. return self.playlist_result(
  1333. entries, pid, self._get_playlist_title(playlist_data),
  1334. self._get_description(playlist_data))
  1335. class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
  1336. IE_NAME = 'bbc.co.uk:iplayer:episodes'
  1337. _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
  1338. _TESTS = [{
  1339. 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
  1340. 'info_dict': {
  1341. 'id': 'b05rcz9v',
  1342. 'title': 'The Disappearance',
  1343. 'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
  1344. },
  1345. 'playlist_mincount': 8,
  1346. }, {
  1347. # all seasons
  1348. 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
  1349. 'info_dict': {
  1350. 'id': 'b094m5t9',
  1351. 'title': 'Doctor Foster',
  1352. 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
  1353. },
  1354. 'playlist_mincount': 10,
  1355. }, {
  1356. # explicit season
  1357. 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
  1358. 'info_dict': {
  1359. 'id': 'b094m5t9',
  1360. 'title': 'Doctor Foster',
  1361. 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
  1362. },
  1363. 'playlist_mincount': 5,
  1364. }, {
  1365. # all pages
  1366. 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
  1367. 'info_dict': {
  1368. 'id': 'm0004c4v',
  1369. 'title': 'Beechgrove',
  1370. 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
  1371. },
  1372. 'playlist_mincount': 37,
  1373. }, {
  1374. # explicit page
  1375. 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
  1376. 'info_dict': {
  1377. 'id': 'm0004c4v',
  1378. 'title': 'Beechgrove',
  1379. 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
  1380. },
  1381. 'playlist_mincount': 1,
  1382. }]
  1383. _PAGE_SIZE = 100
  1384. _DESCRIPTION_KEY = 'synopsis'
  1385. def _get_episode_image(self, episode):
  1386. return self._get_default(episode, 'image')
  1387. def _get_episode_field(self, episode, field):
  1388. return self._get_default(episode, field)
  1389. @staticmethod
  1390. def _get_elements(data):
  1391. return data['entities']['results']
  1392. @staticmethod
  1393. def _get_episode(element):
  1394. return element.get('episode') or {}
  def _call_api(self, pid, per_page, page=1, series_id=None):
      """Request one page of a programme from ``graph.ibl.api.bbc.co.uk``.

      A JSON body holding a fixed query ``id`` and the ``variables``
      (``id``, ``page``, ``perPage`` and, only when *series_id* is truthy,
      ``sliceId``) is sent through ``_download_json``; *pid* doubles as the
      second argument of that call.

      Returns the ``['data']['programme']`` part of what
      ``_download_json`` returns.
      """
      query_vars = {'id': pid, 'page': page, 'perPage': per_page}
      if series_id:
          query_vars['sliceId'] = series_id

      # NOTE(review): the fixed id presumably names a stored query on the
      # server side -- confirm.
      payload = json.dumps({
          'id': '5692d93d5aac8d796a0305e895e61551',
          'variables': query_vars,
      }).encode('utf-8')

      response = self._download_json(
          'https://graph.ibl.api.bbc.co.uk/', pid,
          headers={'Content-Type': 'application/json'},
          data=payload)
      return response['data']['programme']
  1410. @staticmethod
  1411. def _get_playlist_data(data):
  1412. return data
  def _get_playlist_title(self, data):
      """Return what ``_get_default`` yields for the ``'title'`` key of *data*."""
      title = self._get_default(data, 'title')
      return title
  class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
      """Playlist extractor for ``bbc.co.uk/iplayer/group/<id>`` pages.

      Supplies the group-specific hooks (API call, element access and
      field access) on top of ``BBCCoUkIPlayerPlaylistBaseIE``.
      """

      IE_NAME = 'bbc.co.uk:iplayer:group'
      _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
      _TESTS = [{
          # Available for over a year unlike 30 days for most other programmes
          'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
          'info_dict': {
              'id': 'p02tcc32',
              'title': 'Bohemian Icons',
              'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
          },
          'playlist_mincount': 10,
      }, {
          # all pages
          'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
          'info_dict': {
              'id': 'p081d7j7',
              'title': 'Music in Scotland',
              'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
          },
          'playlist_mincount': 47,
      }, {
          # explicit page
          'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
          'info_dict': {
              'id': 'p081d7j7',
              'title': 'Music in Scotland',
              'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
          },
          'playlist_mincount': 11,
      }]
      _PAGE_SIZE = 200
      _DESCRIPTION_KEY = 'synopses'

      def _get_episode_image(self, episode):
          """Return what ``_get_default`` yields for ``'images'`` / ``'standard'``."""
          image = self._get_default(episode, 'images', 'standard')
          return image

      def _get_episode_field(self, episode, field):
          """Return ``episode[field]``, or None when the key is absent."""
          value = episode.get(field)
          return value

      @staticmethod
      def _get_elements(data):
          """Return the ``'elements'`` value of *data* (KeyError if missing)."""
          elements = data['elements']
          return elements

      @staticmethod
      def _get_episode(element):
          """Return *element* unchanged: here each element is the episode."""
          episode = element
          return episode

      def _call_api(self, pid, per_page, page=1, series_id=None):
          """Fetch one page of the episodes of group *pid*.

          *page* and *per_page* are sent as the ``page`` and ``per_page``
          query parameters.  *series_id* is accepted but not used by this
          implementation.

          Returns the ``'group_episodes'`` part of what ``_download_json``
          returns.
          """
          api_url = 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid
          paging = {
              'page': page,
              'per_page': per_page,
          }
          response = self._download_json(api_url, pid, query=paging)
          return response['group_episodes']

      @staticmethod
      def _get_playlist_data(data):
          """Return the ``'group'`` value of *data* (KeyError if missing)."""
          group = data['group']
          return group

      def _get_playlist_title(self, data):
          """Return ``data['title']``, or None when the key is absent."""
          title = data.get('title')
          return title
  class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
      """Playlist extractor for ``bbc.co.uk/programmes/<id>`` listing pages.

      Matches the ``episodes``, ``broadcasts`` and ``clips`` sub-pages of a
      programme and provides the title/description hook for
      ``BBCCoUkPlaylistBaseIE``.
      """

      IE_NAME = 'bbc.co.uk:playlist'
      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
      _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
      _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
      _TESTS = [{
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
          'info_dict': {
              'id': 'b05rcz9v',
              'title': 'The Disappearance - Clips - BBC Four',
              'description': 'French thriller serial about a missing teenager.',
          },
          'playlist_mincount': 7,
      }, {
          # multipage playlist, explicit page
          'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
          'info_dict': {
              'id': 'b00mfl7n',
              'title': 'Frozen Planet - Clips - BBC One',
              'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
          },
          'playlist_mincount': 24,
      }, {
          # multipage playlist, all pages
          'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
          'info_dict': {
              'id': 'b00mfl7n',
              'title': 'Frozen Planet - Clips - BBC One',
              'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
          },
          'playlist_mincount': 142,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
          'only_matching': True,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
          'only_matching': True,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
          'only_matching': True,
      }]

      def _extract_title_and_description(self, webpage):
          """Return a ``(title, description)`` tuple for *webpage*.

          The title comes from ``_og_search_title`` called with
          ``fatal=False``; the description comes from
          ``_og_search_description``.
          """
          return (
              self._og_search_title(webpage, fatal=False),
              self._og_search_description(webpage),
          )