123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242 |
- import re
- from .common import InfoExtractor
- from ..compat import compat_urlparse
- from ..utils import (
- clean_html,
- extract_attributes,
- ExtractorError,
- get_elements_by_class,
- int_or_none,
- js_to_json,
- smuggle_url,
- unescapeHTML,
- )
- def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
- """Return the content of the tag with the specified attribute in the passed HTML document"""
- if tag is None:
- tag = '[a-zA-Z0-9:._-]+'
- if attribute is None:
- attribute = ''
- else:
- attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
- if value is None:
- value = ''
- else:
- value = re.escape(value) if escape_value else value
- value = '=[\'"]?(?P<value>%s)[\'"]?' % value
- retlist = []
- for m in re.finditer(r'''(?xs)
- <(?P<tag>%s)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
- %s%s
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
- \s*>
- (?P<content>.*?)
- </\1>
- ''' % (tag, attribute, value), html):
- retlist.append(m)
- return retlist
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of ``_get_elements_by_tag_and_attrib``, or None.

    All arguments are forwarded unchanged to
    ``_get_elements_by_tag_and_attrib``.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
# Extractor for a single episode page (``/vodplay/<id>.html``) on duboku.io.
# The id is a dash-separated string whose first three parts are used as
# series id, season id and episode id.
class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.io'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'mp4',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'mp4',
            'series': '亲爱的自己',
            'title': 'contains:第1集',
            'season_number': 1,
            'episode_number': 1,
            'episode': 'Episode 1',
            'season': 'Season 1',
            'episode_id': '1',
            'season_id': '1',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    # Captures the object literal assigned to ``player_data`` in a script tag.
    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _extract_titles(self, webpage_html, series_id):
        """Scrape the episode title and series title from the page.

        Looks at every element with class ``title`` and picks the first one
        that contains a link whose href ends in ``/<series_id>.html``.

        Returns:
            A ``(title, series_title)`` tuple with whitespace runs collapsed
            to single spaces, or ``(None, None)`` when no element qualifies.
        """
        for html in get_elements_by_class('title', webpage_html):
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if not mobj:
                continue
            href = extract_attributes(mobj.group(0)).get('href')
            if not href:
                continue
            series_mobj = re.search(r'/(\d+)\.html', href)
            if not series_mobj or series_mobj.group(1) != series_id:
                continue
            # The link text is the series name; the whole element is the
            # full episode title.
            series_title = re.sub(r'\s+', ' ', clean_html(mobj.group(0)))
            title = re.sub(r'\s+', ' ', clean_html(html))
            return title, series_title
        return None, None

    def _real_extract(self, url):
        """Extract one episode.

        Returns a ``url_transparent`` result when the player data says the
        source is an iframe, otherwise an info dict with m3u8 formats.

        Raises:
            ExtractorError: if the player data contains no ``url``.
        """
        video_id = self._match_id(url)
        # _VALID_URL also accepts ids with fewer than three dash-separated
        # parts (e.g. "1575-1"); pad with None so they do not raise IndexError.
        series_id, season_id, episode_id = (video_id.split('-') + [None, None])[:3]

        webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url
        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title
        title, series_title = self._extract_titles(webpage_html, series_id)

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        data_from = player_data.get('from')

        headers = {'Referer': webpage_url}
        # Metadata shared by both result shapes below.
        info = {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
        }

        # if it is an embedded iframe, maybe it's an external source
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            info.update({
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': headers}),
            })
            return info

        info.update({
            'formats': self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers),
            'http_headers': headers,
        })
        return info
# Extractor for a series page (``/voddetail/<id>.html``) on duboku.io.
# The page may hold several playlists (elements whose id matches
# ``playlist<digits>``); a URL fragment selects one of them, otherwise the
# first one found is used.
class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.io entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://w.duboku.io/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://w.duboku.io/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        """Build a playlist of episode URLs from a series page.

        Raises:
            ExtractorError: if the URL does not match, or if the selected
                playlist is missing or empty.
        """
        url_match = self._match_valid_url(url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = url_match.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # Title: try <h1 class="title">, then the keywords meta tag, then
        # the <title> element.
        heading = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(heading.group('content')) if heading else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title_tag = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title_tag.group('content')) if title_tag else None

        # Collect every playlist container together with the links inside it.
        playlists = {}
        for container in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlists[container.group('value')] = [
                {
                    'href': unescapeHTML(link.group('value')),
                    'title': unescapeHTML(link.group('content')),
                }
                for link in _get_elements_by_tag_and_attrib(
                    container.group('content'), 'a', 'href',
                    value='[^\'"]+?', escape_value=False)
            ]

        # A URL fragment names the wanted playlist; otherwise take the first.
        if fragment:
            playlist_id = fragment
            playlist = playlists.get(fragment)
        elif playlists:
            playlist_id, playlist = next(iter(playlists.items()))
        else:
            playlist_id = playlist = None

        if not playlist:
            if fragment:
                raise ExtractorError('Cannot find %s' % fragment)
            raise ExtractorError('Cannot extract playlist')

        entries = [
            self.url_result(
                compat_urlparse.urljoin('https://w.duboku.io', item['href']),
                ie=DubokuIE.ie_key(), video_title=item.get('title'))
            for item in playlist
        ]
        return self.playlist_result(entries, series_id + '#' + playlist_id, title)
|