pornhub.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817
  1. import functools
  2. import itertools
  3. import math
  4. import operator
  5. import re
  6. import urllib.request
  7. from .common import InfoExtractor
  8. from .openload import PhantomJSwrapper
  9. from ..compat import compat_HTTPError, compat_str
  10. from ..utils import (
  11. NO_DEFAULT,
  12. ExtractorError,
  13. clean_html,
  14. determine_ext,
  15. format_field,
  16. int_or_none,
  17. merge_dicts,
  18. orderedSet,
  19. remove_quotes,
  20. remove_start,
  21. str_to_int,
  22. update_url_query,
  23. url_or_none,
  24. urlencode_postdata,
  25. )
  26. class PornHubBaseIE(InfoExtractor):
  27. _NETRC_MACHINE = 'pornhub'
  28. _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
  29. def _download_webpage_handle(self, *args, **kwargs):
  30. def dl(*args, **kwargs):
  31. return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
  32. ret = dl(*args, **kwargs)
  33. if not ret:
  34. return ret
  35. webpage, urlh = ret
  36. if any(re.search(p, webpage) for p in (
  37. r'<body\b[^>]+\bonload=["\']go\(\)',
  38. r'document\.cookie\s*=\s*["\']RNKEY=',
  39. r'document\.location\.reload\(true\)')):
  40. url_or_request = args[0]
  41. url = (url_or_request.get_full_url()
  42. if isinstance(url_or_request, urllib.request.Request)
  43. else url_or_request)
  44. phantom = PhantomJSwrapper(self, required_version='2.0')
  45. phantom.get(url, html=webpage)
  46. webpage, urlh = dl(*args, **kwargs)
  47. return webpage, urlh
  48. def _real_initialize(self):
  49. self._logged_in = False
  50. def _login(self, host):
  51. if self._logged_in:
  52. return
  53. site = host.split('.')[0]
  54. # Both sites pornhub and pornhubpremium have separate accounts
  55. # so there should be an option to provide credentials for both.
  56. # At the same time some videos are available under the same video id
  57. # on both sites so that we have to identify them as the same video.
  58. # For that purpose we have to keep both in the same extractor
  59. # but under different netrc machines.
  60. username, password = self._get_login_info(netrc_machine=site)
  61. if username is None:
  62. return
  63. login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
  64. login_page = self._download_webpage(
  65. login_url, None, 'Downloading %s login page' % site)
  66. def is_logged(webpage):
  67. return any(re.search(p, webpage) for p in (
  68. r'class=["\']signOut',
  69. r'>Sign\s+[Oo]ut\s*<'))
  70. if is_logged(login_page):
  71. self._logged_in = True
  72. return
  73. login_form = self._hidden_inputs(login_page)
  74. login_form.update({
  75. 'username': username,
  76. 'password': password,
  77. })
  78. response = self._download_json(
  79. 'https://www.%s/front/authenticate' % host, None,
  80. 'Logging in to %s' % site,
  81. data=urlencode_postdata(login_form),
  82. headers={
  83. 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
  84. 'Referer': login_url,
  85. 'X-Requested-With': 'XMLHttpRequest',
  86. })
  87. if response.get('success') == '1':
  88. self._logged_in = True
  89. return
  90. message = response.get('message')
  91. if message is not None:
  92. raise ExtractorError(
  93. 'Unable to login: %s' % message, expected=True)
  94. raise ExtractorError('Unable to log in')
  95. class PornHubIE(PornHubBaseIE):
  96. IE_DESC = 'PornHub and Thumbzilla'
  97. _VALID_URL = r'''(?x)
  98. https?://
  99. (?:
  100. (?:[^/]+\.)?
  101. %s
  102. /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  103. (?:www\.)?thumbzilla\.com/video/
  104. )
  105. (?P<id>[\da-z]+)
  106. ''' % PornHubBaseIE._PORNHUB_HOST_RE
  107. _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
  108. _TESTS = [{
  109. 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  110. 'md5': 'a6391306d050e4547f62b3f485dd9ba9',
  111. 'info_dict': {
  112. 'id': '648719015',
  113. 'ext': 'mp4',
  114. 'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  115. 'uploader': 'Babes',
  116. 'upload_date': '20130628',
  117. 'timestamp': 1372447216,
  118. 'duration': 361,
  119. 'view_count': int,
  120. 'like_count': int,
  121. 'dislike_count': int,
  122. 'comment_count': int,
  123. 'age_limit': 18,
  124. 'tags': list,
  125. 'categories': list,
  126. 'cast': list,
  127. },
  128. }, {
  129. # non-ASCII title
  130. 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  131. 'info_dict': {
  132. 'id': '1331683002',
  133. 'ext': 'mp4',
  134. 'title': '重庆婷婷女王足交',
  135. 'upload_date': '20150213',
  136. 'timestamp': 1423804862,
  137. 'duration': 1753,
  138. 'view_count': int,
  139. 'like_count': int,
  140. 'dislike_count': int,
  141. 'comment_count': int,
  142. 'age_limit': 18,
  143. 'tags': list,
  144. 'categories': list,
  145. },
  146. 'params': {
  147. 'skip_download': True,
  148. },
  149. 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
  150. }, {
  151. # subtitles
  152. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  153. 'info_dict': {
  154. 'id': 'ph5af5fef7c2aa7',
  155. 'ext': 'mp4',
  156. 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  157. 'uploader': 'BFFs',
  158. 'duration': 622,
  159. 'view_count': int,
  160. 'like_count': int,
  161. 'dislike_count': int,
  162. 'comment_count': int,
  163. 'age_limit': 18,
  164. 'tags': list,
  165. 'categories': list,
  166. 'subtitles': {
  167. 'en': [{
  168. "ext": 'srt'
  169. }]
  170. },
  171. },
  172. 'params': {
  173. 'skip_download': True,
  174. },
  175. 'skip': 'This video has been disabled',
  176. }, {
  177. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
  178. 'info_dict': {
  179. 'id': 'ph601dc30bae19a',
  180. 'uploader': 'Projekt Melody',
  181. 'uploader_id': 'projekt-melody',
  182. 'upload_date': '20210205',
  183. 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
  184. 'thumbnail': r're:https?://.+',
  185. },
  186. }, {
  187. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
  188. 'only_matching': True,
  189. }, {
  190. # removed at the request of cam4.com
  191. 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
  192. 'only_matching': True,
  193. }, {
  194. # removed at the request of the copyright owner
  195. 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
  196. 'only_matching': True,
  197. }, {
  198. # removed by uploader
  199. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
  200. 'only_matching': True,
  201. }, {
  202. # private video
  203. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
  204. 'only_matching': True,
  205. }, {
  206. 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
  207. 'only_matching': True,
  208. }, {
  209. 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
  210. 'only_matching': True,
  211. }, {
  212. 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
  213. 'only_matching': True,
  214. }, {
  215. 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
  216. 'only_matching': True,
  217. }, {
  218. 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
  219. 'only_matching': True,
  220. }, {
  221. # Some videos are available with the same id on both premium
  222. # and non-premium sites (e.g. this and the following test)
  223. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
  224. 'only_matching': True,
  225. }, {
  226. 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
  227. 'only_matching': True,
  228. }, {
  229. # geo restricted
  230. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
  231. 'only_matching': True,
  232. }, {
  233. 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
  234. 'only_matching': True,
  235. }]
  236. def _extract_count(self, pattern, webpage, name):
  237. return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
  238. def _real_extract(self, url):
  239. mobj = self._match_valid_url(url)
  240. host = mobj.group('host') or 'pornhub.com'
  241. video_id = mobj.group('id')
  242. self._login(host)
  243. self._set_cookie(host, 'age_verified', '1')
  244. def dl_webpage(platform):
  245. self._set_cookie(host, 'platform', platform)
  246. return self._download_webpage(
  247. 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
  248. video_id, 'Downloading %s webpage' % platform)
  249. webpage = dl_webpage('pc')
  250. error_msg = self._html_search_regex(
  251. (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
  252. r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
  253. webpage, 'error message', default=None, group='error')
  254. if error_msg:
  255. error_msg = re.sub(r'\s+', ' ', error_msg)
  256. raise ExtractorError(
  257. 'PornHub said: %s' % error_msg,
  258. expected=True, video_id=video_id)
  259. if any(re.search(p, webpage) for p in (
  260. r'class=["\']geoBlocked["\']',
  261. r'>\s*This content is unavailable in your country')):
  262. self.raise_geo_restricted()
  263. # video_title from flashvars contains whitespace instead of non-ASCII (see
  264. # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
  265. # on that anymore.
  266. title = self._html_search_meta(
  267. 'twitter:title', webpage, default=None) or self._html_search_regex(
  268. (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
  269. r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
  270. r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
  271. webpage, 'title', group='title')
  272. video_urls = []
  273. video_urls_set = set()
  274. subtitles = {}
  275. flashvars = self._parse_json(
  276. self._search_regex(
  277. r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
  278. video_id)
  279. if flashvars:
  280. subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
  281. if subtitle_url:
  282. subtitles.setdefault('en', []).append({
  283. 'url': subtitle_url,
  284. 'ext': 'srt',
  285. })
  286. thumbnail = flashvars.get('image_url')
  287. duration = int_or_none(flashvars.get('video_duration'))
  288. media_definitions = flashvars.get('mediaDefinitions')
  289. if isinstance(media_definitions, list):
  290. for definition in media_definitions:
  291. if not isinstance(definition, dict):
  292. continue
  293. video_url = definition.get('videoUrl')
  294. if not video_url or not isinstance(video_url, compat_str):
  295. continue
  296. if video_url in video_urls_set:
  297. continue
  298. video_urls_set.add(video_url)
  299. video_urls.append(
  300. (video_url, int_or_none(definition.get('quality'))))
  301. else:
  302. thumbnail, duration = [None] * 2
  303. def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
  304. assignments = self._search_regex(
  305. pattern, webpage, 'encoded url', default=default)
  306. if not assignments:
  307. return {}
  308. assignments = assignments.split(';')
  309. js_vars = {}
  310. def parse_js_value(inp):
  311. inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
  312. if '+' in inp:
  313. inps = inp.split('+')
  314. return functools.reduce(
  315. operator.concat, map(parse_js_value, inps))
  316. inp = inp.strip()
  317. if inp in js_vars:
  318. return js_vars[inp]
  319. return remove_quotes(inp)
  320. for assn in assignments:
  321. assn = assn.strip()
  322. if not assn:
  323. continue
  324. assn = re.sub(r'var\s+', '', assn)
  325. vname, value = assn.split('=', 1)
  326. js_vars[vname] = parse_js_value(value)
  327. return js_vars
  328. def add_video_url(video_url):
  329. v_url = url_or_none(video_url)
  330. if not v_url:
  331. return
  332. if v_url in video_urls_set:
  333. return
  334. video_urls.append((v_url, None))
  335. video_urls_set.add(v_url)
  336. def parse_quality_items(quality_items):
  337. q_items = self._parse_json(quality_items, video_id, fatal=False)
  338. if not isinstance(q_items, list):
  339. return
  340. for item in q_items:
  341. if isinstance(item, dict):
  342. add_video_url(item.get('url'))
  343. if not video_urls:
  344. FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
  345. js_vars = extract_js_vars(
  346. webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
  347. default=None)
  348. if js_vars:
  349. for key, format_url in js_vars.items():
  350. if key.startswith(FORMAT_PREFIXES[-1]):
  351. parse_quality_items(format_url)
  352. elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
  353. add_video_url(format_url)
  354. if not video_urls and re.search(
  355. r'<[^>]+\bid=["\']lockedPlayer', webpage):
  356. raise ExtractorError(
  357. 'Video %s is locked' % video_id, expected=True)
  358. if not video_urls:
  359. js_vars = extract_js_vars(
  360. dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
  361. add_video_url(js_vars['mediastring'])
  362. for mobj in re.finditer(
  363. r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
  364. webpage):
  365. video_url = mobj.group('url')
  366. if video_url not in video_urls_set:
  367. video_urls.append((video_url, None))
  368. video_urls_set.add(video_url)
  369. upload_date = None
  370. formats = []
  371. def add_format(format_url, height=None):
  372. ext = determine_ext(format_url)
  373. if ext == 'mpd':
  374. formats.extend(self._extract_mpd_formats(
  375. format_url, video_id, mpd_id='dash', fatal=False))
  376. return
  377. if ext == 'm3u8':
  378. formats.extend(self._extract_m3u8_formats(
  379. format_url, video_id, 'mp4', entry_protocol='m3u8_native',
  380. m3u8_id='hls', fatal=False))
  381. return
  382. if not height:
  383. height = int_or_none(self._search_regex(
  384. r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
  385. default=None))
  386. formats.append({
  387. 'url': format_url,
  388. 'format_id': format_field(height, None, '%dp'),
  389. 'height': height,
  390. })
  391. for video_url, height in video_urls:
  392. if not upload_date:
  393. upload_date = self._search_regex(
  394. r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
  395. if upload_date:
  396. upload_date = upload_date.replace('/', '')
  397. if '/video/get_media' in video_url:
  398. medias = self._download_json(video_url, video_id, fatal=False)
  399. if isinstance(medias, list):
  400. for media in medias:
  401. if not isinstance(media, dict):
  402. continue
  403. video_url = url_or_none(media.get('videoUrl'))
  404. if not video_url:
  405. continue
  406. height = int_or_none(media.get('quality'))
  407. add_format(video_url, height)
  408. continue
  409. add_format(video_url)
  410. model_profile = self._search_json(
  411. r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
  412. video_uploader = self._html_search_regex(
  413. r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
  414. webpage, 'uploader', default=None) or model_profile.get('username')
  415. def extract_vote_count(kind, name):
  416. return self._extract_count(
  417. (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind,
  418. r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind),
  419. webpage, name)
  420. view_count = self._extract_count(
  421. r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
  422. like_count = extract_vote_count('Up', 'like')
  423. dislike_count = extract_vote_count('Down', 'dislike')
  424. comment_count = self._extract_count(
  425. r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
  426. def extract_list(meta_key):
  427. div = self._search_regex(
  428. r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
  429. % meta_key, webpage, meta_key, default=None)
  430. if div:
  431. return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
  432. info = self._search_json_ld(webpage, video_id, default={})
  433. # description provided in JSON-LD is irrelevant
  434. info['description'] = None
  435. return merge_dicts({
  436. 'id': video_id,
  437. 'uploader': video_uploader,
  438. 'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
  439. 'upload_date': upload_date,
  440. 'title': title,
  441. 'thumbnail': thumbnail,
  442. 'duration': duration,
  443. 'view_count': view_count,
  444. 'like_count': like_count,
  445. 'dislike_count': dislike_count,
  446. 'comment_count': comment_count,
  447. 'formats': formats,
  448. 'age_limit': 18,
  449. 'tags': extract_list('tags'),
  450. 'categories': extract_list('categories'),
  451. 'cast': extract_list('pornstars'),
  452. 'subtitles': subtitles,
  453. }, info)
  454. class PornHubPlaylistBaseIE(PornHubBaseIE):
  455. def _extract_page(self, url):
  456. return int_or_none(self._search_regex(
  457. r'\bpage=(\d+)', url, 'page', default=None))
  458. def _extract_entries(self, webpage, host):
  459. # Only process container div with main playlist content skipping
  460. # drop-down menu that uses similar pattern for videos (see
  461. # https://github.com/ytdl-org/youtube-dl/issues/11594).
  462. container = self._search_regex(
  463. r'(?s)(<div[^>]+class=["\']container.+)', webpage,
  464. 'container', default=webpage)
  465. return [
  466. self.url_result(
  467. 'http://www.%s/%s' % (host, video_url),
  468. PornHubIE.ie_key(), video_title=title)
  469. for video_url, title in orderedSet(re.findall(
  470. r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
  471. container))
  472. ]
  473. class PornHubUserIE(PornHubPlaylistBaseIE):
  474. _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
  475. _TESTS = [{
  476. 'url': 'https://www.pornhub.com/model/zoe_ph',
  477. 'playlist_mincount': 118,
  478. }, {
  479. 'url': 'https://www.pornhub.com/pornstar/liz-vicious',
  480. 'info_dict': {
  481. 'id': 'liz-vicious',
  482. },
  483. 'playlist_mincount': 118,
  484. }, {
  485. 'url': 'https://www.pornhub.com/users/russianveet69',
  486. 'only_matching': True,
  487. }, {
  488. 'url': 'https://www.pornhub.com/channels/povd',
  489. 'only_matching': True,
  490. }, {
  491. 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
  492. 'only_matching': True,
  493. }, {
  494. # Unavailable via /videos page, but available with direct pagination
  495. # on pornstar page (see [1]), requires premium
  496. # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
  497. 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
  498. 'only_matching': True,
  499. }, {
  500. # Same as before, multi page
  501. 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
  502. 'only_matching': True,
  503. }, {
  504. 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
  505. 'only_matching': True,
  506. }]
  507. def _real_extract(self, url):
  508. mobj = self._match_valid_url(url)
  509. user_id = mobj.group('id')
  510. videos_url = '%s/videos' % mobj.group('url')
  511. page = self._extract_page(url)
  512. if page:
  513. videos_url = update_url_query(videos_url, {'page': page})
  514. return self.url_result(
  515. videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
  516. class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
  517. @staticmethod
  518. def _has_more(webpage):
  519. return re.search(
  520. r'''(?x)
  521. <li[^>]+\bclass=["\']page_next|
  522. <link[^>]+\brel=["\']next|
  523. <button[^>]+\bid=["\']moreDataBtn
  524. ''', webpage) is not None
  525. def _entries(self, url, host, item_id):
  526. page = self._extract_page(url)
  527. VIDEOS = '/videos'
  528. def download_page(base_url, num, fallback=False):
  529. note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
  530. return self._download_webpage(
  531. base_url, item_id, note, query={'page': num})
  532. def is_404(e):
  533. return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
  534. base_url = url
  535. has_page = page is not None
  536. first_page = page if has_page else 1
  537. for page_num in (first_page, ) if has_page else itertools.count(first_page):
  538. try:
  539. try:
  540. webpage = download_page(base_url, page_num)
  541. except ExtractorError as e:
  542. # Some sources may not be available via /videos page,
  543. # trying to fallback to main page pagination (see [1])
  544. # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
  545. if is_404(e) and page_num == first_page and VIDEOS in base_url:
  546. base_url = base_url.replace(VIDEOS, '')
  547. webpage = download_page(base_url, page_num, fallback=True)
  548. else:
  549. raise
  550. except ExtractorError as e:
  551. if is_404(e) and page_num != first_page:
  552. break
  553. raise
  554. page_entries = self._extract_entries(webpage, host)
  555. if not page_entries:
  556. break
  557. for e in page_entries:
  558. yield e
  559. if not self._has_more(webpage):
  560. break
  561. def _real_extract(self, url):
  562. mobj = self._match_valid_url(url)
  563. host = mobj.group('host')
  564. item_id = mobj.group('id')
  565. self._login(host)
  566. return self.playlist_result(self._entries(url, host, item_id), item_id)
  567. class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
  568. _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
  569. _TESTS = [{
  570. 'url': 'https://www.pornhub.com/model/zoe_ph/videos',
  571. 'only_matching': True,
  572. }, {
  573. 'url': 'http://www.pornhub.com/users/rushandlia/videos',
  574. 'only_matching': True,
  575. }, {
  576. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
  577. 'info_dict': {
  578. 'id': 'pornstar/jenny-blighe/videos',
  579. },
  580. 'playlist_mincount': 149,
  581. }, {
  582. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
  583. 'info_dict': {
  584. 'id': 'pornstar/jenny-blighe/videos',
  585. },
  586. 'playlist_mincount': 40,
  587. }, {
  588. # default sorting as Top Rated Videos
  589. 'url': 'https://www.pornhub.com/channels/povd/videos',
  590. 'info_dict': {
  591. 'id': 'channels/povd/videos',
  592. },
  593. 'playlist_mincount': 293,
  594. }, {
  595. # Top Rated Videos
  596. 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
  597. 'only_matching': True,
  598. }, {
  599. # Most Recent Videos
  600. 'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
  601. 'only_matching': True,
  602. }, {
  603. # Most Viewed Videos
  604. 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
  605. 'only_matching': True,
  606. }, {
  607. 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
  608. 'only_matching': True,
  609. }, {
  610. # Most Viewed Videos
  611. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
  612. 'only_matching': True,
  613. }, {
  614. # Top Rated Videos
  615. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
  616. 'only_matching': True,
  617. }, {
  618. # Longest Videos
  619. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
  620. 'only_matching': True,
  621. }, {
  622. # Newest Videos
  623. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
  624. 'only_matching': True,
  625. }, {
  626. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
  627. 'only_matching': True,
  628. }, {
  629. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
  630. 'only_matching': True,
  631. }, {
  632. 'url': 'https://www.pornhub.com/video',
  633. 'only_matching': True,
  634. }, {
  635. 'url': 'https://www.pornhub.com/video?page=3',
  636. 'only_matching': True,
  637. }, {
  638. 'url': 'https://www.pornhub.com/video/search?search=123',
  639. 'only_matching': True,
  640. }, {
  641. 'url': 'https://www.pornhub.com/categories/teen',
  642. 'only_matching': True,
  643. }, {
  644. 'url': 'https://www.pornhub.com/categories/teen?page=3',
  645. 'only_matching': True,
  646. }, {
  647. 'url': 'https://www.pornhub.com/hd',
  648. 'only_matching': True,
  649. }, {
  650. 'url': 'https://www.pornhub.com/hd?page=3',
  651. 'only_matching': True,
  652. }, {
  653. 'url': 'https://www.pornhub.com/described-video',
  654. 'only_matching': True,
  655. }, {
  656. 'url': 'https://www.pornhub.com/described-video?page=2',
  657. 'only_matching': True,
  658. }, {
  659. 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
  660. 'only_matching': True,
  661. }, {
  662. 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
  663. 'only_matching': True,
  664. }]
  665. @classmethod
  666. def suitable(cls, url):
  667. return (False
  668. if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
  669. else super(PornHubPagedVideoListIE, cls).suitable(url))
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
    # Extractor for the ".../videos/upload" listing of a user, channel,
    # model or pornstar page. It defines only URL matching and tests; all
    # behaviour is inherited from PornHubPagedPlaylistBaseIE.
    # The playlist id is the profile name captured by the ``id`` group.
    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
    _TESTS = [{
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
        'info_dict': {
            'id': 'jenny-blighe',
        },
        'playlist_mincount': 129,
    }, {
        'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
        'only_matching': True,
    }, {
        # .onion mirror
        'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
        'only_matching': True,
    }]
  685. class PornHubPlaylistIE(PornHubPlaylistBaseIE):
  686. _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
  687. _TESTS = [{
  688. 'url': 'https://www.pornhub.com/playlist/44121572',
  689. 'info_dict': {
  690. 'id': '44121572',
  691. },
  692. 'playlist_count': 77,
  693. }, {
  694. 'url': 'https://www.pornhub.com/playlist/4667351',
  695. 'only_matching': True,
  696. }, {
  697. 'url': 'https://de.pornhub.com/playlist/4667351',
  698. 'only_matching': True,
  699. }, {
  700. 'url': 'https://de.pornhub.com/playlist/4667351?page=2',
  701. 'only_matching': True,
  702. }]
  703. def _entries(self, url, host, item_id):
  704. webpage = self._download_webpage(url, item_id, 'Downloading page 1')
  705. playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
  706. video_count = int_or_none(
  707. self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
  708. token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
  709. page_count = math.ceil((video_count - 36) / 40.) + 1
  710. page_entries = self._extract_entries(webpage, host)
  711. def download_page(page_num):
  712. note = 'Downloading page {}'.format(page_num)
  713. page_url = 'https://www.{}/playlist/viewChunked'.format(host)
  714. return self._download_webpage(page_url, item_id, note, query={
  715. 'id': playlist_id,
  716. 'page': page_num,
  717. 'token': token,
  718. })
  719. for page_num in range(1, page_count + 1):
  720. if page_num > 1:
  721. webpage = download_page(page_num)
  722. page_entries = self._extract_entries(webpage, host)
  723. if not page_entries:
  724. break
  725. for e in page_entries:
  726. yield e
  727. def _real_extract(self, url):
  728. mobj = self._match_valid_url(url)
  729. host = mobj.group('host')
  730. item_id = mobj.group('id')
  731. self._login(host)
  732. return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)