cracked.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. import re
  2. from .common import InfoExtractor
  3. from .youtube import YoutubeIE
  4. from ..utils import (
  5. parse_iso8601,
  6. str_to_int,
  7. )
  8. class CrackedIE(InfoExtractor):
  9. _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
  10. _TESTS = [{
  11. 'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
  12. 'md5': '89b90b9824e3806ca95072c4d78f13f7',
  13. 'info_dict': {
  14. 'id': '19070',
  15. 'ext': 'mp4',
  16. 'title': 'If Animal Actors Got E! True Hollywood Stories',
  17. 'timestamp': 1404954000,
  18. 'upload_date': '20140710',
  19. }
  20. }, {
  21. # youtube embed
  22. 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
  23. 'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
  24. 'info_dict': {
  25. 'id': 'EjI00A3rZD0',
  26. 'ext': 'mp4',
  27. 'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
  28. 'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
  29. 'upload_date': '20140725',
  30. 'uploader_id': 'Cracked',
  31. 'uploader': 'Cracked',
  32. }
  33. }]
  34. def _real_extract(self, url):
  35. video_id = self._match_id(url)
  36. webpage = self._download_webpage(url, video_id)
  37. youtube_url = YoutubeIE._extract_url(webpage)
  38. if youtube_url:
  39. return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
  40. video_url = self._html_search_regex(
  41. [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
  42. webpage, 'video URL')
  43. title = self._search_regex(
  44. [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
  45. webpage, 'title')
  46. description = self._search_regex(
  47. r'name="?(?:og:)?description"?\s+content="([^"]+)"',
  48. webpage, 'description', default=None)
  49. timestamp = self._html_search_regex(
  50. r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
  51. if timestamp:
  52. timestamp = parse_iso8601(timestamp[:-6])
  53. view_count = str_to_int(self._html_search_regex(
  54. r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
  55. webpage, 'view count', fatal=False))
  56. comment_count = str_to_int(self._html_search_regex(
  57. r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
  58. webpage, 'comment count', fatal=False))
  59. m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
  60. if m:
  61. width = int(m.group('width'))
  62. height = int(m.group('height'))
  63. else:
  64. width = height = None
  65. return {
  66. 'id': video_id,
  67. 'url': video_url,
  68. 'title': title,
  69. 'description': description,
  70. 'timestamp': timestamp,
  71. 'view_count': view_count,
  72. 'comment_count': comment_count,
  73. 'height': height,
  74. 'width': width,
  75. }