# camdemy.py
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode,
    compat_urlparse,
)
from ..utils import (
    ExtractorError,
    clean_html,
    parse_duration,
    str_to_int,
    unified_strdate,
)
  13. class CamdemyIE(InfoExtractor):
  14. _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
  15. _TESTS = [{
  16. # single file
  17. 'url': 'http://www.camdemy.com/media/5181/',
  18. 'md5': '5a5562b6a98b37873119102e052e311b',
  19. 'info_dict': {
  20. 'id': '5181',
  21. 'ext': 'mp4',
  22. 'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
  23. 'thumbnail': r're:^https?://.*\.jpg$',
  24. 'creator': 'ss11spring',
  25. 'duration': 1591,
  26. 'upload_date': '20130114',
  27. 'view_count': int,
  28. }
  29. }, {
  30. # With non-empty description
  31. # webpage returns "No permission or not login"
  32. 'url': 'http://www.camdemy.com/media/13885',
  33. 'md5': '4576a3bb2581f86c61044822adbd1249',
  34. 'info_dict': {
  35. 'id': '13885',
  36. 'ext': 'mp4',
  37. 'title': 'EverCam + Camdemy QuickStart',
  38. 'thumbnail': r're:^https?://.*\.jpg$',
  39. 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
  40. 'creator': 'evercam',
  41. 'duration': 318,
  42. }
  43. }, {
  44. # External source (YouTube)
  45. 'url': 'http://www.camdemy.com/media/14842',
  46. 'info_dict': {
  47. 'id': '2vsYQzNIsJo',
  48. 'ext': 'mp4',
  49. 'title': 'Excel 2013 Tutorial - How to add Password Protection',
  50. 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
  51. 'upload_date': '20130211',
  52. 'uploader': 'Hun Kim',
  53. 'uploader_id': 'hunkimtutorials',
  54. },
  55. 'params': {
  56. 'skip_download': True,
  57. },
  58. }]
  59. def _real_extract(self, url):
  60. video_id = self._match_id(url)
  61. webpage = self._download_webpage(url, video_id)
  62. src_from = self._html_search_regex(
  63. r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
  64. webpage, 'external source', default=None, group='url')
  65. if src_from:
  66. return self.url_result(src_from)
  67. oembed_obj = self._download_json(
  68. 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
  69. title = oembed_obj['title']
  70. thumb_url = oembed_obj['thumbnail_url']
  71. video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
  72. file_list_doc = self._download_xml(
  73. compat_urlparse.urljoin(video_folder, 'fileList.xml'),
  74. video_id, 'Downloading filelist XML')
  75. file_name = file_list_doc.find('./video/item/fileName').text
  76. video_url = compat_urlparse.urljoin(video_folder, file_name)
  77. # Some URLs return "No permission or not login" in a webpage despite being
  78. # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
  79. upload_date = unified_strdate(self._search_regex(
  80. r'>published on ([^<]+)<', webpage,
  81. 'upload date', default=None))
  82. view_count = str_to_int(self._search_regex(
  83. r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
  84. webpage, 'view count', default=None))
  85. description = self._html_search_meta(
  86. 'description', webpage, default=None) or clean_html(
  87. oembed_obj.get('description'))
  88. return {
  89. 'id': video_id,
  90. 'url': video_url,
  91. 'title': title,
  92. 'thumbnail': thumb_url,
  93. 'description': description,
  94. 'creator': oembed_obj.get('author_name'),
  95. 'duration': parse_duration(oembed_obj.get('duration')),
  96. 'upload_date': upload_date,
  97. 'view_count': view_count,
  98. }
  99. class CamdemyFolderIE(InfoExtractor):
  100. _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
  101. _TESTS = [{
  102. # links with trailing slash
  103. 'url': 'http://www.camdemy.com/folder/450',
  104. 'info_dict': {
  105. 'id': '450',
  106. 'title': '信號與系統 2012 & 2011 (Signals and Systems)',
  107. },
  108. 'playlist_mincount': 145
  109. }, {
  110. # links without trailing slash
  111. # and multi-page
  112. 'url': 'http://www.camdemy.com/folder/853',
  113. 'info_dict': {
  114. 'id': '853',
  115. 'title': '科學計算 - 使用 Matlab'
  116. },
  117. 'playlist_mincount': 20
  118. }, {
  119. # with displayMode parameter. For testing the codes to add parameters
  120. 'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
  121. 'info_dict': {
  122. 'id': '853',
  123. 'title': '科學計算 - 使用 Matlab'
  124. },
  125. 'playlist_mincount': 20
  126. }]
  127. def _real_extract(self, url):
  128. folder_id = self._match_id(url)
  129. # Add displayMode=list so that all links are displayed in a single page
  130. parsed_url = list(compat_urlparse.urlparse(url))
  131. query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
  132. query.update({'displayMode': 'list'})
  133. parsed_url[4] = compat_urllib_parse_urlencode(query)
  134. final_url = compat_urlparse.urlunparse(parsed_url)
  135. page = self._download_webpage(final_url, folder_id)
  136. matches = re.findall(r"href='(/media/\d+/?)'", page)
  137. entries = [self.url_result('http://www.camdemy.com' + media_path)
  138. for media_path in matches]
  139. folder_title = self._html_search_meta('keywords', page)
  140. return self.playlist_result(entries, folder_id, folder_title)