# linuxacademy.py
import json
import random

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


  19. class LinuxAcademyIE(InfoExtractor):
  20. _VALID_URL = r'''(?x)
  21. https?://
  22. (?:www\.)?linuxacademy\.com/cp/
  23. (?:
  24. courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
  25. modules/view/id/(?P<course_id>\d+)
  26. )
  27. '''
  28. _TESTS = [{
  29. 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
  30. 'info_dict': {
  31. 'id': '7971-2',
  32. 'ext': 'mp4',
  33. 'title': 'What Is Data Science',
  34. 'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
  35. 'timestamp': int, # The timestamp and upload date changes
  36. 'upload_date': r're:\d+',
  37. 'duration': 304,
  38. },
  39. 'params': {
  40. 'skip_download': True,
  41. },
  42. 'skip': 'Requires Linux Academy account credentials',
  43. }, {
  44. 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
  45. 'only_matching': True,
  46. }, {
  47. 'url': 'https://linuxacademy.com/cp/modules/view/id/154',
  48. 'info_dict': {
  49. 'id': '154',
  50. 'title': 'AWS Certified Cloud Practitioner',
  51. 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
  52. 'duration': 28835,
  53. },
  54. 'playlist_count': 41,
  55. 'skip': 'Requires Linux Academy account credentials',
  56. }, {
  57. 'url': 'https://linuxacademy.com/cp/modules/view/id/39',
  58. 'info_dict': {
  59. 'id': '39',
  60. 'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
  61. 'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
  62. 'duration': 89280,
  63. },
  64. 'playlist_count': 73,
  65. 'skip': 'Requires Linux Academy account credentials',
  66. }]
  67. _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
  68. _ORIGIN_URL = 'https://linuxacademy.com'
  69. _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
  70. _NETRC_MACHINE = 'linuxacademy'
  71. def _perform_login(self, username, password):
  72. def random_string():
  73. return ''.join([
  74. random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
  75. for _ in range(32)])
  76. webpage, urlh = self._download_webpage_handle(
  77. self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
  78. 'client_id': self._CLIENT_ID,
  79. 'response_type': 'token id_token',
  80. 'response_mode': 'web_message',
  81. 'redirect_uri': self._ORIGIN_URL,
  82. 'scope': 'openid email user_impersonation profile',
  83. 'audience': self._ORIGIN_URL,
  84. 'state': random_string(),
  85. 'nonce': random_string(),
  86. })
  87. login_data = self._parse_json(
  88. self._search_regex(
  89. r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
  90. 'login info', group='value'), None,
  91. transform_source=lambda x: compat_b64decode(x).decode('utf-8')
  92. )['extraParams']
  93. login_data.update({
  94. 'client_id': self._CLIENT_ID,
  95. 'redirect_uri': self._ORIGIN_URL,
  96. 'tenant': 'lacausers',
  97. 'connection': 'Username-Password-ACG-Proxy',
  98. 'username': username,
  99. 'password': password,
  100. 'sso': 'true',
  101. })
  102. login_state_url = urlh.geturl()
  103. try:
  104. login_page = self._download_webpage(
  105. 'https://login.linuxacademy.com/usernamepassword/login', None,
  106. 'Downloading login page', data=json.dumps(login_data).encode(),
  107. headers={
  108. 'Content-Type': 'application/json',
  109. 'Origin': 'https://login.linuxacademy.com',
  110. 'Referer': login_state_url,
  111. })
  112. except ExtractorError as e:
  113. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  114. error = self._parse_json(e.cause.read(), None)
  115. message = error.get('description') or error['code']
  116. raise ExtractorError(
  117. '%s said: %s' % (self.IE_NAME, message), expected=True)
  118. raise
  119. callback_page, urlh = self._download_webpage_handle(
  120. 'https://login.linuxacademy.com/login/callback', None,
  121. 'Downloading callback page',
  122. data=urlencode_postdata(self._hidden_inputs(login_page)),
  123. headers={
  124. 'Content-Type': 'application/x-www-form-urlencoded',
  125. 'Origin': 'https://login.linuxacademy.com',
  126. 'Referer': login_state_url,
  127. })
  128. access_token = self._search_regex(
  129. r'access_token=([^=&]+)', urlh.geturl(),
  130. 'access token', default=None)
  131. if not access_token:
  132. access_token = self._parse_json(
  133. self._search_regex(
  134. r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
  135. 'authorization response'), None,
  136. transform_source=js_to_json)['response']['access_token']
  137. self._download_webpage(
  138. 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
  139. % access_token, None, 'Downloading token validation page')
  140. def _real_extract(self, url):
  141. mobj = self._match_valid_url(url)
  142. chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
  143. item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
  144. webpage = self._download_webpage(url, item_id)
  145. # course path
  146. if course_id:
  147. module = self._parse_json(
  148. self._search_regex(
  149. r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
  150. item_id)
  151. entries = []
  152. chapter_number = None
  153. chapter = None
  154. chapter_id = None
  155. for item in module['items']:
  156. if not isinstance(item, dict):
  157. continue
  158. def type_field(key):
  159. return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
  160. type_fields = (type_field('name'), type_field('slug'))
  161. # Move to next module section
  162. if 'section' in type_fields:
  163. chapter = item.get('course_name')
  164. chapter_id = item.get('course_module')
  165. chapter_number = 1 if not chapter_number else chapter_number + 1
  166. continue
  167. # Skip non-lessons
  168. if 'lesson' not in type_fields:
  169. continue
  170. lesson_url = urljoin(url, item.get('url'))
  171. if not lesson_url:
  172. continue
  173. title = item.get('title') or item.get('lesson_name')
  174. description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
  175. entries.append({
  176. '_type': 'url_transparent',
  177. 'url': lesson_url,
  178. 'ie_key': LinuxAcademyIE.ie_key(),
  179. 'title': title,
  180. 'description': description,
  181. 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
  182. 'duration': parse_duration(item.get('duration')),
  183. 'chapter': chapter,
  184. 'chapter_id': chapter_id,
  185. 'chapter_number': chapter_number,
  186. })
  187. return {
  188. '_type': 'playlist',
  189. 'entries': entries,
  190. 'id': course_id,
  191. 'title': module.get('title'),
  192. 'description': module.get('md_desc') or clean_html(module.get('desc')),
  193. 'duration': parse_duration(module.get('duration')),
  194. }
  195. # single video path
  196. m3u8_url = self._parse_json(
  197. self._search_regex(
  198. r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
  199. item_id)[0]['file']
  200. formats = self._extract_m3u8_formats(
  201. m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
  202. m3u8_id='hls')
  203. info = {
  204. 'id': item_id,
  205. 'formats': formats,
  206. }
  207. lesson = self._parse_json(
  208. self._search_regex(
  209. (r'window\.lesson\s*=\s*({.+?})\s*;',
  210. r'player\.lesson\s*=\s*({.+?})\s*;'),
  211. webpage, 'lesson', default='{}'), item_id, fatal=False)
  212. if lesson:
  213. info.update({
  214. 'title': lesson.get('lesson_name'),
  215. 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
  216. 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
  217. 'duration': parse_duration(lesson.get('duration')),
  218. })
  219. if not info.get('title'):
  220. info['title'] = self._search_regex(
  221. (r'>Lecture\s*:\s*(?P<value>[^<]+)',
  222. r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
  223. 'title', group='value')
  224. return info