weibo.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. from .common import InfoExtractor
  2. import json
  3. import random
  4. import re
  5. from ..compat import (
  6. compat_parse_qs,
  7. compat_str,
  8. )
  9. from ..utils import (
  10. js_to_json,
  11. strip_jsonp,
  12. urlencode_postdata,
  13. )
  14. class WeiboIE(InfoExtractor):
  15. _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
  16. _TEST = {
  17. 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
  18. 'info_dict': {
  19. 'id': 'Fp6RGfbff',
  20. 'ext': 'mp4',
  21. 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
  22. }
  23. }
  24. def _real_extract(self, url):
  25. video_id = self._match_id(url)
  26. # to get Referer url for genvisitor
  27. webpage, urlh = self._download_webpage_handle(url, video_id)
  28. visitor_url = urlh.geturl()
  29. if 'passport.weibo.com' in visitor_url:
  30. # first visit
  31. visitor_data = self._download_json(
  32. 'https://passport.weibo.com/visitor/genvisitor', video_id,
  33. note='Generating first-visit data',
  34. transform_source=strip_jsonp,
  35. headers={'Referer': visitor_url},
  36. data=urlencode_postdata({
  37. 'cb': 'gen_callback',
  38. 'fp': json.dumps({
  39. 'os': '2',
  40. 'browser': 'Gecko57,0,0,0',
  41. 'fonts': 'undefined',
  42. 'screenInfo': '1440*900*24',
  43. 'plugins': '',
  44. }),
  45. }))
  46. tid = visitor_data['data']['tid']
  47. cnfd = '%03d' % visitor_data['data']['confidence']
  48. self._download_webpage(
  49. 'https://passport.weibo.com/visitor/visitor', video_id,
  50. note='Running first-visit callback',
  51. query={
  52. 'a': 'incarnate',
  53. 't': tid,
  54. 'w': 2,
  55. 'c': cnfd,
  56. 'cb': 'cross_domain',
  57. 'from': 'weibo',
  58. '_rand': random.random(),
  59. })
  60. webpage = self._download_webpage(
  61. url, video_id, note='Revisiting webpage')
  62. title = self._html_extract_title(webpage)
  63. video_formats = compat_parse_qs(self._search_regex(
  64. r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
  65. formats = []
  66. supported_resolutions = (480, 720)
  67. for res in supported_resolutions:
  68. vid_urls = video_formats.get(compat_str(res))
  69. if not vid_urls or not isinstance(vid_urls, list):
  70. continue
  71. vid_url = vid_urls[0]
  72. formats.append({
  73. 'url': vid_url,
  74. 'height': res,
  75. })
  76. uploader = self._og_search_property(
  77. 'nick-name', webpage, 'uploader', default=None)
  78. return {
  79. 'id': video_id,
  80. 'title': title,
  81. 'uploader': uploader,
  82. 'formats': formats
  83. }
  84. class WeiboMobileIE(InfoExtractor):
  85. _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?'
  86. _TEST = {
  87. 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
  88. 'info_dict': {
  89. 'id': '4189191225395228',
  90. 'ext': 'mp4',
  91. 'title': '午睡当然是要甜甜蜜蜜的啦',
  92. 'uploader': '柴犬柴犬'
  93. }
  94. }
  95. def _real_extract(self, url):
  96. video_id = self._match_id(url)
  97. # to get Referer url for genvisitor
  98. webpage = self._download_webpage(url, video_id, note='visit the page')
  99. weibo_info = self._parse_json(self._search_regex(
  100. r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
  101. webpage, 'js_code', flags=re.DOTALL),
  102. video_id, transform_source=js_to_json)
  103. status_data = weibo_info.get('status', {})
  104. page_info = status_data.get('page_info')
  105. title = status_data['status_title']
  106. uploader = status_data.get('user', {}).get('screen_name')
  107. return {
  108. 'id': video_id,
  109. 'title': title,
  110. 'uploader': uploader,
  111. 'url': page_info['media_info']['stream_url']
  112. }