优酷筛选.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # File : 优酷筛选.py
  4. # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
  5. # Date : 2022/9/23
  6. import json
  7. import re
  8. import requests
  9. from pprint import pprint
  10. # cates = 'teleplay&film&cartoon&tvshow&documentary'.split('&')
  # Browser-like headers sent with every request to the category-data endpoint.
  # Further headers (x-requested-with, sec-fetch-site, sec-fetch-mode and a
  # full category-page referer) are left disabled in the original script.
  headers1 = {
      'user-agent': 'Mozilla/5.0 (Linux; Android 11; M2007J3SC Build/RKQ1.200826.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/77.0.3865.120 MQQBrowser/6.2 TBS/045714 Mobile Safari/537.36',
      'referer': 'https://www.youku.com',
  }

  # Fetch page 1 of the category data; the percent-encoded ``params`` value
  # decodes to {"type":"电影"}.  The timeout keeps a stalled connection from
  # hanging the script forever (requests applies no timeout by default).
  r = requests.get(
      'https://www.youku.com/category/data?params=%7B%22type%22%3A%22%E7%94%B5%E5%BD%B1%22%7D&optionRefresh=1&pageNo=1',
      headers=headers1,
      timeout=10,
  )
  html = r.json()

  # The first filter group's options are used as the list of category titles.
  cates_data = html['data']['filterData']['filter']['filterData'][0]['subFilter']
  cates_data = [option['title'] for option in cates_data]
  print(cates_data)

  # NOTE(review): the script deliberately stops here, exactly as the original
  # ``exit()`` did, so every statement below this line (and the per-category
  # crawl further down the file) is currently unreachable.  Remove this line to
  # run the full crawl.  ``raise SystemExit`` is used instead of ``exit()``
  # because ``exit`` is an interactive helper injected by the ``site`` module.
  raise SystemExit

  # One category-data URL per category title (page 1 only).
  cates = cates_data
  urls = [
      'https://www.youku.com/category/data?params={"type":"' + cate + '"}&optionRefresh=1&pageNo=1'
      for cate in cates
  ]
  print(urls)

  # Headers used by getHtml: same user agent as headers1, but no referer.
  headers = {'user-agent': headers1['user-agent']}

  # Accumulator filled by getOne: category key -> list of filter groups.
  ft_dict = {}
  def getHtml(url):
      """Fetch *url* and parse the ``window.__INITIAL_DATA__`` object in the page.

      The page text is searched for the ``window.__INITIAL_DATA__`` assignment;
      everything from its first ``{`` up to the next ``;`` is evaluated as a
      Python literal, with the JavaScript names ``undefined``, ``null``,
      ``false`` and ``true`` mapped to their Python counterparts.  The parsed
      object is printed together with its type and also returned.

      Args:
          url: Address of the page to download (sent with the module-level
              ``headers``).

      Returns:
          The evaluated object.

      Raises:
          ValueError: If the page contains no ``window.__INITIAL_DATA__``
              assignment.
      """
      # requests has no default timeout; without one a stalled server would
      # block this call forever.
      r = requests.get(url, headers=headers, timeout=10)

      found = re.search(r'window\.__INITIAL_DATA__.*?{(.*?);', r.text, re.S | re.M)
      if found is None:
          raise ValueError('window.__INITIAL_DATA__ not found in ' + url)
      # NOTE(review): the capture stops at the first ';', so a payload with a
      # semicolon inside a string value would be truncated.
      html = '{' + found.group(1)

      # JavaScript literals that have no Python spelling.
      js_names = {'undefined': None, 'null': None, 'false': False, 'true': True}

      # SECURITY: this evaluates text downloaded from the network.  Builtins are
      # withheld and only the names above are visible, which narrows what the
      # expression can reach, but eval() is not a sandbox; only use this on a
      # trusted endpoint.
      html = eval(html, {'__builtins__': {}}, js_names)
      print(type(html), html)
      return html
  # Sample category-data URL whose ``params`` value decodes to
  # {"type":"电视剧","tags":"青春"}.  Nothing in the visible part of this file
  # references it.
  url1 = (
      'https://www.youku.com/category/data'
      '?params=%7B%22type%22%3A%22%E7%94%B5%E8%A7%86%E5%89%A7%22%2C%22tags%22%3A%22%E9%9D%92%E6%98%A5%22%7D'
      '&optionRefresh=1&pageNo=1'
  )
  def getOne(url):
      """Fetch one category-data URL and record its filter groups in ``ft_dict``.

      The JSON response is expected to carry a list at
      ``data.filterData.filter.filterData`` and a category key at
      ``data.filterData.cateKey``.  The first list entry is skipped (the
      top-level script uses it as the category list); every remaining entry
      becomes one ``{'key', 'name', 'value'}`` dict, where ``name`` is the title
      of the group's first option and ``value`` lists all options as
      ``{'n': title, 'v': value}``.  Any previous entry for the same category
      key is replaced.

      Args:
          url: Category-data URL to request (sent with ``headers1``).

      Returns:
          The module-level ``ft_dict`` after it has been updated.
      """
      # requests has no default timeout; without one a stalled server would
      # block the whole crawl.
      r = requests.get(url, headers=headers1, timeout=10)
      print(r.text)
      html = r.json()

      filters = html['data']['filterData']['filter']['filterData'][1:]
      cate_id = html['data']['filterData']['cateKey']
      groups = ft_dict[cate_id] = []

      for ft in filters:
          options = ft['subFilter']
          if not options:
              # A group without options has no first title to use as its name;
              # skip it instead of failing with IndexError.
              continue
          groups.append({
              'key': ft['filterType'],
              'name': options[0]['title'],
              'value': [
                  {"n": option['title'], "v": option.get('value', '')}
                  for option in options
              ],
          })
      return ft_dict
  60. # print(ft_dict)
  # Crawl every category URL; getOne stores each category's filter groups in
  # ft_dict as a side effect, so its return value is not needed here.
  for url in urls:
      getOne(url)

  # NOTE(review): the pasted source lost its indentation; the two prints are
  # assumed to run once, after the loop has finished.
  # Show the result both as a Python repr and as JSON with non-ASCII text kept
  # readable.
  print(ft_dict)
  print(json.dumps(ft_dict, ensure_ascii=False))