cctv_spider.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880
  1. # coding=utf-8
  2. # !/usr/bin/python
  3. import os.path
  4. import sys
  5. sys.path.append('..')
  6. try:
  7. from base.spider import Spider as BaseSpider
  8. except ImportError:
  9. from t4.base.spider import BaseSpider
  10. import json
  11. import time
  12. import base64
  13. import re
  14. from urllib import request, parse
  15. from pathlib import Path
  16. import urllib
  17. import urllib.request
  18. import time
  19. """
  20. 配置示例:
  21. t4的配置里ext节点会自动变成api对应query参数extend,但t4的ext字符串不支持路径格式,比如./开头或者.json结尾
  22. api里会自动含有ext参数是base64编码后的选中的筛选条件
  23. 错误示例,ext含有json:
  24. {
  25. "key":"hipy_cctv",
  26. "name":"hipy_cctv",
  27. "type":4,
  28. "api":"http://192.168.31.49:5707/api/v1/vod/cctv_spider?api_ext={{host}}/txt/hipy/cctv_spider.json",
  29. "searchable":1,
  30. "quickSearch":1,
  31. "filterable":1,
  32. "ext":"cctv_spider.json"
  33. }
  34. 正确示例。同时存在ext和api_ext会优先取ext作为extend加载init
  35. {
  36. "key":"hipy_cctv",
  37. "name":"hipy_cctv",
  38. "type":4,
  39. "api":"http://192.168.31.49:5707/api/v1/vod/cctv_spider?api_ext={{host}}/txt/hipy/cctv_spider.json",
  40. "searchable":1,
  41. "quickSearch":1,
  42. "filterable":1,
  43. "ext":"cctv_spider"
  44. }
  45. {
  46. "key": "t3_hipy_cctv",
  47. "name": "t3_hipy_cctv",
  48. "type": 3,
  49. "api": "{{host}}/txt/hipy/cctv_spider.py",
  50. "searchable": 1,
  51. "quickSearch": 1,
  52. "filterable": 1,
  53. "ext": "{{host}}/txt/hipy/cctv_spider.json"
  54. }
  55. """
  56. class Spider(BaseSpider): # 元类 默认的元类 type
  def getName(self):
      """Return the display name of this spider."""
      spider_name = "中央电视台"  # searchable
      return spider_name
  def init_api_ext_file(self):
      """Scrape the CCTV filter pages and store the options as a JSON file.

      The "特别节目" and "纪录片" index pages are fetched with ``self.fetch``
      and parsed with ``self.html``; every ``<li>`` of the filter lists
      becomes one ``{'n': label, 'v': value}`` option. The result is written
      next to this source file, using the same base name with a ``.json``
      extension.
      """
      # Swap only the extension: str.replace('.py', '.json') would also
      # rewrite any other '.py' that occurs in the path (e.g. '.pyenv').
      ext_file = os.path.splitext(__file__)[0] + '.json'
      print(f'ext_file:{ext_file}')

      def scrape(page_url, groups):
          """Build one filter entry per (list id, li attribute, key, name)."""
          doc = self.html(self.fetch(page_url).text)
          entries = []
          for list_id, attr, key, name in groups:
              options = [
                  {
                      'n': ''.join(li.xpath('./span//text()')),
                      'v': ''.join(li.xpath('@' + attr)),
                  }
                  for li in doc.xpath(f'//*[@id="{list_id}"]/li')
              ]
              entries.append({"key": key, "name": name, "value": options})
          return entries

      # Special-programme page: https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65
      # Its filter options are read from: https://tv.cctv.com/yxg/tbjm/index.shtml
      filter_tbjm = scrape('https://tv.cctv.com/yxg/tbjm/index.shtml', (
          ('pindao', 'datacd', "datapd-channel", "频道"),
          ('fenlei', 'datalx', "datafl-sc", "类型"),
          ('zimu', 'datazm', "dataszm-letter", "首字母"),
      ))
      print(filter_tbjm)

      # Documentary page: https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65
      # Its filter options are read from: https://tv.cctv.com/yxg/jlp/index.shtml
      filter_jlp = scrape('https://tv.cctv.com/yxg/jlp/index.shtml', (
          ('pindao', 'datacd', "datapd-channel", "频道"),
          ('fenlei', 'datalx', "datafl-sc", "类型"),
          ('nianfen', 'datanf', "datanf-year", "年份"),
          ('zimu', 'datazm', "dataszm-letter", "首字母"),
      ))
      print(filter_jlp)

      ext_file_dict = {
          "特别节目": filter_tbjm,
          "纪录片": filter_jlp,
      }
      with open(ext_file, mode='w', encoding='utf-8') as f:
          f.write(json.dumps(ext_file_dict, ensure_ascii=False))
  179. def init(self, extend=""):
  180. def init_file(ext_file):
  181. ext_file = Path(ext_file).as_posix()
  182. # print(f'ext_file:{ext_file}')
  183. if os.path.exists(ext_file):
  184. # print('存在扩展文件')
  185. with open(ext_file, mode='r', encoding='utf-8') as f:
  186. try:
  187. ext_dict = json.loads(f.read())
  188. # print(ext_dict)
  189. self.config['filter'].update(ext_dict)
  190. except Exception as e:
  191. print(f'更新扩展筛选条件发生错误:{e}')
  192. print("============{0}============".format(extend))
  193. if extend.startswith('./'):
  194. ext_file = os.path.join(os.path.dirname(__file__), extend)
  195. init_file(ext_file)
  196. elif extend.startswith('http'):
  197. try:
  198. r = self.fetch(extend)
  199. self.config['filter'].update(r.json())
  200. except Exception as e:
  201. print(f'更新扩展筛选条件发生错误:{e}')
  202. elif extend and not extend.startswith('./') and not extend.startswith('http'):
  203. ext_file = os.path.join(os.path.dirname(__file__), './' + extend + '.json')
  204. init_file(ext_file)
  def isVideoFormat(self, url):
      """Placeholder hook: no video-format detection is implemented."""
      return None
  def manualVideoCheck(self):
      """Placeholder hook: no manual video check is implemented."""
      return None
  209. def homeContent(self, filter):
  210. result = {}
  211. cateManual = {
  212. "栏目大全": "栏目大全",
  213. "特别节目": "特别节目",
  214. "纪录片": "纪录片",
  215. "电视剧": "电视剧",
  216. "动画片": "动画片"
  217. }
  218. classes = []
  219. for k in cateManual:
  220. classes.append({
  221. 'type_name': k,
  222. 'type_id': cateManual[k]
  223. })
  224. result['class'] = classes
  225. if (filter):
  226. result['filters'] = self.config['filter']
  227. return result
  def homeVideoContent(self):
      """Return the home-page video list, which is always empty."""
      return {'list': []}
  def categoryContent(self, tid, pg, filter, extend):
      """Return one page of entries for the category ``tid``.

      Args:
          tid: Category id, one of the names offered by ``homeContent``.
          pg: Page number; inserted into the request URL as given.
          filter: Not used by this method.
          extend: Mapping of selected filter keys (e.g. ``'datafl-sc'``,
              ``'datanf-year'``, ``'cid'``) to their values. May be empty
              or ``None``.

      Returns:
          dict with the keys ``list``, ``page``, ``pagecount``, ``limit``
          and ``total``. ``pagecount`` is 9999 while a full page came back,
          otherwise the current page number.
      """
      extend = extend or {}

      def chosen(key):
          """Return the URL-quoted value selected for ``key`` ('' if unset)."""
          value = extend.get(key)
          # Every value is quoted: non-ASCII text in the URL makes urllib
          # fail, and quoting leaves plain ASCII ids/years/letters unchanged.
          return urllib.parse.quote(str(value)) if value else ''

      fc = urllib.parse.quote(str(tid))
      pagecount = 24
      if tid == '动画片':
          url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955899450127&area={0}&sc={4}&fc={1}&letter={2}&p={3}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
              chosen('datadq-area'), fc, chosen('dataszm-letter'), pg, chosen('datafl-sc'))
      elif tid == '纪录片':
          url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955924871139&fc={0}&channel={1}&sc={2}&year={3}&letter={4}&p={5}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
              fc, chosen('datapd-channel'), chosen('datafl-sc'), chosen('datanf-year'),
              chosen('dataszm-letter'), pg)
      elif tid == '电视剧':
          # The region filter is offered for this category in ``config`` but
          # was never read here, so the request always sent an empty ``area``.
          url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955853485115&area={0}&sc={1}&fc={2}&year={3}&letter={4}&p={5}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
              chosen('datadq-area'), chosen('datafl-sc'), fc, chosen('datanf-year'),
              chosen('dataszm-letter'), pg)
      elif tid == '特别节目':
          url = 'https://api.cntv.cn/list/getVideoAlbumList?channelid=CHAL1460955953877151&channel={0}&sc={1}&fc={2}&bigday=&letter={3}&p={4}&n=24&serviceId=tvcctv&topv=1&t=json'.format(
              chosen('datapd-channel'), chosen('datafl-sc'), fc, chosen('dataszm-letter'), pg)
      elif tid == '栏目大全':
          # fl = letter, fc = genre, cid = channel; this API returns 20 per page.
          url = 'https://api.cntv.cn/lanmu/columnSearch?&fl={0}&fc={1}&cid={2}&p={3}&n=20&serviceId=tvcctv&t=json&cb=ko'.format(
              chosen('fl'), chosen('fc'), chosen('cid'), pg)
          pagecount = 20
      else:
          url = 'https://tv.cctv.com/epg/index.shtml'

      videos = []
      htmlText = self.webReadFile(urlStr=url, header=self.header)
      if tid == '栏目大全':
          # The response is JSONP ("ko(...);"): drop the 3-character "ko("
          # prefix and everything from the closing ");".
          index = htmlText.rfind(');')
          if index > -1:
              htmlText = htmlText[3:index]
              videos = self.get_list1(html=htmlText, tid=tid)
      else:
          videos = self.get_list(html=htmlText, tid=tid)

      return {
          'list': videos,
          'page': pg,
          'pagecount': 9999 if len(videos) >= pagecount else pg,
          'limit': 90,
          'total': 999999,
      }
  def detailContent(self, array):
      """Return the detail record (metadata and episode list) for one item.

      Args:
          array: List whose first element is the item id, laid out as
              ``tid||title||lastVideo||logo||id||year[||actors[||brief]]``.

      Returns:
          ``{'list': [vod]}``, or ``{}`` when no episode could be found.
          Episodes from the API are listed under play source ``'CCTV'``
          (guid per episode), episodes scraped from the ``lastVideo`` page
          under ``'央视'``, and search results under ``'中央台'``.
      """
      # Pad missing trailing fields so short ids do not raise IndexError.
      # The brief may be missing because long GET requests get truncated.
      fields = array[0].split('||')
      fields += [''] * (8 - len(fields))
      tid, title, lastVideo, logo, vid, vod_year, actors, brief = fields[:8]

      # Fallback patterns used when the API returns no episodes.
      album_brief_first = r"'title':\s*'(?P<title>.+?)',\n{0,1}\s*'brief':\s*'(.+?)',\n{0,1}\s*'img':\s*'(.+?)',\n{0,1}\s*'url':\s*'(?P<url>.+?)'"
      fallback_patterns = {
          "电视剧": album_brief_first,
          "纪录片": album_brief_first,
          "特别节目": r'class="tp1"><a\s*href="(?P<url>https://.+?)"\s*target="_blank"\s*title="(?P<title>.+?)"></a></div>',
          "动画片": r"'title':\s*'(?P<title>.+?)',\n{0,1}\s*'img':\s*'(.+?)',\n{0,1}\s*'brief':\s*'(.+?)',\n{0,1}\s*'url':\s*'(?P<url>.+?)'",
          "栏目大全": r'href="(?P<url>.+?)" target="_blank" alt="(?P<title>.+?)" title=".+?">',
      }

      fromId = 'CCTV'
      videoList = []
      try:
          if tid == "搜索":
              fromId = '中央台'
              videoList = [title + "$" + lastVideo]
          else:
              if tid == "栏目大全":
                  # Resolve the column id from the guid, then list the
                  # column's videos (fetched once; it used to be fetched twice).
                  lastUrl = 'https://api.cntv.cn/video/videoinfoByGuid?guid={0}&serviceId=tvcctv'.format(vid)
                  htmlTxt = self.webReadFile(urlStr=lastUrl, header=self.header)
                  topicId = json.loads(htmlTxt)['ctid']
                  Url = "https://api.cntv.cn/NewVideo/getVideoListByColumn?id={0}&d=&p=1&n=100&sort=desc&mode=0&serviceId=tvcctv&t=json".format(
                      topicId)
              else:
                  Url = 'https://api.cntv.cn/NewVideo/getVideoListByAlbumIdNew?id={0}&serviceId=tvcctv&p=1&n=100&mode=0&pub=1'.format(
                      vid)
              htmlTxt = self.webReadFile(urlStr=Url, header=self.header)
              jRoot = json.loads(htmlTxt)
              videoList = self.get_EpisodesList(jsonList=jRoot['data']['list'])
              if len(videoList) < 1:
                  htmlTxt = self.webReadFile(urlStr=lastVideo, header=self.header)
                  videoList = self.get_EpisodesList_re(
                      htmlTxt=htmlTxt, patternTxt=fallback_patterns[tid])
                  fromId = '央视'
      except Exception as e:
          # Best effort: any network/parse failure yields an empty result
          # below, but the cause is no longer discarded silently.
          print(f'detailContent failed for {tid}: {e}')

      if len(videoList) == 0:
          return {}
      vod = {
          "vod_id": array[0],
          "vod_name": title,
          "vod_pic": logo,
          "type_name": tid,
          "vod_year": vod_year,
          "vod_area": "",
          "vod_remarks": '',
          "vod_actor": actors,
          "vod_director": '',
          "vod_content": brief,
          'vod_play_from': fromId,
          'vod_play_url': "#".join(videoList),
      }
      return {'list': [vod]}
  def get_lineList(self, Txt, mark, after):
      """Cut ``Txt`` into segments that start at ``mark`` and end before ``after``.

      NOTE: a method with the same name is defined again later in this
      class; the later definition is the one that takes effect.

      Args:
          Txt: Text to scan.
          mark: Substring that starts a segment (kept in the segment).
          after: Substring that ends a segment (not included).

      Returns:
          List of segments in order of appearance. If ``after`` is missing
          for the last segment, the rest of ``Txt`` is returned for it.
      """
      circuit = []
      origin = Txt.find(mark)
      # NOTE(review): the threshold 8 (rather than -1) is kept from the
      # original; a mark found within the first 9 characters ends the scan.
      # Confirm whether that is intended.
      while origin > 8:
          end = Txt.find(after, origin)
          if end < 0:
              # No terminator: keep the whole tail instead of dropping its
              # last character via the [origin:-1] slice, then stop.
              circuit.append(Txt[origin:])
              break
          circuit.append(Txt[origin:end])
          # Always move forward, so a terminator located exactly at
          # ``origin`` cannot make the loop spin forever.
          origin = Txt.find(mark, max(end, origin + 1))
      return circuit
  def get_RegexGetTextLine(self, Text, RegexText, Index):
      """Return all matches of ``RegexText`` in ``Text`` as a list.

      The pattern is applied with ``re.M | re.S``; the items are whatever
      ``findall`` yields. ``Index`` is accepted but not used.
      """
      return list(re.findall(RegexText, Text, re.M | re.S))
  def searchContent(self, key, quick):
      """Search CCTV videos for ``key`` and return ``{'list': videos}``.

      The keyword is URL-quoted, the first result page (20 items) is
      downloaded with ``webReadFile`` and handed to ``get_list_search``.
      ``quick`` is accepted but not used.
      """
      quoted_key = urllib.parse.quote(key)
      search_url = 'https://search.cctv.com/ifsearch.php?page=1&qtext={0}&sort=relevance&pageSize=20&type=video&vtime=-1&datepid=1&channel=&pageflag=0&qtext_str={0}'.format(
          quoted_key)
      page_text = self.webReadFile(urlStr=search_url, header=self.header)
      return {'list': self.get_list_search(html=page_text, tid='搜索')}
  def playerContent(self, flag, id, vipFlags):
      """Resolve the playable URL for one episode.

      Args:
          flag: Play source. For ``'CCTV'`` ``id`` is passed to ``get_m3u8``
              directly; otherwise ``id`` is a page URL from which the guid
              is extracted first.
          id: Episode id as produced by ``detailContent``.
          vipFlags: Not used by this method.

      Returns:
          dict with ``parse`` (1 = sniff the page, 0 = play directly),
          ``playUrl``, ``url`` and ``header``. When no ``https:`` stream
          URL is obtained, ``url`` falls back to ``id`` with ``parse = 1``.
      """
      headers = {
          'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
      }
      parse = 0
      if flag == 'CCTV':
          url = self.get_m3u8(urlTxt=id)
      else:
          try:
              html = self.webReadFile(urlStr=id, header=self.header)
              guid = self.get_RegexGetText(Text=html, RegexText=r'var\sguid\s*=\s*"(.+?)";', Index=1)
              url = self.get_m3u8(urlTxt=guid)
          except Exception:
              # Was a bare ``except:``, which also swallowed
              # KeyboardInterrupt and SystemExit.
              url = id
              parse = 1
      if url.find('https:') < 0:
          url = id
          parse = 1
      return {
          "parse": parse,  # 1 = sniff, 0 = play
          "playUrl": '',
          "url": url,
          "header": headers,
      }
  437. # 分类抓取地址:
  438. # 栏目大全:https://tv.cctv.com/lm/index.shtml?spm=C28340.Pu9TN9YUsfNZ.E2PQtIunpEaz.24
  439. # 电视剧:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=dsj&datafl=&datadq=&fc=%E7%94%B5%E8%A7%86%E5%89%A7&datanf=&dataszm=
  440. # 动画片:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=dhp&datafl=&datadq=&fc=%E5%8A%A8%E7%94%BB%E7%89%87&dataszm=
  441. # 记录片:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=jlp&datapd=&datafl=&fc=%E7%BA%AA%E5%BD%95%E7%89%87&datanf=&dataszm=
  442. # 特别节目:https://tv.cctv.com/yxg/index.shtml?spm=C28340.PlFTqGe6Zk8M.E2PQtIunpEaz.65#datacid=tbjm&datapd=&datafl=&fc=%E7%89%B9%E5%88%AB%E8%8A%82%E7%9B%AE&datajr=&dataszm=
  # Static spider configuration. "filter" maps each category name to its
  # filter groups; each group has a "key", a display "name" and a list of
  # {"n": label, "v": value} options. The regular option lists (years,
  # letters, label == value) are generated instead of being spelled out.
  config = {
      "player": {},
      "filter": {
          "电视剧": [
              {"key": "datafl-sc", "name": "类型",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": t, "v": t} for t in (
                       "谍战", "悬疑", "刑侦", "历史", "古装", "武侠", "军旅", "战争", "喜剧",
                       "青春", "言情", "偶像", "家庭", "年代", "革命", "农村", "都市", "其他")]},
              {"key": "datadq-area", "name": "地区",
               "value": [{"n": n, "v": v} for n, v in (
                   ("全部", ""), ("中国大陆", "中国大陆"), ("中国香港", "香港"),
                   ("美国", "美国"), ("欧洲", "欧洲"), ("泰国", "泰国"))]},
              {"key": "datanf-year", "name": "年份",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": str(y), "v": str(y)} for y in range(2023, 1996, -1)]},
              {"key": "dataszm-letter", "name": "字母",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": c, "v": c} for c in [*"ACEFGHIJKLMNOPQRSTUVWXYZ", "0-9"]]},
          ],
          "动画片": [
              {"key": "datafl-sc", "name": "类型",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": t, "v": t} for t in (
                       "亲子", "搞笑", "冒险", "动作", "宠物", "体育", "益智", "历史", "教育",
                       "校园", "言情", "武侠", "经典", "未来", "古代", "神话", "真人", "励志",
                       "热血", "奇幻", "童话", "剧情", "夺宝", "其他")]},
              {"key": "datadq-area", "name": "地区",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": a, "v": a} for a in ("中国大陆", "美国", "欧洲")]},
              {"key": "dataszm-letter", "name": "字母",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": c, "v": c} for c in [*"ACEFGHIJKLMNOPQRSTUVWXYZ", "0-9"]]},
          ],
          "纪录片": [
              {"key": "datapd-channel", "name": "频道",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": ch, "v": ch} for ch in (
                       "CCTV{1 综合", "CCTV{2 财经", "CCTV{3 综艺", "CCTV{4 中文国际",
                       "CCTV{5 体育", "CCTV{6 电影", "CCTV{7 国防军事", "CCTV{8 电视剧",
                       "CCTV{9 纪录", "CCTV{10 科教", "CCTV{11 戏曲", "CCTV{12 社会与法",
                       "CCTV{13 新闻", "CCTV{14 少儿", "CCTV{15 音乐", "CCTV{17 农业农村")]},
              {"key": "datafl-sc", "name": "类型",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": t, "v": t} for t in (
                       "人文历史", "人物", "军事", "探索", "社会", "时政", "经济", "科技")]},
              {"key": "datanf-year", "name": "年份",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": str(y), "v": str(y)} for y in range(2023, 2007, -1)]},
              {"key": "dataszm-letter", "name": "字母",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": c, "v": c} for c in [*"ACEFGHIJKLMNOPQRSTUVWXYZ", "0-9"]]},
          ],
          "特别节目": [
              {"key": "datapd-channel", "name": "频道",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": ch, "v": ch} for ch in (
                       "CCTV{1 综合", "CCTV{2 财经", "CCTV{3 综艺", "CCTV{4 中文国际",
                       "CCTV{5 体育", "CCTV{6 电影", "CCTV{7 国防军事", "CCTV{8 电视剧",
                       "CCTV{9 纪录", "CCTV{10 科教", "CCTV{11 戏曲", "CCTV{12 社会与法",
                       "CCTV{13 新闻", "CCTV{14 少儿", "CCTV{15 音乐", "CCTV{17 农业农村")]},
              # The second "全部" entry (value "全部") is part of the original data.
              {"key": "datafl-sc", "name": "类型",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": t, "v": t} for t in (
                       "全部", "新闻", "经济", "综艺", "体育", "军事", "影视", "科教",
                       "戏曲", "青少", "音乐", "社会", "公益", "其他")]},
              {"key": "dataszm-letter", "name": "字母",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": c, "v": c} for c in [*"ACEFGHIJKLMNOPQRSTUVWXYZ", "0-9"]]},
          ],
          "栏目大全": [
              {"key": "cid", "name": "频道",
               "value": [{"n": n, "v": v} for n, v in (
                   ("全部", ""),
                   ("CCTV-1综合", "EPGC1386744804340101"),
                   ("CCTV-2财经", "EPGC1386744804340102"),
                   ("CCTV-3综艺", "EPGC1386744804340103"),
                   ("CCTV-4中文国际", "EPGC1386744804340104"),
                   ("CCTV-5体育", "EPGC1386744804340107"),
                   ("CCTV-6电影", "EPGC1386744804340108"),
                   ("CCTV-7国防军事", "EPGC1386744804340109"),
                   ("CCTV-8电视剧", "EPGC1386744804340110"),
                   ("CCTV-9纪录", "EPGC1386744804340112"),
                   ("CCTV-10科教", "EPGC1386744804340113"),
                   ("CCTV-11戏曲", "EPGC1386744804340114"),
                   ("CCTV-12社会与法", "EPGC1386744804340115"),
                   ("CCTV-13新闻", "EPGC1386744804340116"),
                   ("CCTV-14少儿", "EPGC1386744804340117"),
                   ("CCTV-15音乐", "EPGC1386744804340118"),
                   ("CCTV-16奥林匹克", "EPGC1634630207058998"),
                   ("CCTV-17农业农村", "EPGC1563932742616872"),
                   ("CCTV-5+体育赛事", "EPGC1468294755566101"))]},
              {"key": "fc", "name": "分类",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": t, "v": t} for t in (
                       "新闻", "体育", "综艺", "健康", "生活", "科教", "经济", "农业", "法治",
                       "军事", "少儿", "动画", "纪实", "戏曲", "音乐", "影视")]},
              {"key": "fl", "name": "字母",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": c, "v": c} for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]},
              {"key": "year", "name": "年份",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": str(y), "v": str(y)} for y in range(2022, 1999, -1)]},
              {"key": "month", "name": "月份",
               "value": [{"n": "全部", "v": ""}] + [
                   {"n": "%02d" % m, "v": "%02d" % m} for m in range(12, 0, -1)]},
          ],
      },
  }
  # Default request headers, passed as ``header`` to the download helpers.
  header = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
      "Host": "tv.cctv.com",
      "Referer": "https://tv.cctv.com/",
  }
  def localProxy(self, param):
      """Return the local-proxy response ``[status, content type, action, body]``.

      ``param`` is accepted but not used.
      """
      # ``action`` was referenced without ever being defined, so every call
      # raised NameError.
      # NOTE(review): the fields below are assumed from the usual spider
      # template (empty url/header/param, 'string' type) - confirm against
      # what the host application expects.
      action = {
          'url': '',
          'header': '',
          'param': '',
          'type': 'string',
          'after': '',
      }
      return [200, "video/MP2T", action, ""]
  628. # -----------------------------------------------自定义函数-----------------------------------------------
  629. # 访问网页
  def webReadFile(self, urlStr, header):
      """Download ``urlStr`` and return the response body decoded as UTF-8.

      ``header`` is accepted but not sent with the request.
      """
      page_request = urllib.request.Request(url=urlStr)
      with urllib.request.urlopen(page_request) as resp:
          return resp.read().decode('utf-8')
  636. # 判断网络地址是否存在
  def TestWebPage(self, urlStr, header):
      """Send a HEAD request to ``urlStr`` and return the HTTP status code.

      ``header`` is accepted but not sent with the request.
      """
      probe = urllib.request.Request(url=urlStr, method='HEAD')
      with urllib.request.urlopen(probe) as resp:
          return resp.getcode()
  643. # 正则取文本
  def get_RegexGetText(self, Text, RegexText, Index):
      """Return group ``Index`` of the first match of ``RegexText`` in ``Text``.

      The search uses ``re.M | re.S``; an empty string is returned when
      nothing matches.
      """
      found = re.search(RegexText, Text, re.M | re.S)
      return found.group(Index) if found is not None else ""
  652. # 取集数
  def get_EpisodesList(self, jsonList):
      """Turn API episode records into ``"title$guid"`` play entries.

      Records whose ``guid`` is empty are left out.
      """
      episodes = []
      for record in jsonList:
          guid, name = record['guid'], record['title']
          if len(guid) != 0:
              episodes.append(name + "$" + guid)
      return episodes
  662. # 取集数
  663. def get_EpisodesList_re(self, htmlTxt, patternTxt):
  664. ListRe = re.finditer(patternTxt, htmlTxt, re.M | re.S)
  665. videos = []
  666. for vod in ListRe:
  667. url = vod.group('url')
  668. title = vod.group('title')
  669. if len(url) == 0:
  670. continue
  671. videos.append(title + "$" + url)
  672. return videos
  673. # 取剧集区
  674. def get_lineList(self, Txt, mark, after):
  675. circuit = []
  676. origin = Txt.find(mark)
  677. while origin > 8:
  678. end = Txt.find(after, origin)
  679. circuit.append(Txt[origin:end])
  680. origin = Txt.find(mark, end)
  681. return circuit
  682. # 正则取文本,返回数组
  683. def get_RegexGetTextLine(self, Text, RegexText, Index):
  684. returnTxt = []
  685. pattern = re.compile(RegexText, re.M | re.S)
  686. ListRe = pattern.findall(Text)
  687. if len(ListRe) < 1:
  688. return returnTxt
  689. for value in ListRe:
  690. returnTxt.append(value)
  691. return returnTxt
  692. # 删除html标签
  693. def removeHtml(self, txt):
  694. soup = re.compile(r'<[^>]+>', re.S)
  695. txt = soup.sub('', txt)
  696. return txt.replace("&nbsp;", " ")
  697. # 取m3u8
  698. def get_m3u8(self, urlTxt):
  699. url = "https://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid={0}".format(urlTxt)
  700. html = self.webReadFile(urlStr=url, header=self.header)
  701. jo = json.loads(html)
  702. link = jo['hls_url'].strip()
  703. html = self.webReadFile(urlStr=link, header=self.header)
  704. content = html.strip()
  705. arr = content.split('\n')
  706. urlPrefix = self.get_RegexGetText(Text=link, RegexText='(http[s]?://[a-zA-z0-9.]+)/', Index=1)
  707. subUrl = arr[-1].split('/')
  708. subUrl[3] = '1200'
  709. subUrl[-1] = '1200.m3u8'
  710. hdUrl = urlPrefix + '/'.join(subUrl)
  711. url = urlPrefix + arr[-1]
  712. hdRsp = self.TestWebPage(urlStr=hdUrl, header=self.header)
  713. if hdRsp == 200:
  714. url = hdUrl
  715. else:
  716. url = ''
  717. return url
  718. # 搜索
  719. def get_list_search(self, html, tid):
  720. jRoot = json.loads(html)
  721. jsonList = jRoot['list']
  722. videos = []
  723. for vod in jsonList:
  724. url = vod['urllink']
  725. title = self.removeHtml(txt=vod['title'])
  726. img = vod['imglink']
  727. id = vod['id']
  728. brief = vod['channel']
  729. year = vod['uploadtime']
  730. if len(url) == 0:
  731. continue
  732. guids = [tid, title, url, img, id, year, '', brief]
  733. guid = "||".join(guids)
  734. videos.append({
  735. "vod_id": guid,
  736. "vod_name": title,
  737. "vod_pic": img,
  738. "vod_remarks": year
  739. })
  740. return videos
  741. def get_list1(self, html, tid):
  742. jRoot = json.loads(html)
  743. videos = []
  744. data = jRoot['response']
  745. if data is None:
  746. return []
  747. jsonList = data['docs']
  748. for vod in jsonList:
  749. id = vod['lastVIDE']['videoSharedCode']
  750. desc = vod['lastVIDE']['videoTitle']
  751. title = vod['column_name']
  752. url = vod['column_website']
  753. img = vod['column_logo']
  754. year = vod['column_playdate']
  755. brief = vod['column_brief']
  756. actors = ''
  757. if len(url) == 0:
  758. continue
  759. guids = [tid, title, url, img, id, year, actors, brief]
  760. guid = "||".join(guids)
  761. # print(vod_id)
  762. videos.append({
  763. "vod_id": guid,
  764. "vod_name": title,
  765. "vod_pic": img,
  766. "vod_remarks": desc.split('》')[1].strip() if '》' in desc else desc.strip()
  767. })
  768. # print(videos)
  769. return videos
  770. # 分类取结果
  771. def get_list(self, html, tid):
  772. jRoot = json.loads(html)
  773. videos = []
  774. data = jRoot['data']
  775. if data is None:
  776. return []
  777. jsonList = data['list']
  778. for vod in jsonList:
  779. url = vod['url']
  780. title = vod['title']
  781. img = vod['image']
  782. id = vod['id']
  783. try:
  784. brief = vod['brief']
  785. except:
  786. brief = ''
  787. try:
  788. year = vod['year']
  789. except:
  790. year = ''
  791. try:
  792. actors = vod['actors']
  793. except:
  794. actors = ''
  795. if len(url) == 0:
  796. continue
  797. guids = [tid, title, url, img, id, year, actors, brief]
  798. guid = "||".join(guids)
  799. # print(vod_id)
  800. videos.append({
  801. "vod_id": guid,
  802. "vod_name": title,
  803. "vod_pic": img,
  804. "vod_remarks": ''
  805. })
  806. return videos
  807. if __name__ == '__main__':
  808. spider = Spider()
  809. spider.init()
  810. spider.init_api_ext_file()
  811. # home_content = spider.homeContent(None)
  812. # print(home_content)
  813. # cate_content = spider.categoryContent('栏目大全', 1, {'cid': 'n'}, {})
  814. # print(cate_content)
  815. # vid = cate_content['list'][0]['vod_id']
  816. # print(vid)
  817. # detail_content = spider.detailContent([vid])
  818. # print(detail_content)
  819. #
  820. # vod_play_from = detail_content['list'][0]['vod_play_from']
  821. # vod_play_url = detail_content['list'][0]['vod_play_url']
  822. # print(vod_play_from, vod_play_url)
  823. # _url = vod_play_url.split('#')[0].split('$')[1]
  824. # print(_url)
  825. # play = spider.playerContent(vod_play_from, _url, None)
  826. # print(play)