channel.py

from utils.config import get_config, resource_path
from utils.tools import check_url_by_patterns, get_total_urls_from_info_list
from utils.speed import sort_urls_by_speed_and_resolution
import os
from collections import defaultdict
import re
from bs4 import NavigableString
import logging
from logging.handlers import RotatingFileHandler
from opencc import OpenCC

config = get_config()

handler = RotatingFileHandler("result_new.log", encoding="utf-8")
logging.basicConfig(
    handlers=[handler],
    format="%(message)s",
    level=logging.INFO,
)

def get_channel_data_from_file(channels, file):
    """
    Get the channel data from the file
    """
    current_category = ""
    pattern = r"^(.*?),(?!#genre#)(.*?)$"
    for line in file:
        line = line.strip()
        if "#genre#" in line:
            # This is a category line, start a new key in the dictionary.
            current_category = line.split(",")[0]
        else:
            # This is a "name,url" line, add the url to the current channel.
            match = re.search(pattern, line)
            if match is not None:
                name = match.group(1).strip()
                url = match.group(2).strip()
                if url and url not in channels[current_category][name]:
                    channels[current_category][name].append(url)
    return channels

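# A minimal sketch of the source-file format this parser expects (category,
# names and urls below are made up for illustration):
#
#   央视频道,#genre#
#   CCTV1,http://example.com/cctv1.m3u8
#   CCTV1,http://example.com/cctv1_backup.m3u8
#
# Lines containing "#genre#" open a new category; every other line is parsed
# as "<name>,<url>" and appended to channels[category][name].
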
def get_channel_items():
    """
    Get the channel items from the source file
    """
    # Prefer the user-provided source file if it exists, otherwise fall back
    # to the configured (or default) one.
    user_source_file = (
        "user_" + config.source_file
        if os.path.exists("user_" + config.source_file)
        else getattr(config, "source_file", "demo.txt")
    )
    # Do the same for the previous final result file.
    user_final_file = (
        "user_" + config.final_file
        if os.path.exists("user_" + config.final_file)
        else getattr(config, "final_file", "result.txt")
    )
    # Create a dictionary to store the channels.
    channels = defaultdict(lambda: defaultdict(list))
    if os.path.exists(resource_path(user_source_file)):
        with open(resource_path(user_source_file), "r", encoding="utf-8") as file:
            channels = get_channel_data_from_file(channels, file)
    if config.open_use_old_result and os.path.exists(resource_path(user_final_file)):
        with open(resource_path(user_final_file), "r", encoding="utf-8") as file:
            channels = get_channel_data_from_file(channels, file)
    return channels

def format_channel_name(name):
    """
    Normalize a channel name: strip noise words, unify "+" variants,
    map verbose CCTV names to their short form, then lowercase
    """
    if config.open_keep_all:
        return name
    sub_pattern = (
        r"-|_|\((.*?)\)|\[(.*?)\]| |频道|标清|高清|HD|hd|超清|超高|超高清|中央|央视|台"
    )
    name = re.sub(sub_pattern, "", name)
    name = name.replace("plus", "+")
    name = name.replace("PLUS", "+")
    name = name.replace("＋", "+")  # full-width plus sign
    name = name.replace("CCTV1综合", "CCTV1")
    name = name.replace("CCTV2财经", "CCTV2")
    name = name.replace("CCTV3综艺", "CCTV3")
    name = name.replace("CCTV4国际", "CCTV4")
    name = name.replace("CCTV4中文国际", "CCTV4")
    name = name.replace("CCTV4欧洲", "CCTV4")
    name = name.replace("CCTV5体育", "CCTV5")
    name = name.replace("CCTV5+体育赛视", "CCTV5+")
    name = name.replace("CCTV5+体育赛事", "CCTV5+")
    name = name.replace("CCTV5+体育", "CCTV5+")
    name = name.replace("CCTV6电影", "CCTV6")
    name = name.replace("CCTV7军事", "CCTV7")
    name = name.replace("CCTV7军农", "CCTV7")
    name = name.replace("CCTV7农业", "CCTV7")
    name = name.replace("CCTV7国防军事", "CCTV7")
    name = name.replace("CCTV8电视剧", "CCTV8")
    name = name.replace("CCTV9记录", "CCTV9")
    name = name.replace("CCTV9纪录", "CCTV9")
    name = name.replace("CCTV10科教", "CCTV10")
    name = name.replace("CCTV11戏曲", "CCTV11")
    name = name.replace("CCTV12社会与法", "CCTV12")
    name = name.replace("CCTV13新闻", "CCTV13")
    name = name.replace("CCTV新闻", "CCTV13")
    name = name.replace("CCTV14少儿", "CCTV14")
    name = name.replace("CCTV15音乐", "CCTV15")
    name = name.replace("CCTV16奥林匹克", "CCTV16")
    name = name.replace("CCTV17农业农村", "CCTV17")
    name = name.replace("CCTV17农业", "CCTV17")
    return name.lower()

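# For example, with the rules above a raw name like "CCTV-1综合高清" is
# stripped to "CCTV1综合", mapped to "CCTV1" and lowered to "cctv1", while
# "湖南卫视HD" becomes "湖南卫视".
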
def channel_name_is_equal(name1, name2):
    """
    Check whether two channel names are equal after formatting and
    traditional-to-simplified conversion
    """
    if config.open_keep_all:
        return True
    cc = OpenCC("t2s")
    name1_converted = cc.convert(format_channel_name(name1))
    name2_converted = cc.convert(format_channel_name(name2))
    return name1_converted == name2_converted

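# For example, channel_name_is_equal("鳳凰衛視", "凤凰卫视") is True once the
# traditional characters are converted to simplified Chinese, while unrelated
# names remain distinct.
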
def get_channel_results_by_name(name, data):
    """
    Get channel results from data by name
    """
    format_name = format_channel_name(name)
    cc1 = OpenCC("s2t")
    converted1 = cc1.convert(format_name)
    cc2 = OpenCC("t2s")
    converted2 = cc2.convert(format_name)
    result1 = data.get(converted1, [])
    result2 = data.get(converted2, [])
    results = list(dict.fromkeys(result1 + result2))
    return results

def get_element_child_text_list(element, child_name):
    """
    Get the child text of the element
    """
    text_list = []
    children = element.find_all(child_name)
    if children:
        for child in children:
            text = child.get_text(strip=True)
            if text:
                text_list.append(text)
    return text_list

def get_results_from_soup(soup, name):
    """
    Get the results from the soup
    """
    results = []
    for element in soup.descendants:
        if isinstance(element, NavigableString):
            text = element.get_text(strip=True)
            url = get_channel_url(text)
            if url and not any(item[0] == url for item in results):
                url_element = soup.find(lambda tag: tag.get_text(strip=True) == url)
                if url_element:
                    name_element = url_element.find_previous_sibling()
                    if name_element:
                        channel_name = name_element.get_text(strip=True)
                        if channel_name_is_equal(name, channel_name):
                            info_element = url_element.find_next_sibling()
                            # find_next_sibling() may return None, so guard it.
                            date, resolution = get_channel_info(
                                info_element.get_text(strip=True) if info_element else ""
                            )
                            results.append((url, date, resolution))
    return results

def get_results_from_soup_requests(soup, name):
    """
    Get the results from the soup fetched by requests
    """
    results = []
    elements = soup.find_all("div", class_="resultplus") if soup else []
    for element in elements:
        name_element = element.find("div", class_="channel")
        if name_element:
            channel_name = name_element.get_text(strip=True)
            if channel_name_is_equal(name, channel_name):
                text_list = get_element_child_text_list(element, "div")
                url = date = resolution = None
                for text in text_list:
                    text_url = get_channel_url(text)
                    if text_url:
                        url = text_url
                    if " " in text:
                        text_info = get_channel_info(text)
                        date, resolution = text_info
                if url:
                    results.append((url, date, resolution))
    return results

def update_channel_urls_txt(cate, name, urls):
    """
    Append the category and channel urls to the output file
    """
    genre_line = cate + ",#genre#\n"
    filename = "result_new.txt"
    if not os.path.exists(filename):
        open(filename, "w", encoding="utf-8").close()
    with open(filename, "r", encoding="utf-8") as f:
        content = f.read()
    with open(filename, "a", encoding="utf-8") as f:
        if genre_line not in content:
            f.write(genre_line)
        for url in urls:
            if url is not None:
                f.write(name + "," + url + "\n")

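# The output uses the same "category,#genre#" / "name,url" layout that
# get_channel_data_from_file parses, e.g. (illustrative values):
#
#   央视频道,#genre#
#   CCTV1,http://example.com/cctv1.m3u8
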
def get_channel_url(text):
    """
    Get the url from text
    """
    url = None
    url_regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    url_search = re.search(url_regex, text)
    if url_search:
        url = url_search.group()
    return url

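# For example, get_channel_url("CCTV1 http://example.com/live.m3u8 1920x1080")
# returns "http://example.com/live.m3u8" (the url is made up); if no url is
# found, None is returned.
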
def get_channel_info(text):
    """
    Get the channel info from text
    """
    date, resolution = None, None
    if text:
        date, resolution = (
            (text.partition(" ")[0] if text.partition(" ")[0] else None),
            (
                text.partition(" ")[2].partition("•")[2]
                if text.partition(" ")[2].partition("•")[2]
                else None
            ),
        )
    return date, resolution

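# A hypothetical info string such as "2024-01-01 检测•1920x1080" would yield
# date="2024-01-01" (everything before the first space) and
# resolution="1920x1080" (everything after the "•"); missing parts come back
# as None.
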
def init_info_data(data, cate, name):
    """
    Init channel info data
    """
    if data.get(cate) is None:
        data[cate] = {}
    if data[cate].get(name) is None:
        data[cate][name] = []
    return data

def append_data_to_info_data(info_data, cate, name, data, check=True):
    """
    Append channel data to total info data
    """
    info_data = init_info_data(info_data, cate, name)
    for url, date, resolution in data:
        if (url and not check) or (url and check and check_url_by_patterns(url)):
            info_data[cate][name].append((url, date, resolution))
    return info_data

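# After this call, info_data has the shape
#   {category: {channel_name: [(url, date, resolution), ...]}}
# with urls filtered through check_url_by_patterns unless check=False.
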
def append_total_data(*args, **kwargs):
    """
    Append total channel data
    """
    if config.open_keep_all:
        return append_all_method_data_keep_all(*args, **kwargs)
    else:
        return append_all_method_data(*args, **kwargs)

def append_all_method_data(
    items, data, subscribe_result=None, multicast_result=None, online_search_result=None
):
    """
    Append all method data to total info data
    """
    for cate, channel_obj in items:
        for name, old_urls in channel_obj.items():
            for method, result in [
                ("subscribe", subscribe_result),
                ("multicast", multicast_result),
                ("online_search", online_search_result),
            ]:
                if result and getattr(config, f"open_{method}"):
                    results = get_channel_results_by_name(name, result)
                    data = append_data_to_info_data(data, cate, name, results)
                    print(name, f"{method.capitalize()} num:", len(results))
            total_channel_data_len = len(data.get(cate, {}).get(name, []))
            # Fall back to the old urls when nothing was collected or when
            # reusing the old result is enabled.
            if total_channel_data_len == 0 or config.open_use_old_result:
                data = append_data_to_info_data(
                    data,
                    cate,
                    name,
                    [(url, None, None) for url in old_urls],
                )
            print(
                name,
                "total num:",
                len(data.get(cate, {}).get(name, [])),
            )
    return data

def append_all_method_data_keep_all(
    items, data, subscribe_result=None, multicast_result=None, online_search_result=None
):
    """
    Append all method data to total info data, keeping all channel names and urls
    """
    for cate, channel_obj in items:
        for result_name, result in [
            ("subscribe", subscribe_result),
            ("multicast", multicast_result),
            ("online_search", online_search_result),
        ]:
            if result and getattr(config, f"open_{result_name}"):
                for name, urls in result.items():
                    data = append_data_to_info_data(data, cate, name, urls)
                    print(name, f"{result_name.capitalize()} num:", len(urls))
                    if config.open_use_old_result:
                        old_urls = channel_obj.get(name, [])
                        data = append_data_to_info_data(
                            data,
                            cate,
                            name,
                            [(url, None, None) for url in old_urls],
                        )
    return data

async def sort_channel_list(semaphore, cate, name, info_list, callback):
    """
    Sort the channel list
    """
    async with semaphore:
        data = []
        try:
            if info_list:
                sorted_data = await sort_urls_by_speed_and_resolution(info_list)
                if sorted_data:
                    for (url, date, resolution), response_time in sorted_data:
                        logging.info(
                            f"Name: {name}, URL: {url}, Date: {date}, Resolution: {resolution}, Response Time: {response_time} ms"
                        )
                    data = [
                        (url, date, resolution)
                        for (url, date, resolution), _ in sorted_data
                    ]
        except Exception as e:
            logging.error(f"Error: {e}")
        finally:
            callback()
        return {"cate": cate, "name": name, "data": data}

def write_channel_to_file(items, data, callback):
    """
    Write channel to file
    """
    for cate, channel_obj in items:
        for name in channel_obj.keys():
            info_list = data.get(cate, {}).get(name, [])
            try:
                channel_urls = get_total_urls_from_info_list(info_list)
                print("write:", cate, name, "num:", len(channel_urls))
                update_channel_urls_txt(cate, name, channel_urls)
            finally:
                callback()
    # Close and detach the log handlers once all channels have been written.
    for handler in logging.root.handlers[:]:
        handler.close()
        logging.root.removeHandler(handler)

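# A rough sketch of how these helpers fit together (the semaphore, callback and
# *_result variables are placeholders supplied by the caller, not part of this
# module):
#
#   channels = get_channel_items()
#   data = append_total_data(
#       channels.items(), {},
#       subscribe_result, multicast_result, online_search_result,
#   )
#   # ... gather the dicts returned by sort_channel_list(semaphore, cate, name,
#   # info_list, callback) back into `data`, then:
#   write_channel_to_file(channels.items(), data, callback)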