@@ -0,0 +1,147 @@
+import hashlib
+import datetime
+import os
+from math import ceil
+from multiprocessing import Process
+
+import requests
+from bs4 import BeautifulSoup as bs
+
+from helpers.logger import Logger
+
+
+class NotYoutube:
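+    """Scrapes channel info from notyoutube.org, caching channel names and video preview images on disk."""
+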
+    def __init__(self, cache_dir: str):
+        self.cache_file = cache_dir + "not_youtube.cache"
+        self.cache_images = cache_dir + "video_previews/"
+        os.makedirs(self.cache_images, exist_ok=True)  # the preview directory must exist before images are saved
+        self.source_cache = {}
+        self.list = self._restore_from_cache()
+
+    def get_channel_name(self, channel_url: str) -> str:
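+        """Return the channel name for a notyoutube.org channel URL, or "" on failure."""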
+        if not NotYoutube._is_not_youtube_url(channel_url):
+            Logger.log_info("link is not for NotYoutube", channel_url)
+            return ""
+
+        Logger.log_info("trying to get info about channel", channel_url)
+        cache_name = self._check_in_cache(channel_url)
+        if cache_name != "":
+            return cache_name
+
+        page_source = requests.get(channel_url, timeout=30)
+        if page_source.status_code != 200:
+            Logger.log_error("can't load info about channel", channel_url)
+            return ""
+
+        # cache the response; it is reused when the channel's videos are loaded
+        self.source_cache[channel_url] = {"src": page_source, "tms": datetime.datetime.now().timestamp()}
+        page_source.encoding = 'utf-8'
+        pre_soup = bs(page_source.text, 'html.parser')
+        pre_name = pre_soup.select("span")
+        if len(pre_name) < 2:  # the name is read from the second <span>, so two are required
+            Logger.log_error("can't get channel name", channel_url)
+            return ""
+        channel_name = pre_name[1].text
+        self._put_in_cache(channel_url, channel_name)
+        return channel_name
+
+    def get_channel_video(self, channel_url: str) -> dict:
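+        """Return a dict mapping video titles to /watch? links and queue preview downloads."""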
+        source = self._get_page_source(channel_url)
+        if source is None:
+            Logger.log_error("can't load channel page", channel_url)
+            return {}
+        source.encoding = 'utf-8'
+        soup = bs(source.text, 'html.parser')
+        video_list = {}
+        prev_download = []
+        for video_box in soup.select("div.h-box"):
+            link = video_box.select("p a")
+            if len(link) == 0:
+                continue
+            link = link[0]
+            if not link['href'].startswith('/watch?'):
+                continue
+            if link.text.strip() == "":
+                continue
+            video_prev = video_box.select('img.thumbnail')
+            if len(video_prev) > 0:
+                prev_download.append({"url": "https://notyoutube.org" + video_prev[0]['src'], "txt": link.text})
+            video_list[link.text] = link['href']
+        self._bulk_save_images(prev_download)
+        return video_list
+
+    def _get_page_source(self, url: str):
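+        """Fetch a page, reusing a recently cached response; return None on failure."""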
+        current_tms = datetime.datetime.now().timestamp()
+        cached = self.source_cache.get(url)
+        # reuse the cached response if it is younger than 5 minutes
+        if cached is not None and current_tms - cached['tms'] < 300:
+            return cached['src']
+        page_source = requests.get(url, timeout=30)
+        if page_source.status_code == 200:
+            self.source_cache[url] = {"src": page_source, "tms": current_tms}
+            return page_source
+        return None
+
+    def _restore_from_cache(self) -> dict:
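+        """Load previously resolved url;name pairs from the cache file."""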
+        result = {}
+        try:
+            with open(self.cache_file, 'r') as channels_list_file:
+                for i in channels_list_file.readlines():
+                    ch_data = i.strip().split(";")
+                    if len(ch_data) != 2:
+                        Logger.log_error("invalid cache data, skipping it", i)
+                        continue
+                    result[ch_data[0]] = ch_data[1]
+            return result
+        except Exception as e:
+            Logger.log_error("error reading from cache", str(e))
+            return result
+
+    def _check_in_cache(self, channel_url: str) -> str:
+        return self.list.get(channel_url, "")
+
+    def _put_in_cache(self, channel_url: str, channel_name: str):
+        self.list[channel_url] = channel_name
+        with open(self.cache_file, "a") as cache_file:
+            # one "url;name" entry per line so _restore_from_cache can parse it back
+            cache_file.write("{0};{1}\n".format(channel_url, channel_name))
+
+    @staticmethod
+    def _is_not_youtube_url(channel_url: str) -> bool:
+        return "notyoutube.org" in channel_url
+
+    def _bulk_save_images(self, images_to_download: list):
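+        """Split the download list into chunks and save each chunk in its own process."""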
+        if len(images_to_download) == 0:
+            return
+        num_processes = 6
+        part_len = ceil(len(images_to_download) / num_processes)
+        chunks = [images_to_download[part_len * k:part_len * (k + 1)] for k in range(num_processes)]
+        for i, ch in enumerate(chunks, start=1):
+            if len(ch) > 0:
+                Process(target=self._save_images, args=(ch, i)).start()
+
+    def _save_images(self, chunk: list, j: int):
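+        """Download the previews in one chunk, skipping images already cached on disk."""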
+        for i in chunk:
+            # previews are cached under the md5 hash of the video title
+            file_path = self.cache_images + hashlib.md5(i['txt'].encode("utf-8")).hexdigest()
+            if os.path.isfile(file_path):
+                continue
+            response = requests.get(i['url'], timeout=30)
+            if response.status_code == 200:
+                with open(file_path, "wb") as image_file:
+                    image_file.write(response.content)
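+
+
+# A minimal usage sketch; the channel path below is hypothetical.
+if __name__ == "__main__":
+    nyt = NotYoutube("/tmp/not_youtube_cache/")
+    print(nyt.get_channel_name("https://notyoutube.org/channel/example"))
+    print(nyt.get_channel_video("https://notyoutube.org/channel/example"))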