files.py

from __future__ import annotations

import os
import json
from pathlib import Path
from typing import Iterator, Optional, AsyncIterator
from aiohttp import ClientSession, ClientError, ClientResponse, ClientTimeout
import urllib.parse
import time
import zipfile
import asyncio
import hashlib
import base64

try:
    from werkzeug.utils import secure_filename
except ImportError:
    secure_filename = os.path.basename

# Optional document readers: each flag records whether the corresponding
# dependency is available so unsupported formats can be skipped gracefully.
try:
    import PyPDF2
    from PyPDF2.errors import PdfReadError
    has_pypdf2 = True
except ImportError:
    has_pypdf2 = False
try:
    import pdfplumber
    has_pdfplumber = True
except ImportError:
    has_pdfplumber = False
try:
    from pdfminer.high_level import extract_text
    has_pdfminer = True
except ImportError:
    has_pdfminer = False
try:
    from docx import Document
    has_docx = True
except ImportError:
    has_docx = False
try:
    import docx2txt
    has_docx2txt = True
except ImportError:
    has_docx2txt = False
try:
    from odf.opendocument import load
    from odf.text import P
    has_odfpy = True
except ImportError:
    has_odfpy = False
try:
    import ebooklib
    from ebooklib import epub
    has_ebooklib = True
except ImportError:
    has_ebooklib = False
try:
    import pandas as pd  # pandas relies on openpyxl to read .xlsx files
    has_openpyxl = True
except ImportError:
    has_openpyxl = False
try:
    import spacy
    has_spacy = True
except ImportError:
    has_spacy = False
try:
    from bs4 import BeautifulSoup
    has_beautifulsoup4 = True
except ImportError:
    has_beautifulsoup4 = False

from .web_search import scrape_text
from ..cookies import get_cookies_dir
from ..requests.aiohttp import get_connector
from ..providers.asyncio import to_sync_generator
from ..errors import MissingRequirementsError
from .. import debug

PLAIN_FILE_EXTENSIONS = ["txt", "xml", "json", "js", "har", "sh", "py", "php", "css", "yaml", "sql", "log", "csv", "twig", "md"]
PLAIN_CACHE = "plain.cache"
DOWNLOADS_FILE = "downloads.json"
FILE_LIST = "files.txt"

def supports_filename(filename: str) -> bool:
    """Return True if the file type is supported; raise MissingRequirementsError when a needed reader is missing."""
    if filename.endswith(".pdf"):
        if has_pypdf2:
            return True
        elif has_pdfplumber:
            return True
        elif has_pdfminer:
            return True
        raise MissingRequirementsError('Install "pypdf2" requirements | pip install -U g4f[files]')
    elif filename.endswith(".docx"):
        if has_docx:
            return True
        elif has_docx2txt:
            return True
        raise MissingRequirementsError('Install "docx" requirements | pip install -U g4f[files]')
    elif has_odfpy and filename.endswith(".odt"):
        return True
    elif has_ebooklib and filename.endswith(".epub"):
        return True
    elif has_openpyxl and filename.endswith(".xlsx"):
        return True
    elif filename.endswith(".html"):
        if not has_beautifulsoup4:
            raise MissingRequirementsError('Install "beautifulsoup4" requirements | pip install -U g4f[files]')
        return True
    elif filename.endswith(".zip"):
        return True
    elif filename.endswith("package-lock.json") and filename != FILE_LIST:
        return False
    else:
        extension = os.path.splitext(filename)[1][1:]
        if extension in PLAIN_FILE_EXTENSIONS:
            return True
    return False
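
# Usage sketch (illustrative; results follow directly from the checks above):
#   supports_filename("notes.md")   -> True   (plain-text extension)
#   supports_filename("photo.png")  -> False  (no reader available)
#   supports_filename("report.pdf") -> True, or MissingRequirementsError when no
#                                      PDF backend (PyPDF2/pdfplumber/pdfminer) is installed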

def get_bucket_dir(bucket_id: str):
    return os.path.join(get_cookies_dir(), "buckets", bucket_id)

def get_buckets():
    buckets_dir = os.path.join(get_cookies_dir(), "buckets")
    try:
        return [d for d in os.listdir(buckets_dir) if os.path.isdir(os.path.join(buckets_dir, d))]
    except OSError:
        return None

def spacy_refine_chunks(source_iterator):
    if not has_spacy:
        raise MissingRequirementsError('Install "spacy" requirements | pip install -U g4f[files]')
    nlp = spacy.load("en_core_web_sm")
    for page in source_iterator:
        doc = nlp(page)
        # Keep only the two longest sentences of each chunk as a rough summary.
        sentences = list(doc.sents)
        summary = sorted(sentences, key=lambda x: len(x.text), reverse=True)[:2]
        for sent in summary:
            yield sent.text

def get_filenames(bucket_dir: Path):
    files = bucket_dir / FILE_LIST
    if files.exists():
        with files.open('r') as f:
            return [filename.strip() for filename in f.readlines()]
    return []

def stream_read_files(bucket_dir: Path, filenames: list, delete_files: bool = False) -> Iterator[str]:
    for filename in filenames:
        file_path: Path = bucket_dir / filename
        # Skip files that are missing or empty.
        if not file_path.exists() or file_path.lstat().st_size == 0:
            continue
        extension = os.path.splitext(filename)[1][1:]
        if filename.endswith(".zip"):
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(bucket_dir)
                try:
                    yield from stream_read_files(bucket_dir, [f for f in zip_ref.namelist() if supports_filename(f)], delete_files)
                except zipfile.BadZipFile:
                    pass
                finally:
                    if delete_files:
                        for unlink in zip_ref.namelist()[::-1]:
                            filepath = os.path.join(bucket_dir, unlink)
                            if os.path.exists(filepath):
                                if os.path.isdir(filepath):
                                    os.rmdir(filepath)
                                else:
                                    os.unlink(filepath)
            continue
        yield f"```{filename}\n"
        if has_pypdf2 and filename.endswith(".pdf"):
            try:
                reader = PyPDF2.PdfReader(file_path)
                for page_num in range(len(reader.pages)):
                    page = reader.pages[page_num]
                    yield page.extract_text()
            except PdfReadError:
                continue
        if has_pdfplumber and filename.endswith(".pdf"):
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    yield page.extract_text()
        if has_pdfminer and filename.endswith(".pdf"):
            yield extract_text(file_path)
        elif has_docx and filename.endswith(".docx"):
            doc = Document(file_path)
            for para in doc.paragraphs:
                yield para.text
        elif has_docx2txt and filename.endswith(".docx"):
            yield docx2txt.process(file_path)
        elif has_odfpy and filename.endswith(".odt"):
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(P)
            for p in allparas:
                yield p.firstChild.data if p.firstChild else ""
        elif has_ebooklib and filename.endswith(".epub"):
            book = epub.read_epub(file_path)
            for doc_item in book.get_items():
                if doc_item.get_type() == ebooklib.ITEM_DOCUMENT:
                    yield doc_item.get_content().decode(errors='ignore')
        elif has_openpyxl and filename.endswith(".xlsx"):
            df = pd.read_excel(file_path)
            for row in df.itertuples(index=False):
                yield " ".join(str(cell) for cell in row)
        elif has_beautifulsoup4 and filename.endswith(".html"):
            yield from scrape_text(file_path.read_text(errors="ignore"))
        elif extension in PLAIN_FILE_EXTENSIONS:
            yield file_path.read_text(errors="ignore")
        yield "\n```\n\n"

def cache_stream(stream: Iterator[str], bucket_dir: Path) -> Iterator[str]:
    cache_file = bucket_dir / PLAIN_CACHE
    tmp_file = bucket_dir / f"{PLAIN_CACHE}.{time.time()}.tmp"
    if cache_file.exists():
        # Serve the cached plain text instead of re-reading the source files.
        for chunk in read_path_chunked(cache_file):
            yield chunk
        return
    with open(tmp_file, "wb") as f:
        for chunk in stream:
            f.write(chunk.encode(errors="replace"))
            yield chunk
    tmp_file.rename(cache_file)

def is_complete(data: str):
    # A chunk is "complete" when it ends a fenced block and all fences are balanced.
    return data.endswith("\n```\n\n") and data.count("```") % 2 == 0

def read_path_chunked(path: Path):
    with path.open("r", encoding='utf-8') as f:
        current_chunk_size = 0
        buffer = ""
        for line in f:
            current_chunk_size += len(line.encode('utf-8'))
            buffer += line
            if current_chunk_size >= 4096:
                # Flush at a fence boundary if possible, or force a flush past 8 KB.
                if is_complete(buffer) or current_chunk_size >= 8192:
                    yield buffer
                    buffer = ""
                    current_chunk_size = 0
        if current_chunk_size > 0:
            yield buffer
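
# Usage sketch (illustrative): iterate the plain-text cache in ~4 KB pieces.
#   for chunk in read_path_chunked(Path(bucket_dir) / PLAIN_CACHE):
#       ...  # each chunk keeps whole lines and, where possible, ends on a closed ``` fence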

def read_bucket(bucket_dir: Path):
    bucket_dir = Path(bucket_dir)
    cache_file = bucket_dir / PLAIN_CACHE
    spacy_file = bucket_dir / "spacy_0001.cache"
    if not spacy_file.exists():
        yield cache_file.read_text()
    for idx in range(1, 1000):
        spacy_file = bucket_dir / f"spacy_{idx:04d}.cache"
        plain_file = bucket_dir / f"plain_{idx:04d}.cache"
        if spacy_file.exists():
            yield spacy_file.read_text()
        elif plain_file.exists():
            yield plain_file.read_text()
        else:
            break

def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) -> Iterator[str]:
    cache_file = bucket_dir / PLAIN_CACHE
    space_file = Path(bucket_dir) / "spacy_0001.cache"
    part_one = bucket_dir / "plain_0001.cache"
    if not space_file.exists() and not part_one.exists() and cache_file.exists():
        split_file_by_size_and_newline(cache_file, bucket_dir)
    for idx in range(1, 1000):
        part = bucket_dir / f"plain_{idx:04d}.cache"
        tmp_file = Path(bucket_dir) / f"spacy_{idx:04d}.{time.time()}.tmp"
        cache_file = Path(bucket_dir) / f"spacy_{idx:04d}.cache"
        if cache_file.exists():
            with open(cache_file, "r") as f:
                yield f.read()
            continue
        if not part.exists():
            break
        with tmp_file.open("w") as f:
            for chunk in spacy_refine_chunks(read_path_chunked(part)):
                f.write(chunk)
                yield chunk
        tmp_file.rename(cache_file)
        if delete_files:
            part.unlink()

def split_file_by_size_and_newline(input_filename, output_dir, chunk_size_bytes=1024*1024):  # 1 MB
    """Splits a file into chunks of approximately chunk_size_bytes, splitting only at newline characters.

    Args:
        input_filename: Path to the input file.
        output_dir: Directory for the output parts, named '<input stem>_<nnnn><input extension>'.
        chunk_size_bytes: Desired size of each chunk in bytes.
    """
    split_filename = os.path.splitext(os.path.basename(input_filename))
    output_prefix = os.path.join(output_dir, split_filename[0] + "_")
    with open(input_filename, 'r', encoding='utf-8') as infile:
        chunk_num = 1
        current_chunk = ""
        current_chunk_size = 0
        for line in infile:
            current_chunk += line
            current_chunk_size += len(line.encode('utf-8'))
            if current_chunk_size >= chunk_size_bytes:
                if is_complete(current_chunk) or current_chunk_size >= chunk_size_bytes * 2:
                    output_filename = f"{output_prefix}{chunk_num:04d}{split_filename[1]}"
                    with open(output_filename, 'w', encoding='utf-8') as outfile:
                        outfile.write(current_chunk)
                    current_chunk = ""
                    current_chunk_size = 0
                    chunk_num += 1
        # Write the last chunk
        if current_chunk:
            output_filename = f"{output_prefix}{chunk_num:04d}{split_filename[1]}"
            with open(output_filename, 'w', encoding='utf-8') as outfile:
                outfile.write(current_chunk)
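
# Usage sketch (illustrative; the bucket path is hypothetical):
#   split_file_by_size_and_newline("/tmp/bucket/plain.cache", "/tmp/bucket")
# writes /tmp/bucket/plain_0001.cache, /tmp/bucket/plain_0002.cache, ... with each
# part close to 1 MB and always ending on a newline.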

async def get_filename(response: ClientResponse) -> Optional[str]:
    """
    Attempts to extract a filename from an aiohttp response. Prioritizes Content-Disposition, then the URL.

    Args:
        response: The aiohttp ClientResponse object.

    Returns:
        The filename as a string, or None if it cannot be determined.
    """
    content_disposition = response.headers.get('Content-Disposition')
    if content_disposition:
        try:
            filename = content_disposition.split('filename=')[1].strip('"')
            if filename:
                return secure_filename(filename)
        except IndexError:
            pass
    content_type = response.headers.get('Content-Type')
    url = str(response.url)
    if content_type and url:
        extension = await get_file_extension(response)
        if extension:
            parsed_url = urllib.parse.urlparse(url)
            sha256_hash = hashlib.sha256(url.encode()).digest()
            base32_encoded = base64.b32encode(sha256_hash).decode()
            url_hash = base32_encoded[:24].lower()
            return f"{parsed_url.netloc}+{parsed_url.path[1:].replace('/', '_')}+{url_hash}{extension}"
    return None
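
# Example of the URL fallback naming scheme (hash abbreviated for illustration):
#   https://example.com/docs/page served as text/html with no Content-Disposition
#   -> "example.com+docs_page+<24-char base32 of sha256(url)>.html"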

async def get_file_extension(response: ClientResponse):
    """
    Attempts to determine the file extension from an aiohttp response. Handles common content types.

    Args:
        response: The aiohttp ClientResponse object.

    Returns:
        The file extension (e.g., ".html", ".json", ".pdf", ".zip", ".md", ".txt") as a string,
        or None if it cannot be determined.
    """
    content_type = response.headers.get('Content-Type')
    if content_type:
        if "html" in content_type.lower():
            return ".html"
        elif "json" in content_type.lower():
            return ".json"
        elif "pdf" in content_type.lower():
            return ".pdf"
        elif "zip" in content_type.lower():
            return ".zip"
        elif "text/plain" in content_type.lower():
            return ".txt"
        elif "markdown" in content_type.lower():
            return ".md"
    # Fall back to the extension in the URL path.
    url = str(response.url)
    if url:
        return Path(url).suffix.lower()
    return None

def read_links(html: str, base: str) -> set[str]:
    soup = BeautifulSoup(html, "html.parser")
    # Narrow the document to its main content area when a known container is present.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    urls = []
    for link in soup.select("a"):
        if "rel" not in link.attrs or "nofollow" not in link.attrs["rel"]:
            url = link.attrs.get("href")
            if url and (url.startswith("https://") or url.startswith("/")):
                urls.append(url.split("#")[0])
    return set([urllib.parse.urljoin(base, link) for link in urls])
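
# Usage sketch (hypothetical HTML; nofollow links and URL fragments are dropped):
#   read_links('<main><a href="/about">About</a> <a rel="nofollow" href="/x">x</a></main>',
#              "https://example.com")
#   -> {"https://example.com/about"}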

async def download_urls(
    bucket_dir: Path,
    urls: list[str],
    max_depth: int = 1,
    loading_urls: set[str] = set(),
    lock: asyncio.Lock = None,
    delay: int = 3,
    new_urls: list[str] = list(),
    group_size: int = 5,
    timeout: int = 10,
    proxy: Optional[str] = None
) -> AsyncIterator[str]:
    if lock is None:
        lock = asyncio.Lock()
    async with ClientSession(
        connector=get_connector(proxy=proxy),
        timeout=ClientTimeout(timeout)
    ) as session:
        async def download_url(url: str) -> Optional[str]:
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    filename = await get_filename(response)
                    if not filename:
                        print(f"Failed to get filename for {url}")
                        return None
                    if not supports_filename(filename) or filename == DOWNLOADS_FILE:
                        return None
                    if filename.endswith(".html") and max_depth > 0:
                        # Collect links for the next crawl level, skipping URLs already queued.
                        add_urls = read_links(await response.text(), str(response.url))
                        if add_urls:
                            async with lock:
                                add_urls = [add_url for add_url in add_urls if add_url not in loading_urls]
                                [loading_urls.add(add_url) for add_url in add_urls]
                                [new_urls.append(add_url) for add_url in add_urls if add_url not in new_urls]
                    target = bucket_dir / filename
                    with target.open("wb") as f:
                        async for chunk in response.content.iter_chunked(4096):
                            if b'<link rel="canonical"' not in chunk:
                                # Inject a canonical link so the source URL is preserved in the saved HTML.
                                f.write(chunk.replace(b'</head>', f'<link rel="canonical" href="{response.url}">\n</head>'.encode()))
                            else:
                                f.write(chunk)
                    return filename
            except (ClientError, asyncio.TimeoutError) as e:
                debug.log(f"Download failed: {e.__class__.__name__}: {e}")
                return None
        for filename in await asyncio.gather(*[download_url(url) for url in urls]):
            if filename:
                yield filename
            else:
                await asyncio.sleep(delay)
        # Crawl newly discovered links breadth-first until max_depth is exhausted.
        while new_urls:
            next_urls = list()
            for i in range(0, len(new_urls), group_size):
                chunked_urls = new_urls[i:i + group_size]
                async for filename in download_urls(bucket_dir, chunked_urls, max_depth - 1, loading_urls, lock, delay + 1, next_urls):
                    yield filename
                await asyncio.sleep(delay)
            new_urls = next_urls
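
# Usage sketch (illustrative; bucket id and URL are hypothetical):
#   async def _crawl():
#       bucket = Path(get_bucket_dir("my-bucket"))
#       bucket.mkdir(parents=True, exist_ok=True)
#       async for saved in download_urls(bucket, ["https://example.com/docs"], max_depth=1):
#           print("saved", saved)
#   asyncio.run(_crawl())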

def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator[dict]:
    download_file = bucket_dir / DOWNLOADS_FILE
    if download_file.exists():
        with download_file.open('r') as f:
            data = json.load(f)
        if delete_files:
            download_file.unlink()
        if isinstance(data, list):
            for item in data:
                if "url" in item:
                    yield {"urls": [item.pop("url")], **item}
                elif "urls" in item:
                    yield item
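
# Expected downloads.json layout (illustrative):
#   [
#       {"url": "https://example.com/a.pdf"},
#       {"urls": ["https://example.com/b.html"], "max_depth": 2}
#   ]
# Single-"url" entries are normalized to {"urls": [...]}; any extra keys are passed
# through to download_urls() as keyword arguments.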

def read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> Iterator[str]:
    urls = get_downloads_urls(bucket_dir)
    if urls:
        count = 0
        with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
            for url in urls:
                for filename in to_sync_generator(download_urls(bucket_dir, **url)):
                    f.write(f"{filename}\n")
                    if event_stream:
                        count += 1
                        yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'

async def async_read_and_download_urls(bucket_dir: Path, event_stream: bool = False) -> AsyncIterator[str]:
    urls = get_downloads_urls(bucket_dir)
    if urls:
        count = 0
        with open(os.path.join(bucket_dir, FILE_LIST), 'a') as f:
            for url in urls:
                async for filename in download_urls(bucket_dir, **url):
                    f.write(f"{filename}\n")
                    if event_stream:
                        count += 1
                        yield f'data: {json.dumps({"action": "download", "count": count})}\n\n'

def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_with_spacy: bool = False, event_stream: bool = False) -> Iterator[str]:
    size = 0
    if refine_chunks_with_spacy:
        for chunk in stream_read_parts_and_refine(bucket_dir, delete_files):
            if event_stream:
                size += len(chunk)
                yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n'
            else:
                yield chunk
    else:
        streaming = stream_read_files(bucket_dir, get_filenames(bucket_dir), delete_files)
        streaming = cache_stream(streaming, bucket_dir)
        for chunk in streaming:
            if event_stream:
                size += len(chunk)
                yield f'data: {json.dumps({"action": "load", "size": size})}\n\n'
            else:
                yield chunk
    files_txt = os.path.join(bucket_dir, FILE_LIST)
    if delete_files and os.path.exists(files_txt):
        for filename in get_filenames(bucket_dir):
            if os.path.exists(os.path.join(bucket_dir, filename)):
                os.remove(os.path.join(bucket_dir, filename))
        os.remove(files_txt)
        if event_stream:
            yield f'data: {json.dumps({"action": "delete_files"})}\n\n'
    if event_stream:
        yield f'data: {json.dumps({"action": "done", "size": size})}\n\n'
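
# With event_stream=True, stream_chunks() and the streaming helpers below emit
# Server-Sent-Events lines instead of raw text, e.g. (sizes are illustrative):
#   data: {"action": "load", "size": 4096}
#   data: {"action": "done", "size": 123456}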

def get_streaming(bucket_dir: str, delete_files=False, refine_chunks_with_spacy=False, event_stream: bool = False) -> Iterator[str]:
    bucket_dir = Path(bucket_dir)
    bucket_dir.mkdir(parents=True, exist_ok=True)
    try:
        yield from read_and_download_urls(bucket_dir, event_stream)
        yield from stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream)
    except Exception as e:
        if event_stream:
            yield f'data: {json.dumps({"error": {"message": str(e)}})}\n\n'
        raise e

async def get_async_streaming(bucket_dir: str, delete_files=False, refine_chunks_with_spacy=False, event_stream: bool = False) -> AsyncIterator[str]:
    bucket_dir = Path(bucket_dir)
    bucket_dir.mkdir(parents=True, exist_ok=True)
    try:
        async for chunk in async_read_and_download_urls(bucket_dir, event_stream):
            yield chunk
        for chunk in stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream):
            yield chunk
    except Exception as e:
        if event_stream:
            yield f'data: {json.dumps({"error": {"message": str(e)}})}\n\n'
        raise e
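
# Usage sketch (illustrative; the bucket id is hypothetical):
#   bucket_dir = get_bucket_dir("my-bucket")
#   for chunk in get_streaming(bucket_dir, delete_files=False, event_stream=False):
#       print(chunk, end="")
# Downloads any URLs queued in downloads.json first, then streams the cached
# plain-text contents of every supported file in the bucket.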