web_search.py

from __future__ import annotations

import json
import hashlib
import asyncio
import datetime
from pathlib import Path
from urllib.parse import urlparse, quote_plus
from typing import Iterator

from aiohttp import ClientSession, ClientTimeout, ClientError

try:
    from duckduckgo_search import DDGS
    from duckduckgo_search.exceptions import DuckDuckGoSearchException
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False
    # Fallback so the except clause in get_search_message still resolves
    # when the optional search dependencies are missing.
    class DuckDuckGoSearchException(Exception):
        pass
try:
    import spacy
    has_spacy = True
except ImportError:
    has_spacy = False

from ..cookies import get_cookies_dir
from ..errors import MissingRequirementsError
from .. import debug
DEFAULT_INSTRUCTIONS = """
Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)
"""
class SearchResults:
    def __init__(self, results: list, used_words: int):
        self.results = results
        self.used_words = used_words
    def __iter__(self):
        yield from self.results
    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search
    def __len__(self) -> int:
        return len(self.results)
class SearchResultEntry:
    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text
    def set_text(self, text: str):
        self.text = text
def scrape_text(html: str, max_words: int = None, add_source: bool = True) -> Iterator[str]:
    source = BeautifulSoup(html, "html.parser")
    # Narrow the document to its main content area if a known container is present.
    soup = source
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Zdnet: drop the global disclosure banner.
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()
    for paragraph in soup.select("p, table:not(:has(p)), ul:not(:has(p)), h1, h2, h3, h4, h5, h6"):
        if max_words is not None and max_words <= 0:
            # Word budget exhausted: stop scraping entirely.
            break
        for line in paragraph.text.splitlines():
            words = [word for word in line.replace("\t", " ").split(" ") if word]
            count = len(words)
            if not count:
                continue
            if max_words is not None:
                max_words -= count
                if max_words <= 0:
                    break
            yield " ".join(words) + "\n"
    if add_source:
        canonical_link = source.find("link", rel="canonical")
        if canonical_link and "href" in canonical_link.attrs:
            link = canonical_link["href"]
            domain = urlparse(link).netloc
            yield f"\nSource: [{domain}]({link})"
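# Usage sketch (assumption: `html` already holds a fetched page as a string):
#
#     text = "".join(scrape_text(html, max_words=500, add_source=True))
#
# The generator yields one cleaned line per paragraph-like element and, optionally,
# a trailing "Source: [domain](url)" line taken from the page's canonical link.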
async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None, add_source: bool = False) -> str | None:
    try:
        # Cache scraped pages on disk, keyed by a readable URL slug, the current date and an MD5 of the URL.
        bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
        bucket_dir.mkdir(parents=True, exist_ok=True)
        md5_hash = hashlib.md5(url.encode()).hexdigest()
        cache_file = bucket_dir / f"{url.split('?')[0].split('//')[1].replace('/', '+')[:16]}.{datetime.date.today()}.{md5_hash}.txt"
        if cache_file.exists():
            return cache_file.read_text()
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                text = "".join(scrape_text(html, max_words, add_source))
                with open(cache_file, "w") as f:
                    f.write(text)
                return text
        return None
    except (ClientError, asyncio.TimeoutError):
        return None
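# Usage sketch (assumption: called from a running event loop; the URL is a placeholder):
#
#     async with ClientSession(timeout=ClientTimeout(5)) as session:
#         text = await fetch_and_scrape(session, "https://example.com/article", max_words=500)
#
# Returns cached or freshly scraped text, or None on network errors and non-200 responses.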
async def search(query: str, max_results: int = 5, max_words: int = 2500, backend: str = "auto", add_text: bool = True, timeout: int = 5, region: str = "wt-wt") -> SearchResults:
    if not has_requirements:
        raise MissingRequirementsError('Install "duckduckgo-search" and "beautifulsoup4" package | pip install -U g4f[search]')
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region=region,
            safesearch="moderate",
            timelimit="y",
            max_results=max_results,
            backend=backend,
        ):
            if ".google." in result["href"]:
                continue
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))
        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(timeout)) as session:
                for entry in results:
                    # Split the word budget across the fetched pages (guard against max_results == 1).
                    requests.append(fetch_and_scrape(session, entry.url, int(max_words / max(max_results - 1, 1)), False))
                texts = await asyncio.gather(*requests)
        formatted_results = []
        used_words = 0
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            if left_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if left_words < 0:
                    break
            used_words = max_words - left_words
            formatted_results.append(entry)
        return SearchResults(formatted_results, used_words)
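# Usage sketch (assumption: the duckduckgo-search and beautifulsoup4 extras are installed
# and network access is available; the query is only an example):
#
#     results = asyncio.run(search("python asyncio tutorial", max_results=3, max_words=1000))
#     print(len(results), results.used_words)
#     print(str(results))  # formatted block with [[index]](url) source markers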
async def do_search(prompt: str, query: str = None, instructions: str = DEFAULT_INSTRUCTIONS, **kwargs) -> str:
    if query is None:
        query = spacy_get_keywords(prompt)
    # Cache the rendered search results per query/arguments and per day.
    json_bytes = json.dumps({"query": query, **kwargs}, sort_keys=True).encode()
    md5_hash = hashlib.md5(json_bytes).hexdigest()
    bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "web_search" / f"{datetime.date.today()}"
    bucket_dir.mkdir(parents=True, exist_ok=True)
    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.txt"
    if cache_file.exists():
        with cache_file.open("r") as f:
            search_results = f.read()
    else:
        search_results = await search(query, **kwargs)
        with cache_file.open("w") as f:
            f.write(str(search_results))
    new_prompt = f"""
{search_results}
Instruction: {instructions}
User request:
{prompt}
"""
    debug.log(f"Web search: '{query.strip()[:50]}...'")
    if isinstance(search_results, SearchResults):
        debug.log(f"with {len(search_results.results)} Results {search_results.used_words} Words")
    return new_prompt
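# Usage sketch (assumption: called from async code; extra kwargs are forwarded to search()):
#
#     prompt_with_sources = await do_search("What is the latest Python release?", max_results=3)
#
# The returned prompt bundles the rendered results, the citation instructions and the
# original user request, and is cached per query and per day under the cookies directory.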
def get_search_message(prompt: str, raise_search_exceptions=False, **kwargs) -> str:
    try:
        return asyncio.run(do_search(prompt, **kwargs))
    except (DuckDuckGoSearchException, MissingRequirementsError) as e:
        if raise_search_exceptions:
            raise e
        debug.log(f"Couldn't do web search: {e.__class__.__name__}: {e}")
        return prompt
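# Usage sketch (synchronous wrapper; falls back to the unmodified prompt when the search
# dependencies are missing or DuckDuckGo raises an error):
#
#     message = get_search_message("What is the latest Python release?", max_results=3)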
def spacy_get_keywords(text: str):
    if not has_spacy:
        return text
    # Load the spaCy language model
    nlp = spacy.load("en_core_web_sm")
    # Process the query
    doc = nlp(text)
    # Extract keywords based on POS and named entities
    keywords = []
    for token in doc:
        # Filter for nouns, proper nouns, and adjectives
        if token.pos_ in {"NOUN", "PROPN", "ADJ"} and not token.is_stop:
            keywords.append(token.lemma_)
    # Add named entities as keywords
    for ent in doc.ents:
        keywords.append(ent.text)
    # Remove duplicates
    keywords = list(set(keywords))
    # Prefer noun-chunk phrases; fall back to the token/entity keywords if no chunks were found
    keywords = [chunk.text for chunk in doc.noun_chunks if not chunk.root.is_stop] or keywords
    # Return a single query string so callers can slice and strip it
    return " ".join(keywords)
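# Usage sketch (assumption: spacy and the en_core_web_sm model are installed; without
# them the input text is returned unchanged):
#
#     query = spacy_get_keywords("How do I parse JSON in Python 3.12?")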