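"""Web search helpers for g4f: query DuckDuckGo, scrape and cache the result
pages, and prepend the collected text (with source links) to a user prompt."""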
from __future__ import annotations

import json
import hashlib
import asyncio
import datetime
from pathlib import Path
from urllib.parse import urlparse, quote_plus
from typing import Iterator

from aiohttp import ClientSession, ClientTimeout, ClientError

try:
    from duckduckgo_search import DDGS
    from duckduckgo_search.exceptions import DuckDuckGoSearchException
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False
try:
    import spacy
    has_spacy = True
except ImportError:
    has_spacy = False

from ..cookies import get_cookies_dir
from ..providers.response import format_link, JsonMixin, Sources
from ..errors import MissingRequirementsError
from .. import debug

DEFAULT_INSTRUCTIONS = """
Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using the [[Number]](Url) notation after each reference. Example: [[0]](http://google.com)
"""
class SearchResults(JsonMixin):
    def __init__(self, results: list, used_words: int):
        self.results = results
        self.used_words = used_words

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            [SearchResultEntry(**item) for item in data["results"]],
            data["used_words"]
        )

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search

    def __len__(self) -> int:
        return len(self.results)

    def get_sources(self) -> Sources:
        return Sources([{"url": result.url, "title": result.title} for result in self.results])

    def get_dict(self):
        return {
            "results": [result.get_dict() for result in self.results],
            "used_words": self.used_words
        }
class SearchResultEntry(JsonMixin):
    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text
def scrape_text(html: str, max_words: int = None, add_source=True, count_images: int = 2) -> Iterator[str]:
    """Yield the readable text of an HTML page, limited to roughly `max_words`,
    including up to `count_images` image links and, optionally, a source link
    taken from the page's canonical URL."""
    source = BeautifulSoup(html, "html.parser")
    soup = source
    # Narrow the document to its main content container if one can be found
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Remove ZDNet boilerplate (.c-globalDisclosure)
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()

    image_select = "img[alt][src^=http]:not([alt='']):not(.avatar):not([width])"
    image_link_select = f"a:has({image_select})"
    yield_words = []
    for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, pre, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
        if count_images > 0:
            image = paragraph.select_one(image_select)
            if image:
                title = str(paragraph.get("title", paragraph.text))
                if title:
                    yield f"!{format_link(image['src'], title)}\n"
                    if max_words is not None:
                        max_words -= 10
                    count_images -= 1
                continue
        for line in paragraph.get_text(" ").splitlines():
            words = [word for word in line.split() if word]
            count = len(words)
            if not count:
                continue
            words = " ".join(words)
            if words in yield_words:
                continue
            if max_words:
                max_words -= count
                if max_words <= 0:
                    break
            yield words + "\n"
            yield_words.append(words)

    if add_source:
        canonical_link = source.find("link", rel="canonical")
        if canonical_link and "href" in canonical_link.attrs:
            link = canonical_link["href"]
            domain = urlparse(link).netloc
            yield f"\nSource: [{domain}]({link})"
async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None, add_source: bool = False) -> str:
    """Download `url` and return its scraped text, caching the result on disk
    (one cache file per URL and day). Returns None on connection errors or timeouts."""
    try:
        bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
        bucket_dir.mkdir(parents=True, exist_ok=True)
        md5_hash = hashlib.md5(url.encode(errors="ignore")).hexdigest()
        cache_file = bucket_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{datetime.date.today()}.{md5_hash[:16]}.cache"
        if cache_file.exists():
            return cache_file.read_text()
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text(errors="replace")
                text = "".join(scrape_text(html, max_words, add_source))
                with open(cache_file, "wb") as f:
                    f.write(text.encode(errors="replace"))
                return text
    except (ClientError, asyncio.TimeoutError):
        return
async def search(query: str, max_results: int = 5, max_words: int = 2500, backend: str = "auto", add_text: bool = True, timeout: int = 5, region: str = "wt-wt") -> SearchResults:
    """Run a DuckDuckGo text search and optionally fetch the full text of each
    result page, keeping the total size within roughly `max_words`."""
    if not has_requirements:
        raise MissingRequirementsError('Install "duckduckgo-search" and "beautifulsoup4" package | pip install -U g4f[search]')
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region=region,
            safesearch="moderate",
            timelimit="y",
            max_results=max_results,
            backend=backend,
        ):
            if ".google." in result["href"]:
                continue
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))

        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(timeout)) as session:
                for entry in results:
                    # Spread the word budget across the fetched pages; guard against
                    # max_results == 1 to avoid a division by zero.
                    requests.append(fetch_and_scrape(session, entry.url, int(max_words / max(max_results - 1, 1)), False))
                texts = await asyncio.gather(*requests)

        formatted_results = []
        used_words = 0
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            if max_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if 0 > left_words:
                    break
            used_words = max_words - left_words
            formatted_results.append(entry)

        return SearchResults(formatted_results, used_words)
async def do_search(prompt: str, query: str = None, instructions: str = DEFAULT_INSTRUCTIONS, **kwargs) -> tuple[str, Sources]:
    """Prepend web search results for `prompt` (or an explicit `query`) and return
    the new prompt together with the collected sources. Results are cached on disk
    per query and day."""
    if instructions and instructions in prompt:
        return prompt, None  # Search results have already been added
    if prompt.startswith("##") and query is None:
        return prompt, None  # No search query available
    if query is None:
        query = prompt.strip().splitlines()[0]  # Use the first line as the search query
    json_bytes = json.dumps({"query": query, **kwargs}, sort_keys=True).encode(errors="ignore")
    md5_hash = hashlib.md5(json_bytes).hexdigest()
    bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "web_search" / f"{datetime.date.today()}"
    bucket_dir.mkdir(parents=True, exist_ok=True)
    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.cache"
    search_results = None
    if cache_file.exists():
        with cache_file.open("r") as f:
            search_results = f.read()
        try:
            search_results = SearchResults.from_dict(json.loads(search_results))
        except json.JSONDecodeError:
            search_results = None
    if search_results is None:
        search_results = await search(query, **kwargs)
        if search_results.results:
            with cache_file.open("w") as f:
                f.write(json.dumps(search_results.get_dict()))
    if instructions:
        new_prompt = f"""
{search_results}
Instruction: {instructions}
User request:
{prompt}
"""
    else:
        new_prompt = f"""
{search_results}
{prompt}
"""
    debug.log(f"Web search: '{query.strip()[:50]}...'")
    debug.log(f"with {len(search_results.results)} results and {search_results.used_words} words")
    return new_prompt, search_results.get_sources()
def get_search_message(prompt: str, raise_search_exceptions=False, **kwargs) -> str:
    try:
        return asyncio.run(do_search(prompt, **kwargs))[0]
    except (DuckDuckGoSearchException, MissingRequirementsError) as e:
        if raise_search_exceptions:
            raise e
        debug.error(f"Couldn't do web search: {e.__class__.__name__}: {e}")
        return prompt
def spacy_get_keywords(text: str):
    if not has_spacy:
        return text

    # Load the spaCy language model
    nlp = spacy.load("en_core_web_sm")

    # Process the query
    doc = nlp(text)

    # Extract keywords based on POS and named entities
    keywords = []
    for token in doc:
        # Filter for nouns, proper nouns, and adjectives
        if token.pos_ in {"NOUN", "PROPN", "ADJ"} and not token.is_stop:
            keywords.append(token.lemma_)

    # Add named entities as keywords
    for ent in doc.ents:
        keywords.append(ent.text)

    # Remove duplicates
    keywords = list(set(keywords))
    #print("Keyword:", keywords)
    #keyword_freq = Counter(keywords)
    #keywords = keyword_freq.most_common()
    #print("Keyword Frequencies:", keywords)

    # Noun chunks supersede the token-level keywords collected above
    keywords = [chunk.text for chunk in doc.noun_chunks if not chunk.root.is_stop]
    #print("Phrases:", keywords)
    return keywords
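
# Illustrative usage sketch (not part of the original module). It assumes the
# optional search dependencies (duckduckgo-search, beautifulsoup4) are installed
# and that network access is available; the query below is a made-up example.
# Because of the relative imports above, run this as a module
# (python -m <your_package>.web_search), not as a standalone script.
if __name__ == "__main__":
    example_prompt = "What is the latest stable Python release?"
    # get_search_message returns the prompt unchanged if the search dependencies
    # are missing or DuckDuckGo raises an exception.
    print(get_search_message(example_prompt, max_results=3, max_words=1000))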