web_search.py
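"""
Web search helpers for g4f: query DuckDuckGo, scrape and cache the result
pages, and prepend the formatted results (with [[Number]](Url) citations)
to a user prompt so a model can answer with sources.
"""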

from __future__ import annotations

from aiohttp import ClientSession, ClientTimeout, ClientError
import json
import hashlib
from pathlib import Path
from urllib.parse import urlparse, quote_plus
import datetime
import asyncio

try:
    from duckduckgo_search import DDGS
    from duckduckgo_search.exceptions import DuckDuckGoSearchException
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False

    class DuckDuckGoSearchException(Exception):
        """Fallback so the except clause in get_search_message() stays valid
        when duckduckgo_search is not installed."""

try:
    import spacy
    has_spacy = True
except ImportError:
    has_spacy = False

from typing import Iterator

from ..cookies import get_cookies_dir
from ..providers.response import format_link
from ..errors import MissingRequirementsError
from .. import debug
DEFAULT_INSTRUCTIONS = """
Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after each reference. Example: [[0]](http://google.com)
"""
class SearchResults():
    def __init__(self, results: list, used_words: int):
        self.results = results
        self.used_words = used_words

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search

    def __len__(self) -> int:
        return len(self.results)
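# str(SearchResults(...)) renders each entry as "Title: ...", then the scraped
# page text (or the snippet as a fallback), then a "Source: [[idx]](url)" line.
# That is the same [[Number]](Url) notation DEFAULT_INSTRUCTIONS asks the model
# to cite with; do_search() below prepends this rendering to the user prompt.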
class SearchResultEntry():
    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text
def scrape_text(html: str, max_words: int = None, add_source=True, count_images: int = 2) -> Iterator[str]:
    source = BeautifulSoup(html, "html.parser")
    soup = source
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Zdnet
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()

    image_select = "img[alt][src^=http]:not([alt=''])"
    image_link_select = f"a:has({image_select})"
    yield_words = []
    for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
        if count_images > 0:
            image = paragraph.select_one(image_select)
            if image:
                title = paragraph.get("title") or paragraph.text
                if title:
                    yield f"!{format_link(image['src'], title)}\n"
                    if max_words is not None:
                        max_words -= 10
                    count_images -= 1
                continue
        for line in paragraph.get_text(" ").splitlines():
            words = [word for word in line.split() if word]
            count = len(words)
            if not count:
                continue
            words = " ".join(words)
            if words in yield_words:
                continue
            if max_words:
                max_words -= count
                if max_words <= 0:
                    break
            yield words + "\n"
            yield_words.append(words)

    if add_source:
        canonical_link = source.find("link", rel="canonical")
        if canonical_link and "href" in canonical_link.attrs:
            link = canonical_link["href"]
            domain = urlparse(link).netloc
            yield f"\nSource: [{domain}]({link})"
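# Usage sketch for scrape_text() (assumes `html` already holds a fetched page body):
#
#     text = "".join(scrape_text(html, max_words=1000, add_source=True))
#
# The generator yields one cleaned line at a time, plus up to `count_images`
# markdown image links and an optional "Source: ..." footer, so joining its
# output reproduces the readable article text within the word budget.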
async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None, add_source: bool = False) -> str:
    try:
        bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
        bucket_dir.mkdir(parents=True, exist_ok=True)
        md5_hash = hashlib.md5(url.encode()).hexdigest()
        cache_file = bucket_dir / f"{quote_plus(url.split('?')[0].split('//')[1].replace('/', ' ')[:48])}.{datetime.date.today()}.{md5_hash[:16]}.cache"
        if cache_file.exists():
            return cache_file.read_text(encoding="utf-8")
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text(errors="replace")
                text = "".join(scrape_text(html, max_words, add_source))
                # Cache the scraped text; force UTF-8 so scraped content is written
                # and read back correctly regardless of the system locale.
                with open(cache_file, "w", encoding="utf-8") as f:
                    f.write(text)
                return text
    except (ClientError, asyncio.TimeoutError):
        return
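# fetch_and_scrape() keeps one cache file per URL per calendar day under
# get_cookies_dir()/.scrape_cache/fetch_and_scrape/, keyed by a shortened URL
# slug, today's date, and an MD5 prefix. A new day simply produces a new cache
# file; old files are left on disk and are never pruned by this module.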
async def search(query: str, max_results: int = 5, max_words: int = 2500, backend: str = "auto", add_text: bool = True, timeout: int = 5, region: str = "wt-wt") -> SearchResults:
    if not has_requirements:
        raise MissingRequirementsError('Install "duckduckgo-search" and "beautifulsoup4" package | pip install -U g4f[search]')
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region=region,
            safesearch="moderate",
            timelimit="y",
            max_results=max_results,
            backend=backend,
        ):
            if ".google." in result["href"]:
                continue
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))

        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(timeout)) as session:
                for entry in results:
                    # Split the word budget across entries; guard against max_results == 1.
                    requests.append(fetch_and_scrape(session, entry.url, int(max_words / max(max_results - 1, 1)), False))
                texts = await asyncio.gather(*requests)

        formatted_results = []
        used_words = 0
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            if max_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if left_words < 0:
                    break
            used_words = max_words - left_words
            formatted_results.append(entry)

        return SearchResults(formatted_results, used_words)
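# Usage sketch for search() (assumes duckduckgo-search and beautifulsoup4 are
# installed, e.g. via `pip install -U g4f[search]`, and that network access works):
#
#     results = asyncio.run(search("python asyncio tutorial", max_results=3))
#     print(len(results), results.used_words)
#     print(str(results))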
async def do_search(prompt: str, query: str = None, instructions: str = DEFAULT_INSTRUCTIONS, **kwargs) -> str:
    if query is None:
        query = spacy_get_keywords(prompt)
    json_bytes = json.dumps({"query": query, **kwargs}, sort_keys=True).encode()
    md5_hash = hashlib.md5(json_bytes).hexdigest()
    bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "web_search" / f"{datetime.date.today()}"
    bucket_dir.mkdir(parents=True, exist_ok=True)
    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.cache"
    if cache_file.exists():
        with cache_file.open("r", encoding="utf-8") as f:
            search_results = f.read()
    else:
        search_results = await search(query, **kwargs)
        if search_results.results:
            with cache_file.open("w", encoding="utf-8") as f:
                f.write(str(search_results))
    if instructions:
        new_prompt = f"""
{search_results}
Instruction: {instructions}
User request:
{prompt}
"""
    else:
        new_prompt = f"""
{search_results}
{prompt}
"""
    debug.log(f"Web search: '{query.strip()[:50]}...'")
    if isinstance(search_results, SearchResults):
        debug.log(f"with {len(search_results.results)} Results {search_results.used_words} Words")
    return new_prompt
def get_search_message(prompt: str, raise_search_exceptions=False, **kwargs) -> str:
    try:
        return asyncio.run(do_search(prompt, **kwargs))
    except (DuckDuckGoSearchException, MissingRequirementsError) as e:
        if raise_search_exceptions:
            raise e
        debug.log(f"Couldn't do web search: {e.__class__.__name__}: {e}")
        return prompt
def spacy_get_keywords(text: str):
    if not has_spacy:
        return text

    # Load the spaCy language model
    nlp = spacy.load("en_core_web_sm")

    # Process the query
    doc = nlp(text)

    # Extract keywords based on POS and named entities
    keywords = []
    for token in doc:
        # Filter for nouns, proper nouns, and adjectives
        if token.pos_ in {"NOUN", "PROPN", "ADJ"} and not token.is_stop:
            keywords.append(token.lemma_)

    # Add named entities as keywords
    for ent in doc.ents:
        keywords.append(ent.text)

    # Remove duplicates
    keywords = list(set(keywords))
    #print("Keyword:", keywords)

    #keyword_freq = Counter(keywords)
    #keywords = keyword_freq.most_common()
    #print("Keyword Frequencies:", keywords)

    # Prefer noun chunks (multi-word phrases); this replaces the single-token list above
    keywords = [chunk.text for chunk in doc.noun_chunks if not chunk.root.is_stop]
    #print("Phrases:", keywords)

    # Join into a single query string, since callers such as do_search() treat the
    # query as text (query.strip(), quote_plus(query[:20]), ddgs.text(query, ...)).
    return " ".join(keywords)
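

if __name__ == "__main__":
    # Quick manual test sketch. Because of the relative imports above, run it as a
    # module, e.g. `python -m g4f.tools.web_search "your prompt"` (the exact package
    # path is an assumption about where this file lives), not as a plain script.
    import sys

    demo_prompt = " ".join(sys.argv[1:]) or "What is the latest Python release?"
    # get_search_message() falls back to returning the prompt unchanged if the
    # search dependencies are missing or the DuckDuckGo request fails.
    print(get_search_message(demo_prompt))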