# DDG.py (26 KB)
  1. from __future__ import annotations
  2. import time
  3. from aiohttp import ClientSession, ClientTimeout
  4. import json
  5. import asyncio
  6. import random
  7. import base64
  8. import hashlib
  9. from yarl import URL
  10. from ..typing import AsyncResult, Messages, Cookies
  11. from ..requests.raise_for_status import raise_for_status
  12. from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
  13. from .helper import format_prompt, get_last_user_message
  14. from ..providers.response import FinishReason, JsonConversation
  15. from ..errors import ModelNotSupportedError, ResponseStatusError, RateLimitError, TimeoutError, ConversationLimitError
  16. try:
  17. from bs4 import BeautifulSoup
  18. has_bs4 = True
  19. except ImportError:
  20. has_bs4 = False
  21. class DuckDuckGoSearchException(Exception):
  22. """Base exception class for duckduckgo_search."""
  23. class DuckDuckGoChallengeError(ResponseStatusError):
  24. """Raised when DuckDuckGo presents a challenge that needs to be solved."""
  25. class Conversation(JsonConversation):
  26. vqd: str = None
  27. vqd_hash_1: str = None
  28. message_history: Messages = []
  29. cookies: dict = {}
  30. fe_version: str = None
  31. def __init__(self, model: str):
  32. self.model = model
  33. class DDG(AsyncGeneratorProvider, ProviderModelMixin):
  34. label = "DuckDuckGo AI Chat"
  35. url = "https://duckduckgo.com/aichat"
  36. api_endpoint = "https://duckduckgo.com/duckchat/v1/chat"
  37. status_url = "https://duckduckgo.com/duckchat/v1/status"
  38. working = True
  39. supports_stream = True
  40. supports_system_message = True
  41. supports_message_history = True
  42. default_model = "gpt-4o-mini"
  43. # Model mapping from user-friendly names to API model names
  44. _chat_models = {
  45. "gpt-4": default_model,
  46. "gpt-4o-mini": default_model,
  47. "llama-3.3-70b": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
  48. "claude-3-haiku": "claude-3-haiku-20240307",
  49. "o3-mini": "o3-mini",
  50. "mixtral-small-24b": "mistralai/Mistral-Small-24B-Instruct-2501",
  51. }
  52. # Available models (user-friendly names)
  53. models = list(_chat_models.keys())
  54. last_request_time = 0
  55. max_retries = 3
  56. base_delay = 2
  57. # Class variable to store the x-fe-version across instances
  58. _chat_xfe = ""
  59. @staticmethod
  60. def sha256_base64(text: str) -> str:
  61. """Return the base64 encoding of the SHA256 digest of the text."""
  62. sha256_hash = hashlib.sha256(text.encode("utf-8")).digest()
  63. return base64.b64encode(sha256_hash).decode()
  64. @staticmethod
  65. def parse_dom_fingerprint(js_text: str) -> str:
  66. if not has_bs4:
  67. # Fallback if BeautifulSoup is not available
  68. return "1000"
  69. try:
  70. html_snippet = js_text.split("e.innerHTML = '")[1].split("';")[0]
  71. offset_value = js_text.split("return String(")[1].split(" ")[0]
  72. soup = BeautifulSoup(html_snippet, "html.parser")
  73. corrected_inner_html = soup.body.decode_contents()
  74. inner_html_length = len(corrected_inner_html)
  75. fingerprint = int(offset_value) + inner_html_length
  76. return str(fingerprint)
  77. except Exception:
  78. # Return a fallback value if parsing fails
  79. return "1000"
  80. @staticmethod
  81. def parse_server_hashes(js_text: str) -> list:
  82. try:
  83. return js_text.split('server_hashes: ["', maxsplit=1)[1].split('"]', maxsplit=1)[0].split('","')
  84. except Exception:
  85. # Return a fallback value if parsing fails
  86. return ["1", "2"]
  87. @classmethod
  88. def build_x_vqd_hash_1(cls, vqd_hash_1: str, headers: dict) -> str:
  89. """Build the x-vqd-hash-1 header value."""
  90. try:
  91. # If we received a valid base64 string, try to decode it
  92. if vqd_hash_1 and len(vqd_hash_1) > 20:
  93. try:
  94. # Try to decode and parse as JSON first
  95. decoded_json = json.loads(base64.b64decode(vqd_hash_1).decode())
  96. # If it's already a complete structure with meta, return it as is
  97. if isinstance(decoded_json, dict) and "meta" in decoded_json:
  98. return vqd_hash_1
  99. # Otherwise, extract what we can from it
  100. if isinstance(decoded_json, dict) and "server_hashes" in decoded_json:
  101. server_hashes = decoded_json.get("server_hashes", ["1", "2"])
  102. else:
  103. # Fall back to parsing from string
  104. decoded = base64.b64decode(vqd_hash_1).decode()
  105. server_hashes = cls.parse_server_hashes(decoded)
  106. except (json.JSONDecodeError, UnicodeDecodeError):
  107. # If it's not valid JSON, try to parse it as a string
  108. decoded = base64.b64decode(vqd_hash_1).decode()
  109. server_hashes = cls.parse_server_hashes(decoded)
  110. else:
  111. # Default server hashes if we can't extract them
  112. server_hashes = ["1", "2"]
  113. # Generate fingerprints
  114. dom_fingerprint = "1000" # Default value
  115. ua_fingerprint = headers.get("User-Agent", "") + headers.get("sec-ch-ua", "")
  116. ua_hash = cls.sha256_base64(ua_fingerprint)
  117. dom_hash = cls.sha256_base64(dom_fingerprint)
  118. # Create a challenge ID (random hex string)
  119. challenge_id = ''.join(random.choice('0123456789abcdef') for _ in range(40)) + 'h8jbt'
  120. # Build the complete structure including meta
  121. final_result = {
  122. "server_hashes": server_hashes,
  123. "client_hashes": [ua_hash, dom_hash],
  124. "signals": {},
  125. "meta": {
  126. "v": "1",
  127. "challenge_id": challenge_id,
  128. "origin": "https://duckduckgo.com",
  129. "stack": "Error\nat ke (https://duckduckgo.com/dist/wpm.chat.js:1:29526)\nat async dispatchServiceInitialVQD (https://duckduckgo.com/dist/wpm.chat.js:1:45076)"
  130. }
  131. }
  132. base64_final_result = base64.b64encode(json.dumps(final_result).encode()).decode()
  133. return base64_final_result
  134. except Exception as e:
  135. # If anything fails, return an empty string
  136. return ""
  137. @classmethod
  138. def validate_model(cls, model: str) -> str:
  139. """Validates and returns the correct model name for the API"""
  140. if not model:
  141. return cls.default_model
  142. # Check aliases first
  143. if model in cls.model_aliases:
  144. model = cls.model_aliases[model]
  145. # Check if it's a valid model name
  146. if model not in cls.models:
  147. raise ModelNotSupportedError(f"Model {model} not supported. Available models: {cls.models}")
  148. return model
  149. @classmethod
  150. async def sleep(cls, multiplier=1.0):
  151. """Implements rate limiting between requests"""
  152. now = time.time()
  153. if cls.last_request_time > 0:
  154. delay = max(0.0, 1.5 - (now - cls.last_request_time)) * multiplier
  155. if delay > 0:
  156. await asyncio.sleep(delay)
  157. cls.last_request_time = time.time()
  158. @classmethod
  159. async def get_default_cookies(cls, session: ClientSession) -> dict:
  160. """Obtains default cookies needed for API requests"""
  161. try:
  162. await cls.sleep()
  163. # Make initial request to get cookies
  164. async with session.get(cls.url) as response:
  165. # Set the required cookies
  166. cookies = {}
  167. cookies_dict = {'dcs': '1', 'dcm': '3'}
  168. # Add any cookies from the response
  169. for cookie in response.cookies.values():
  170. cookies[cookie.key] = cookie.value
  171. # Ensure our required cookies are set
  172. for name, value in cookies_dict.items():
  173. cookies[name] = value
  174. url_obj = URL(cls.url)
  175. session.cookie_jar.update_cookies({name: value}, url_obj)
  176. # Make a second request to the status endpoint to get any additional cookies
  177. headers = {
  178. "accept": "text/event-stream",
  179. "accept-language": "en",
  180. "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
  181. "origin": "https://duckduckgo.com",
  182. "referer": "https://duckduckgo.com/",
  183. }
  184. await cls.sleep()
  185. async with session.get(cls.status_url, headers=headers) as status_response:
  186. # Add any cookies from the status response
  187. for cookie in status_response.cookies.values():
  188. cookies[cookie.key] = cookie.value
  189. url_obj = URL(cls.url)
  190. session.cookie_jar.update_cookies({cookie.key: cookie.value}, url_obj)
  191. return cookies
  192. except Exception as e:
  193. # Return at least the required cookies on error
  194. cookies = {'dcs': '1', 'dcm': '3'}
  195. url_obj = URL(cls.url)
  196. for name, value in cookies.items():
  197. session.cookie_jar.update_cookies({name: value}, url_obj)
  198. return cookies
  199. @classmethod
  200. async def fetch_fe_version(cls, session: ClientSession) -> str:
  201. """Fetches the fe-version from the initial page load."""
  202. if cls._chat_xfe:
  203. return cls._chat_xfe
  204. try:
  205. url = "https://duckduckgo.com/?q=DuckDuckGo+AI+Chat&ia=chat&duckai=1"
  206. await cls.sleep()
  207. async with session.get(url) as response:
  208. await raise_for_status(response)
  209. content = await response.text()
  210. # Extract x-fe-version components
  211. try:
  212. # Try to extract the version components
  213. xfe1 = content.split('__DDG_BE_VERSION__="', 1)[1].split('"', 1)[0]
  214. xfe2 = content.split('__DDG_FE_CHAT_HASH__="', 1)[1].split('"', 1)[0]
  215. # Format it like "serp_YYYYMMDD_HHMMSS_ET-hash"
  216. from datetime import datetime
  217. current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
  218. cls._chat_xfe = f"serp_{current_date}_ET-{xfe2}"
  219. return cls._chat_xfe
  220. except Exception:
  221. # Fallback to a default format if extraction fails
  222. from datetime import datetime
  223. current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
  224. cls._chat_xfe = f"serp_{current_date}_ET-78c2e87e3d286691cc21"
  225. return cls._chat_xfe
  226. except Exception:
  227. # Fallback to a default format if request fails
  228. from datetime import datetime
  229. current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
  230. cls._chat_xfe = f"serp_{current_date}_ET-78c2e87e3d286691cc21"
  231. return cls._chat_xfe
  232. @classmethod
  233. async def fetch_vqd_and_hash(cls, session: ClientSession, retry_count: int = 0) -> tuple[str, str]:
  234. """Fetches the required VQD token and hash for the chat session with retries."""
  235. headers = {
  236. "accept": "text/event-stream",
  237. "accept-language": "en",
  238. "cache-control": "no-cache",
  239. "pragma": "no-cache",
  240. "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
  241. "origin": "https://duckduckgo.com",
  242. "referer": "https://duckduckgo.com/",
  243. "x-vqd-accept": "1",
  244. }
  245. # Make sure we have cookies first
  246. if len(session.cookie_jar) == 0:
  247. await cls.get_default_cookies(session)
  248. try:
  249. await cls.sleep(multiplier=1.0 + retry_count * 0.5)
  250. async with session.get(cls.status_url, headers=headers) as response:
  251. await raise_for_status(response)
  252. vqd = response.headers.get("x-vqd-4", "")
  253. vqd_hash_1 = response.headers.get("x-vqd-hash-1", "")
  254. if vqd:
  255. # Return the fetched vqd and vqd_hash_1
  256. return vqd, vqd_hash_1
  257. response_text = await response.text()
  258. raise RuntimeError(f"Failed to fetch VQD token and hash: {response.status} {response_text}")
  259. except Exception as e:
  260. if retry_count < cls.max_retries:
  261. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  262. await asyncio.sleep(wait_time)
  263. return await cls.fetch_vqd_and_hash(session, retry_count + 1)
  264. else:
  265. raise RuntimeError(f"Failed to fetch VQD token and hash after {cls.max_retries} attempts: {str(e)}")
  266. @classmethod
  267. async def create_async_generator(
  268. cls,
  269. model: str,
  270. messages: Messages,
  271. proxy: str = None,
  272. timeout: int = 60,
  273. cookies: Cookies = None,
  274. conversation: Conversation = None,
  275. return_conversation: bool = False,
  276. **kwargs
  277. ) -> AsyncResult:
  278. model = cls.validate_model(model)
  279. retry_count = 0
  280. while retry_count <= cls.max_retries:
  281. try:
  282. session_timeout = ClientTimeout(total=timeout)
  283. async with ClientSession(timeout=session_timeout, cookies=cookies) as session:
  284. # Step 1: Ensure we have the fe_version
  285. if not cls._chat_xfe:
  286. cls._chat_xfe = await cls.fetch_fe_version(session)
  287. # Step 2: Initialize or update conversation
  288. if conversation is None:
  289. # Get initial cookies if not provided
  290. if not cookies:
  291. await cls.get_default_cookies(session)
  292. # Create a new conversation
  293. conversation = Conversation(model)
  294. conversation.fe_version = cls._chat_xfe
  295. # Step 3: Get VQD tokens
  296. vqd, vqd_hash_1 = await cls.fetch_vqd_and_hash(session)
  297. conversation.vqd = vqd
  298. conversation.vqd_hash_1 = vqd_hash_1
  299. conversation.message_history = [{"role": "user", "content": format_prompt(messages)}]
  300. else:
  301. # Update existing conversation with new message
  302. last_message = get_last_user_message(messages.copy())
  303. conversation.message_history.append({"role": "user", "content": last_message})
  304. # Step 4: Prepare headers with proper x-vqd-hash-1
  305. headers = {
  306. "accept": "text/event-stream",
  307. "accept-language": "en",
  308. "cache-control": "no-cache",
  309. "content-type": "application/json",
  310. "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
  311. "origin": "https://duckduckgo.com",
  312. "referer": "https://duckduckgo.com/",
  313. "pragma": "no-cache",
  314. "priority": "u=1, i",
  315. "sec-ch-ua": '"Not:A-Brand";v="24", "Chromium";v="134"',
  316. "sec-ch-ua-mobile": "?0",
  317. "sec-ch-ua-platform": '"Linux"',
  318. "sec-fetch-dest": "empty",
  319. "sec-fetch-mode": "cors",
  320. "sec-fetch-site": "same-origin",
  321. "x-fe-version": conversation.fe_version or cls._chat_xfe,
  322. "x-vqd-4": conversation.vqd,
  323. }
  324. # For the first request, send an empty x-vqd-hash-1 header
  325. # This matches the behavior in the duckduckgo_search module
  326. headers["x-vqd-hash-1"] = ""
  327. # Step 5: Prepare the request data
  328. # Convert the user-friendly model name to the API model name
  329. api_model = cls._chat_models.get(model, model)
  330. data = {
  331. "model": api_model,
  332. "messages": conversation.message_history,
  333. }
  334. # Step 6: Send the request
  335. await cls.sleep(multiplier=1.0 + retry_count * 0.5)
  336. async with session.post(cls.api_endpoint, json=data, headers=headers, proxy=proxy) as response:
  337. # Handle 429 and 418 errors specifically
  338. if response.status == 429:
  339. response_text = await response.text()
  340. if retry_count < cls.max_retries:
  341. retry_count += 1
  342. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  343. await asyncio.sleep(wait_time)
  344. # Get fresh tokens and cookies
  345. cookies = await cls.get_default_cookies(session)
  346. continue
  347. else:
  348. raise RateLimitError(f"Rate limited after {cls.max_retries} retries")
  349. elif response.status == 418:
  350. # Check if it's a challenge error
  351. try:
  352. response_text = await response.text()
  353. try:
  354. response_json = json.loads(response_text)
  355. # Extract challenge data if available
  356. challenge_data = None
  357. if response_json.get("type") == "ERR_CHALLENGE" and "cd" in response_json:
  358. challenge_data = response_json["cd"]
  359. if retry_count < cls.max_retries:
  360. retry_count += 1
  361. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  362. await asyncio.sleep(wait_time)
  363. # Reset tokens and try again with fresh session
  364. conversation = None
  365. cls._chat_xfe = ""
  366. # Get fresh cookies
  367. cookies = await cls.get_default_cookies(session)
  368. # If we have challenge data, try to use it
  369. if challenge_data and isinstance(challenge_data, dict):
  370. # Extract any useful information from challenge data
  371. # This could be used to build a better response in the future
  372. pass
  373. continue
  374. else:
  375. raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries")
  376. except json.JSONDecodeError:
  377. # If we can't parse the JSON, assume it's a challenge error anyway
  378. if retry_count < cls.max_retries:
  379. retry_count += 1
  380. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  381. await asyncio.sleep(wait_time)
  382. # Reset tokens and try again with fresh session
  383. conversation = None
  384. cls._chat_xfe = ""
  385. cookies = await cls.get_default_cookies(session)
  386. continue
  387. else:
  388. raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries")
  389. except Exception as e:
  390. # If any other error occurs during handling, still try to recover
  391. if retry_count < cls.max_retries:
  392. retry_count += 1
  393. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  394. await asyncio.sleep(wait_time)
  395. # Reset tokens and try again with fresh session
  396. conversation = None
  397. cls._chat_xfe = ""
  398. cookies = await cls.get_default_cookies(session)
  399. continue
  400. else:
  401. raise DuckDuckGoChallengeError(f"Challenge error after {cls.max_retries} retries: {str(e)}")
  402. # For other status codes, use the standard error handler
  403. await raise_for_status(response)
  404. reason = None
  405. full_message = ""
  406. # Step 7: Process the streaming response
  407. async for line in response.content:
  408. line = line.decode("utf-8").strip()
  409. if line.startswith("data:"):
  410. try:
  411. message = json.loads(line[5:].strip())
  412. except json.JSONDecodeError:
  413. continue
  414. if "action" in message and message["action"] == "error":
  415. error_type = message.get("type", "")
  416. if message.get("status") == 429:
  417. if error_type == "ERR_CONVERSATION_LIMIT":
  418. raise ConversationLimitError(error_type)
  419. raise RateLimitError(error_type)
  420. elif message.get("status") == 418 and error_type == "ERR_CHALLENGE":
  421. # Handle challenge error by refreshing tokens and retrying
  422. if retry_count < cls.max_retries:
  423. # Don't raise here, let the outer exception handler retry
  424. raise DuckDuckGoChallengeError(f"Challenge detected: {error_type}")
  425. raise DuckDuckGoSearchException(error_type)
  426. if "message" in message:
  427. if message["message"]:
  428. yield message["message"]
  429. full_message += message["message"]
  430. reason = "length"
  431. else:
  432. reason = "stop"
  433. # Step 8: Update conversation with response information
  434. # Always update the VQD tokens from the response headers
  435. conversation.vqd = response.headers.get("x-vqd-4", conversation.vqd)
  436. conversation.vqd_hash_1 = response.headers.get("x-vqd-hash-1", conversation.vqd_hash_1)
  437. # Update cookies
  438. conversation.cookies = {
  439. n: c.value
  440. for n, c in session.cookie_jar.filter_cookies(URL(cls.url)).items()
  441. }
  442. # If requested, return the updated conversation
  443. if return_conversation:
  444. conversation.message_history.append({"role": "assistant", "content": full_message})
  445. yield conversation
  446. if reason is not None:
  447. yield FinishReason(reason)
  448. # If we got here, the request was successful
  449. break
  450. except (RateLimitError, ResponseStatusError, DuckDuckGoChallengeError) as e:
  451. if ("429" in str(e) or isinstance(e, DuckDuckGoChallengeError)) and retry_count < cls.max_retries:
  452. retry_count += 1
  453. wait_time = cls.base_delay * (2 ** retry_count) * (1 + random.random())
  454. await asyncio.sleep(wait_time)
  455. # For challenge errors, refresh tokens and cookies
  456. if isinstance(e, DuckDuckGoChallengeError):
  457. # Reset conversation to force new token acquisition
  458. conversation = None
  459. # Clear class cache to force refresh
  460. cls._chat_xfe = ""
  461. else:
  462. raise
  463. except asyncio.TimeoutError as e:
  464. raise TimeoutError(f"Request timed out: {str(e)}")
  465. except Exception as e:
  466. raise