# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google WEB engine.  Some of these
implementations (mainly :py:obj:`get_google_info`) are shared by other
engines:

- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`

"""

from __future__ import annotations

from typing import TYPE_CHECKING

import re
import random
import string
import time
from urllib.parse import urlencode

from lxml import html
import babel
import babel.core
import babel.languages

from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_official_locales
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
from searx.result_types import EngineResults

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
"""`Google: max 50 pages`_

.. _Google: max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}

# specific xpath variables
# ------------------------

# Suggestions are links placed in a *card-section*; we extract only the text
# from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

_arcid_range = string.ascii_letters + string.digits + "_-"
_arcid_random: tuple[str, int] | None = None


def ui_async(start: int) -> str:
    """Format of the response from UI's async request.

    - ``arc_id:<...>,use_ac:true,_fmt:prog``

    The arc_id is randomly generated every hour.
    """
    global _arcid_random  # pylint: disable=global-statement

    use_ac = "use_ac:true"
    # _fmt:html returns an HTTP 500 when users search for celebrities like
    # '!google natasha allegri' or '!google chris evans'
    _fmt = "_fmt:prog"

    # create a new random arc_id every hour
    if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
        _arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
    arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"

    return ",".join([arc_id, use_ac, _fmt])
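

# Illustrative sketch (not used by the engine): for the second result page
# (start=10) ui_async() returns a string roughly like
#
#   "arc_id:srp_<23 random chars>_110,use_ac:true,_fmt:prog"
#
# where the 23 characters after "srp_" are the hourly regenerated random id
# drawn from _arcid_range above.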


def get_google_info(params, eng_traits):
    """Composing various (language) properties for the google engines (:ref:`google
    API`).

    This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).

    :param dict params: Request parameters of the engine.  At least
        a ``searxng_locale`` key should be in the dictionary.

    :param eng_traits: Engine's traits fetched from google preferences
        (:py:obj:`searx.enginelib.traits.EngineTraits`)

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            The language code that is used by google (e.g. ``lang_en`` or
            ``lang_zh-TW``)

        country:
            The country code that is used by google (e.g. ``US`` or ``TW``)

        locale:
            An instance of :py:obj:`babel.core.Locale` built from the
            ``searxng_locale`` value.

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits to the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

            - ``hl`` parameter: specifies the interface language of the user
              interface.
            - ``lr`` parameter: restricts search results to documents written in
              a particular language.
            - ``cr`` parameter: restricts search results to documents
              originating in a particular country.
            - ``ie`` parameter: sets the character encoding scheme that should
              be used to interpret the query string ('utf8').
            - ``oe`` parameter: sets the character encoding scheme that should
              be used to decode the XML result ('utf8').

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)

            - ``Accept: '*/*'``

    """

    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
        'cookies': {},
        'locale': None,
    }

    sxng_locale = params.get('searxng_locale', 'all')
    try:
        locale = babel.Locale.parse(sxng_locale, sep='-')
    except babel.core.UnknownLocaleError:
        locale = None

    eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
    lang_code = eng_lang.split('_')[-1]  # lang_zh-TW --> zh-TW / lang_en --> en
    country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)

    # Test zh_hans & zh_hant --> in the topmost links of the result lists for
    # TW and HK you should find a wiktionary.org zh_hant link.  In the result
    # list for zh-CN there should be no zh_hant link; instead you should find
    # zh.m.wikipedia.org/zh somewhere near the top.

    # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
    # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5

    ret_val['language'] = eng_lang
    ret_val['country'] = country
    ret_val['locale'] = locale
    ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')

    # hl parameter:
    #   The hl parameter specifies the interface language (host language) of
    #   your user interface. To improve the performance and the quality of your
    #   search results, you are strongly encouraged to set this parameter
    #   explicitly.
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp
    # The Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
    # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
    ret_val['params']['hl'] = f'{lang_code}-{country}'

    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    # Language Collection Values:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
    #
    # To select 'all' languages an empty 'lr' value is used.
    #
    # Unlike other Google services, Google Scholar supports selecting more than
    # one language. The languages are separated by a pipe '|' (logical OR).
    # For example: &lr=lang_zh-TW%7Clang_de selects articles written in
    # traditional Chinese OR German.
    ret_val['params']['lr'] = eng_lang
    if sxng_locale == 'all':
        ret_val['params']['lr'] = ''

    # cr parameter:
    #   The cr parameter restricts search results to documents originating in a
    #   particular country.
    #   https://developers.google.com/custom-search/docs/xml_results#crsp
    #
    # specify a region (country) only if a region is given in the selected
    # locale --> https://github.com/searxng/searxng/issues/2672
    ret_val['params']['cr'] = ''
    if len(sxng_locale.split('-')) > 1:
        ret_val['params']['cr'] = 'country' + country

    # gl parameter: (mandatory by Google News)
    #   The gl parameter value is a two-letter country code. For WebSearch
    #   results, the gl parameter boosts search results whose country of origin
    #   matches the parameter value. See the Country Codes section for a list of
    #   valid values.
    #   Specifying a gl parameter value in WebSearch requests should improve the
    #   relevance of results. This is particularly true for international
    #   customers and, even more specifically, for customers in English-speaking
    #   countries other than the United States.
    #   https://developers.google.com/custom-search/docs/xml_results#glsp
    # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
    # ret_val['params']['gl'] = country

    # ie parameter:
    #   The ie parameter sets the character encoding scheme that should be used
    #   to interpret the query string. The default ie value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#iesp
    ret_val['params']['ie'] = 'utf8'

    # oe parameter:
    #   The oe parameter sets the character encoding scheme that should be used
    #   to decode the XML result. The default oe value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#oesp
    ret_val['params']['oe'] = 'utf8'

    # num parameter:
    #   The num parameter identifies the number of search results to return.
    #   The default num value is 10, and the maximum value is 20. If you request
    #   more than 20 results, only 20 results will be returned.
    #   https://developers.google.com/custom-search/docs/xml_results#numsp
    # HINT: seems to have no effect (tested in google WEB & Images)
    # ret_val['params']['num'] = 20

    # HTTP headers
    ret_val['headers']['Accept'] = '*/*'

    # Cookies
    # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
    # - https://github.com/searxng/searxng/issues/1555
    ret_val['cookies']['CONSENT'] = "YES+"

    return ret_val
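

# Illustrative sketch (not part of the engine): for a request with
# searxng_locale='de-DE' and typical traits, get_google_info() is expected to
# return roughly the following (subdomain taken from the supported_domains
# traits):
#
#   {
#       'language': 'lang_de',
#       'country': 'DE',
#       'locale': Locale('de', territory='DE'),
#       'subdomain': 'www.google.de',
#       'params': {'hl': 'de-DE', 'lr': 'lang_de', 'cr': 'countryDE', 'ie': 'utf8', 'oe': 'utf8'},
#       'headers': {'Accept': '*/*'},
#       'cookies': {'CONSENT': 'YES+'},
#   }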


def detect_google_sorry(resp):
    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()


def request(query, params):
    """Google search request"""
    # pylint: disable=line-too-long
    start = (params['pageno'] - 1) * 10
    str_async = ui_async(start)
    google_info = get_google_info(params, traits)
    logger.debug("ARC_ID: %s", str_async)

    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = (
        'https://'
        + google_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                **google_info['params'],
                'filter': '0',
                'start': start,
                # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
                # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
                # 'cs' : 1,
                # 'sa': 'N',
                # 'yv': 3,
                # 'prmd': 'vin',
                # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
                # 'sa': 'N',
                # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
                # formerly known as use_mobile_ui
                'asearch': 'arc',
                'async': str_async,
            }
        )
    )

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
    params['url'] = query_url

    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
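

# A rough, illustrative sketch of the URL request() builds for a German locale
# (the exact value depends on the locale traits and on the hourly arc_id; the
# urlencoded parameter order follows the dict above):
#
#   https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&cr=countryDE
#       &ie=utf8&oe=utf8&filter=0&start=0&asearch=arc
#       &async=arc_id%3Asrp_<23 random chars>_100%2Cuse_ac%3Atrue%2C_fmt%3Aprog
#       &tbs=qdr%3Ad&safe=medium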


# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$')


def parse_data_images(text: str):
    data_image_map = {}

    for img_id, data_image in RE_DATA_IMAGE.findall(text):
        end_pos = data_image.rfind('=')
        if end_pos > 0:
            data_image = data_image[: end_pos + 1]
        data_image_map[img_id] = data_image
    last = RE_DATA_IMAGE_end.search(text)
    if last:
        data_image_map[last.group(1)] = last.group(2)
    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
    return data_image_map
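

# Illustrative sketch of the mapping parse_data_images() returns, based on the
# sample fragment in the comment above RE_DATA_IMAGE (values are inline base64
# thumbnails, truncated at the last '='):
#
#   {'dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137': 'data:image/jpeg;base64,/9j/4AAQSkZJRgABA...=='}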


def response(resp) -> EngineResults:
    """Get response from google's search request"""
    # pylint: disable=too-many-branches, too-many-statements
    detect_google_sorry(resp)
    data_image_map = parse_data_images(resp.text)

    results = EngineResults()

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    for item in answer_list:
        for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
            bubble.drop_tree()
        results.add(
            results.types.Answer(
                answer=extract_text(item),
                url=(eval_xpath(item, '../..//a/@href') + [None])[0],
            )
        )

    # parse results
    for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
        # pylint: disable=too-many-nested-blocks

        try:
            title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
            if title_tag is None:
                # this is not one of the common google result *sections*
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)

            url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
            if url is None:
                logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
                continue

            content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
            for item in content_nodes:
                for script in item.xpath(".//script"):
                    script.getparent().remove(script)

            content = extract_text(content_nodes)

            if not content:
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue

            thumbnail = content_nodes[0].xpath('.//img/@src')
            if thumbnail:
                thumbnail = thumbnail[0]
                if thumbnail.startswith('data:image'):
                    img_id = content_nodes[0].xpath('.//img/@id')
                    if img_id:
                        thumbnail = data_image_map.get(img_id[0])
            else:
                thumbnail = None

            results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results


# get supported languages from their site

skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv)  // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]


def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
    """Fetch languages from Google."""
    # pylint: disable=import-outside-toplevel, too-many-branches

    engine_traits.custom['supported_domains'] = {}

    resp = get('https://www.google.com/preferences')
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Google's preferences is not OK.")

    dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))

    # supported language codes

    lang_map = {'no': 'nb'}
    for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
        eng_lang = x.get("value")
        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
        except babel.UnknownLocaleError:
            print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
            continue
        sxng_lang = language_tag(locale)

        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = 'lang_' + eng_lang

    # alias languages
    engine_traits.languages['zh'] = 'lang_zh-CN'

    # supported region codes

    for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
        eng_country = x.get("value")

        if eng_country in skip_countries:
            continue
        if eng_country == 'ZZ':
            engine_traits.all_locale = 'ZZ'
            continue

        sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)

        if not sxng_locales:
            print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
            continue

        for sxng_locale in sxng_locales:
            engine_traits.regions[region_tag(sxng_locale)] = eng_country

    # alias regions
    engine_traits.regions['zh-CN'] = 'HK'

    # supported domains

    if add_domains:
        resp = get('https://www.google.com/supported_domains')
        if not resp.ok:  # type: ignore
            raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")

        for domain in resp.text.split():  # type: ignore
            domain = domain.strip()
            if not domain or domain in [
                '.google.com',
            ]:
                continue
            region = domain.split('.')[-1].upper()
            engine_traits.custom['supported_domains'][region] = 'www' + domain  # type: ignore
            if region == 'HK':
                # There is no google.cn, we use .com.hk for zh-CN
                engine_traits.custom['supported_domains']['CN'] = 'www' + domain  # type: ignore
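

# A rough, illustrative sketch of the traits fetch_traits() collects (actual
# values depend on the live Google preferences and supported_domains pages and
# may differ over time):
#
#   engine_traits.languages  -->  {'de': 'lang_de', 'en': 'lang_en', 'zh': 'lang_zh-CN', ...}
#   engine_traits.regions    -->  {'de-DE': 'DE', 'en-US': 'US', 'zh-CN': 'HK', ...}
#   engine_traits.custom['supported_domains']
#                            -->  {'DE': 'www.google.de', 'CN': 'www.google.com.hk', ...}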