google.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """This is the implementation of the Google WEB engine. Some of this
  3. implementation (mainly the :py:obj:`get_google_info`) is shared by other
  4. engines:
  5. - :ref:`google images engine`
  6. - :ref:`google news engine`
  7. - :ref:`google videos engine`
  8. - :ref:`google scholar engine`
  9. - :ref:`google autocomplete`
  10. """
  11. from typing import TYPE_CHECKING
  12. import re
  13. from urllib.parse import urlencode
  14. from lxml import html
  15. import babel
  16. import babel.core
  17. import babel.languages
  18. from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
  19. from searx.locales import language_tag, region_tag, get_official_locales
  20. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  21. from searx.exceptions import SearxEngineCaptchaException
  22. from searx.enginelib.traits import EngineTraits
  23. from searx.result_types import EngineResults
  24. if TYPE_CHECKING:
  25. import logging
  26. logger: logging.Logger
  27. traits: EngineTraits
  28. # about
  29. about = {
  30. "website": 'https://www.google.com',
  31. "wikidata_id": 'Q9366',
  32. "official_api_documentation": 'https://developers.google.com/custom-search/',
  33. "use_official_api": False,
  34. "require_api_key": False,
  35. "results": 'HTML',
  36. }
  37. # engine dependent config
  38. categories = ['general', 'web']
  39. paging = True
  40. max_page = 50
  41. time_range_support = True
  42. safesearch = True
  43. time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
  44. # Filter results. 0: None, 1: Moderate, 2: Strict
  45. filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
  46. # specific xpath variables
  47. # ------------------------
  48. # Suggestions are links placed in a *card-section*, we extract only the text
  49. # from the links not the links itself.
  50. suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
  51. # UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
  52. # # celebrities like '!google natasha allegri'
  53. # # or '!google chris evans'
  54. UI_ASYNC = 'use_ac:true,_fmt:prog'
  55. """Format of the response from UI's async request."""
  56. def get_google_info(params, eng_traits):
  57. """Composing various (language) properties for the google engines (:ref:`google
  58. API`).
  59. This function is called by the various google engines (:ref:`google web
  60. engine`, :ref:`google images engine`, :ref:`google news engine` and
  61. :ref:`google videos engine`).
  62. :param dict param: Request parameters of the engine. At least
  63. a ``searxng_locale`` key should be in the dictionary.
  64. :param eng_traits: Engine's traits fetched from google preferences
  65. (:py:obj:`searx.enginelib.traits.EngineTraits`)
  66. :rtype: dict
  67. :returns:
  68. Py-Dictionary with the key/value pairs:
  69. language:
  70. The language code that is used by google (e.g. ``lang_en`` or
  71. ``lang_zh-TW``)
  72. country:
  73. The country code that is used by google (e.g. ``US`` or ``TW``)
  74. locale:
  75. A instance of :py:obj:`babel.core.Locale` build from the
  76. ``searxng_locale`` value.
  77. subdomain:
  78. Google subdomain :py:obj:`google_domains` that fits to the country
  79. code.
  80. params:
  81. Py-Dictionary with additional request arguments (can be passed to
  82. :py:func:`urllib.parse.urlencode`).
  83. - ``hl`` parameter: specifies the interface language of user interface.
  84. - ``lr`` parameter: restricts search results to documents written in
  85. a particular language.
  86. - ``cr`` parameter: restricts search results to documents
  87. originating in a particular country.
  88. - ``ie`` parameter: sets the character encoding scheme that should
  89. be used to interpret the query string ('utf8').
  90. - ``oe`` parameter: sets the character encoding scheme that should
  91. be used to decode the XML result ('utf8').
  92. headers:
  93. Py-Dictionary with additional HTTP headers (can be passed to
  94. request's headers)
  95. - ``Accept: '*/*``
  96. """
  97. ret_val = {
  98. 'language': None,
  99. 'country': None,
  100. 'subdomain': None,
  101. 'params': {},
  102. 'headers': {},
  103. 'cookies': {},
  104. 'locale': None,
  105. }
  106. sxng_locale = params.get('searxng_locale', 'all')
  107. try:
  108. locale = babel.Locale.parse(sxng_locale, sep='-')
  109. except babel.core.UnknownLocaleError:
  110. locale = None
  111. eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
  112. lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
  113. country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
  114. # Test zh_hans & zh_hant --> in the topmost links in the result list of list
  115. # TW and HK you should a find wiktionary.org zh_hant link. In the result
  116. # list of zh-CN should not be no hant link instead you should find
  117. # zh.m.wikipedia.org/zh somewhere in the top.
  118. # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
  119. # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
  120. ret_val['language'] = eng_lang
  121. ret_val['country'] = country
  122. ret_val['locale'] = locale
  123. ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
  124. # hl parameter:
  125. # The hl parameter specifies the interface language (host language) of
  126. # your user interface. To improve the performance and the quality of your
  127. # search results, you are strongly encouraged to set this parameter
  128. # explicitly.
  129. # https://developers.google.com/custom-search/docs/xml_results#hlsp
  130. # The Interface Language:
  131. # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
  132. # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
  133. ret_val['params']['hl'] = f'{lang_code}-{country}'
  134. # lr parameter:
  135. # The lr (language restrict) parameter restricts search results to
  136. # documents written in a particular language.
  137. # https://developers.google.com/custom-search/docs/xml_results#lrsp
  138. # Language Collection Values:
  139. # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
  140. #
  141. # To select 'all' languages an empty 'lr' value is used.
  142. #
  143. # Different to other google services, Google Scholar supports to select more
  144. # than one language. The languages are separated by a pipe '|' (logical OR).
  145. # By example: &lr=lang_zh-TW%7Clang_de selects articles written in
  146. # traditional chinese OR german language.
  147. ret_val['params']['lr'] = eng_lang
  148. if sxng_locale == 'all':
  149. ret_val['params']['lr'] = ''
  150. # cr parameter:
  151. # The cr parameter restricts search results to documents originating in a
  152. # particular country.
  153. # https://developers.google.com/custom-search/docs/xml_results#crsp
  154. # specify a region (country) only if a region is given in the selected
  155. # locale --> https://github.com/searxng/searxng/issues/2672
  156. ret_val['params']['cr'] = ''
  157. if len(sxng_locale.split('-')) > 1:
  158. ret_val['params']['cr'] = 'country' + country
  159. # gl parameter: (mandatory by Google News)
  160. # The gl parameter value is a two-letter country code. For WebSearch
  161. # results, the gl parameter boosts search results whose country of origin
  162. # matches the parameter value. See the Country Codes section for a list of
  163. # valid values.
  164. # Specifying a gl parameter value in WebSearch requests should improve the
  165. # relevance of results. This is particularly true for international
  166. # customers and, even more specifically, for customers in English-speaking
  167. # countries other than the United States.
  168. # https://developers.google.com/custom-search/docs/xml_results#glsp
  169. # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
  170. # ret_val['params']['gl'] = country
  171. # ie parameter:
  172. # The ie parameter sets the character encoding scheme that should be used
  173. # to interpret the query string. The default ie value is latin1.
  174. # https://developers.google.com/custom-search/docs/xml_results#iesp
  175. ret_val['params']['ie'] = 'utf8'
  176. # oe parameter:
  177. # The oe parameter sets the character encoding scheme that should be used
  178. # to decode the XML result. The default oe value is latin1.
  179. # https://developers.google.com/custom-search/docs/xml_results#oesp
  180. ret_val['params']['oe'] = 'utf8'
  181. # num parameter:
  182. # The num parameter identifies the number of search results to return.
  183. # The default num value is 10, and the maximum value is 20. If you request
  184. # more than 20 results, only 20 results will be returned.
  185. # https://developers.google.com/custom-search/docs/xml_results#numsp
  186. # HINT: seems to have no effect (tested in google WEB & Images)
  187. # ret_val['params']['num'] = 20
  188. # HTTP headers
  189. ret_val['headers']['Accept'] = '*/*'
  190. # Cookies
  191. # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
  192. # - https://github.com/searxng/searxng/issues/1555
  193. ret_val['cookies']['CONSENT'] = "YES+"
  194. return ret_val
  195. def detect_google_sorry(resp):
  196. if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
  197. raise SearxEngineCaptchaException()
  198. def request(query, params):
  199. """Google search request"""
  200. # pylint: disable=line-too-long
  201. offset = (params['pageno'] - 1) * 10
  202. google_info = get_google_info(params, traits)
  203. # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
  204. query_url = (
  205. 'https://'
  206. + google_info['subdomain']
  207. + '/search'
  208. + "?"
  209. + urlencode(
  210. {
  211. 'q': query,
  212. **google_info['params'],
  213. 'filter': '0',
  214. 'start': offset,
  215. # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
  216. # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
  217. # 'cs' : 1,
  218. # 'sa': 'N',
  219. # 'yv': 3,
  220. # 'prmd': 'vin',
  221. # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
  222. # 'sa': 'N',
  223. # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
  224. # formally known as use_mobile_ui
  225. 'asearch': 'arc',
  226. 'async': UI_ASYNC,
  227. }
  228. )
  229. )
  230. if params['time_range'] in time_range_dict:
  231. query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
  232. if params['safesearch']:
  233. query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
  234. params['url'] = query_url
  235. params['cookies'] = google_info['cookies']
  236. params['headers'].update(google_info['headers'])
  237. return params
  238. # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
  239. # ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
  240. RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
  241. def _parse_data_images(dom):
  242. data_image_map = {}
  243. for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
  244. end_pos = data_image.rfind('=')
  245. if end_pos > 0:
  246. data_image = data_image[: end_pos + 1]
  247. data_image_map[img_id] = data_image
  248. logger.debug('data:image objects --> %s', list(data_image_map.keys()))
  249. return data_image_map
  250. def response(resp) -> EngineResults:
  251. """Get response from google's search request"""
  252. # pylint: disable=too-many-branches, too-many-statements
  253. detect_google_sorry(resp)
  254. results = EngineResults()
  255. # convert the text to dom
  256. dom = html.fromstring(resp.text)
  257. data_image_map = _parse_data_images(dom)
  258. # results --> answer
  259. answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
  260. for item in answer_list:
  261. for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
  262. bubble.drop_tree()
  263. results.add(
  264. results.types.Answer(
  265. answer=extract_text(item),
  266. url=(eval_xpath(item, '../..//a/@href') + [None])[0],
  267. )
  268. )
  269. # parse results
  270. for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
  271. # pylint: disable=too-many-nested-blocks
  272. try:
  273. title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
  274. if title_tag is None:
  275. # this not one of the common google results *section*
  276. logger.debug('ignoring item from the result_xpath list: missing title')
  277. continue
  278. title = extract_text(title_tag)
  279. url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
  280. if url is None:
  281. logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
  282. continue
  283. content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
  284. for item in content_nodes:
  285. for script in item.xpath(".//script"):
  286. script.getparent().remove(script)
  287. content = extract_text(content_nodes)
  288. if not content:
  289. logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
  290. continue
  291. thumbnail = content_nodes[0].xpath('.//img/@src')
  292. if thumbnail:
  293. thumbnail = thumbnail[0]
  294. if thumbnail.startswith('data:image'):
  295. img_id = content_nodes[0].xpath('.//img/@id')
  296. if img_id:
  297. thumbnail = data_image_map.get(img_id[0])
  298. else:
  299. thumbnail = None
  300. results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
  301. except Exception as e: # pylint: disable=broad-except
  302. logger.error(e, exc_info=True)
  303. continue
  304. # parse suggestion
  305. for suggestion in eval_xpath_list(dom, suggestion_xpath):
  306. # append suggestion
  307. results.append({'suggestion': extract_text(suggestion)})
  308. # return results
  309. return results
  310. # get supported languages from their site
  311. skip_countries = [
  312. # official language of google-country not in google-languages
  313. 'AL', # Albanien (sq)
  314. 'AZ', # Aserbaidschan (az)
  315. 'BD', # Bangladesch (bn)
  316. 'BN', # Brunei Darussalam (ms)
  317. 'BT', # Bhutan (dz)
  318. 'ET', # Äthiopien (am)
  319. 'GE', # Georgien (ka, os)
  320. 'GL', # Grönland (kl)
  321. 'KH', # Kambodscha (km)
  322. 'LA', # Laos (lo)
  323. 'LK', # Sri Lanka (si, ta)
  324. 'ME', # Montenegro (sr)
  325. 'MK', # Nordmazedonien (mk, sq)
  326. 'MM', # Myanmar (my)
  327. 'MN', # Mongolei (mn)
  328. 'MV', # Malediven (dv) // dv_MV is unknown by babel
  329. 'MY', # Malaysia (ms)
  330. 'NP', # Nepal (ne)
  331. 'TJ', # Tadschikistan (tg)
  332. 'TM', # Turkmenistan (tk)
  333. 'UZ', # Usbekistan (uz)
  334. ]
  335. def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
  336. """Fetch languages from Google."""
  337. # pylint: disable=import-outside-toplevel, too-many-branches
  338. engine_traits.custom['supported_domains'] = {}
  339. resp = get('https://www.google.com/preferences')
  340. if not resp.ok: # type: ignore
  341. raise RuntimeError("Response from Google's preferences is not OK.")
  342. dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
  343. # supported language codes
  344. lang_map = {'no': 'nb'}
  345. for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
  346. eng_lang = x.get("value")
  347. try:
  348. locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
  349. except babel.UnknownLocaleError:
  350. print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
  351. continue
  352. sxng_lang = language_tag(locale)
  353. conflict = engine_traits.languages.get(sxng_lang)
  354. if conflict:
  355. if conflict != eng_lang:
  356. print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
  357. continue
  358. engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
  359. # alias languages
  360. engine_traits.languages['zh'] = 'lang_zh-CN'
  361. # supported region codes
  362. for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
  363. eng_country = x.get("value")
  364. if eng_country in skip_countries:
  365. continue
  366. if eng_country == 'ZZ':
  367. engine_traits.all_locale = 'ZZ'
  368. continue
  369. sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
  370. if not sxng_locales:
  371. print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
  372. continue
  373. for sxng_locale in sxng_locales:
  374. engine_traits.regions[region_tag(sxng_locale)] = eng_country
  375. # alias regions
  376. engine_traits.regions['zh-CN'] = 'HK'
  377. # supported domains
  378. if add_domains:
  379. resp = get('https://www.google.com/supported_domains')
  380. if not resp.ok: # type: ignore
  381. raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
  382. for domain in resp.text.split(): # type: ignore
  383. domain = domain.strip()
  384. if not domain or domain in [
  385. '.google.com',
  386. ]:
  387. continue
  388. region = domain.split('.')[-1].upper()
  389. engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
  390. if region == 'HK':
  391. # There is no google.cn, we use .com.hk for zh-CN
  392. engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore