google.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """This is the implementation of the Google WEB engine. Some of these
  3. implementations (mainly the :py:obj:`get_google_info`) are shared by other
  4. engines:
  5. - :ref:`google images engine`
  6. - :ref:`google news engine`
  7. - :ref:`google videos engine`
  8. - :ref:`google scholar engine`
  9. - :ref:`google autocomplete`
  10. """
  11. from typing import TYPE_CHECKING
  12. import re
  13. from urllib.parse import urlencode
  14. from lxml import html
  15. import babel
  16. import babel.core
  17. import babel.languages
  18. from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
  19. from searx.locales import language_tag, region_tag, get_official_locales
  20. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  21. from searx.exceptions import SearxEngineCaptchaException
  22. from searx.enginelib.traits import EngineTraits
  23. if TYPE_CHECKING:
  24. import logging
  25. logger: logging.Logger
  26. traits: EngineTraits
# about
# Static description of the engine: website, wikidata id, API documentation
# link and the format of the results that are parsed ('HTML').
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
# request() derives the 'start' offset from params['pageno'] (10 per page)
paging = True
# presumably the highest page number that is requested -- TODO confirm
max_page = 50
time_range_support = True
safesearch = True

# Maps the time range name to the letter request() appends to 'qdr:' in the
# 'tbs' URL argument.
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
# request() sends the mapped value in the 'safe' URL argument; a safesearch
# value of 0 is falsy there, so no 'safe' argument is added in that case.
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}

# specific xpath variables
# ------------------------

# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
#                                    # celebrities like '!google natasha allegri'
#                                    # or '!google chris evans'
# request() sends this value in the 'async' URL argument.
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
  55. def get_google_info(params, eng_traits):
  56. """Composing various (language) properties for the google engines (:ref:`google
  57. API`).
  58. This function is called by the various google engines (:ref:`google web
  59. engine`, :ref:`google images engine`, :ref:`google news engine` and
  60. :ref:`google videos engine`).
  61. :param dict param: Request parameters of the engine. At least
  62. a ``searxng_locale`` key should be in the dictionary.
  63. :param eng_traits: Engine's traits fetched from google preferences
  64. (:py:obj:`searx.enginelib.traits.EngineTraits`)
  65. :rtype: dict
  66. :returns:
  67. Py-Dictionary with the key/value pairs:
  68. language:
  69. The language code that is used by google (e.g. ``lang_en`` or
  70. ``lang_zh-TW``)
  71. country:
  72. The country code that is used by google (e.g. ``US`` or ``TW``)
  73. locale:
  74. A instance of :py:obj:`babel.core.Locale` build from the
  75. ``searxng_locale`` value.
  76. subdomain:
  77. Google subdomain :py:obj:`google_domains` that fits to the country
  78. code.
  79. params:
  80. Py-Dictionary with additional request arguments (can be passed to
  81. :py:func:`urllib.parse.urlencode`).
  82. - ``hl`` parameter: specifies the interface language of user interface.
  83. - ``lr`` parameter: restricts search results to documents written in
  84. a particular language.
  85. - ``cr`` parameter: restricts search results to documents
  86. originating in a particular country.
  87. - ``ie`` parameter: sets the character encoding scheme that should
  88. be used to interpret the query string ('utf8').
  89. - ``oe`` parameter: sets the character encoding scheme that should
  90. be used to decode the XML result ('utf8').
  91. headers:
  92. Py-Dictionary with additional HTTP headers (can be passed to
  93. request's headers)
  94. - ``Accept: '*/*``
  95. """
  96. ret_val = {
  97. 'language': None,
  98. 'country': None,
  99. 'subdomain': None,
  100. 'params': {},
  101. 'headers': {},
  102. 'cookies': {},
  103. 'locale': None,
  104. }
  105. sxng_locale = params.get('searxng_locale', 'all')
  106. try:
  107. locale = babel.Locale.parse(sxng_locale, sep='-')
  108. except babel.core.UnknownLocaleError:
  109. locale = None
  110. eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
  111. lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
  112. country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
  113. # Test zh_hans & zh_hant --> in the topmost links in the result list of list
  114. # TW and HK you should a find wiktionary.org zh_hant link. In the result
  115. # list of zh-CN should not be no hant link instead you should find
  116. # zh.m.wikipedia.org/zh somewhere in the top.
  117. # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
  118. # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
  119. ret_val['language'] = eng_lang
  120. ret_val['country'] = country
  121. ret_val['locale'] = locale
  122. ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
  123. # hl parameter:
  124. # The hl parameter specifies the interface language (host language) of
  125. # your user interface. To improve the performance and the quality of your
  126. # search results, you are strongly encouraged to set this parameter
  127. # explicitly.
  128. # https://developers.google.com/custom-search/docs/xml_results#hlsp
  129. # The Interface Language:
  130. # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
  131. # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
  132. ret_val['params']['hl'] = f'{lang_code}-{country}'
  133. # lr parameter:
  134. # The lr (language restrict) parameter restricts search results to
  135. # documents written in a particular language.
  136. # https://developers.google.com/custom-search/docs/xml_results#lrsp
  137. # Language Collection Values:
  138. # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
  139. #
  140. # To select 'all' languages an empty 'lr' value is used.
  141. #
  142. # Different to other google services, Google Scholar supports to select more
  143. # than one language. The languages are separated by a pipe '|' (logical OR).
  144. # By example: &lr=lang_zh-TW%7Clang_de selects articles written in
  145. # traditional chinese OR german language.
  146. ret_val['params']['lr'] = eng_lang
  147. if sxng_locale == 'all':
  148. ret_val['params']['lr'] = ''
  149. # cr parameter:
  150. # The cr parameter restricts search results to documents originating in a
  151. # particular country.
  152. # https://developers.google.com/custom-search/docs/xml_results#crsp
  153. # specify a region (country) only if a region is given in the selected
  154. # locale --> https://github.com/searxng/searxng/issues/2672
  155. ret_val['params']['cr'] = ''
  156. if len(sxng_locale.split('-')) > 1:
  157. ret_val['params']['cr'] = 'country' + country
  158. # gl parameter: (mandatory by Google News)
  159. # The gl parameter value is a two-letter country code. For WebSearch
  160. # results, the gl parameter boosts search results whose country of origin
  161. # matches the parameter value. See the Country Codes section for a list of
  162. # valid values.
  163. # Specifying a gl parameter value in WebSearch requests should improve the
  164. # relevance of results. This is particularly true for international
  165. # customers and, even more specifically, for customers in English-speaking
  166. # countries other than the United States.
  167. # https://developers.google.com/custom-search/docs/xml_results#glsp
  168. # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
  169. # ret_val['params']['gl'] = country
  170. # ie parameter:
  171. # The ie parameter sets the character encoding scheme that should be used
  172. # to interpret the query string. The default ie value is latin1.
  173. # https://developers.google.com/custom-search/docs/xml_results#iesp
  174. ret_val['params']['ie'] = 'utf8'
  175. # oe parameter:
  176. # The oe parameter sets the character encoding scheme that should be used
  177. # to decode the XML result. The default oe value is latin1.
  178. # https://developers.google.com/custom-search/docs/xml_results#oesp
  179. ret_val['params']['oe'] = 'utf8'
  180. # num parameter:
  181. # The num parameter identifies the number of search results to return.
  182. # The default num value is 10, and the maximum value is 20. If you request
  183. # more than 20 results, only 20 results will be returned.
  184. # https://developers.google.com/custom-search/docs/xml_results#numsp
  185. # HINT: seems to have no effect (tested in google WEB & Images)
  186. # ret_val['params']['num'] = 20
  187. # HTTP headers
  188. ret_val['headers']['Accept'] = '*/*'
  189. # Cookies
  190. # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
  191. # - https://github.com/searxng/searxng/issues/1555
  192. ret_val['cookies']['CONSENT'] = "YES+"
  193. return ret_val
  194. def detect_google_sorry(resp):
  195. if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
  196. raise SearxEngineCaptchaException()
  197. def request(query, params):
  198. """Google search request"""
  199. # pylint: disable=line-too-long
  200. offset = (params['pageno'] - 1) * 10
  201. google_info = get_google_info(params, traits)
  202. # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
  203. query_url = (
  204. 'https://'
  205. + google_info['subdomain']
  206. + '/search'
  207. + "?"
  208. + urlencode(
  209. {
  210. 'q': query,
  211. **google_info['params'],
  212. 'filter': '0',
  213. 'start': offset,
  214. # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
  215. # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
  216. # 'cs' : 1,
  217. # 'sa': 'N',
  218. # 'yv': 3,
  219. # 'prmd': 'vin',
  220. # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
  221. # 'sa': 'N',
  222. # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
  223. # formally known as use_mobile_ui
  224. 'asearch': 'arc',
  225. 'async': UI_ASYNC,
  226. }
  227. )
  228. )
  229. if params['time_range'] in time_range_dict:
  230. query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
  231. if params['safesearch']:
  232. query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
  233. params['url'] = query_url
  234. params['cookies'] = google_info['cookies']
  235. params['headers'].update(google_info['headers'])
  236. return params
# Sample of a chunk in the response text that carries an inline image:
#
#   =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
#   ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
#
# Group 1 captures the quoted ``dimg_*`` id.  Group 2 captures the
# ``data:image`` URL up to (not including) the next ';' behind the payload; as
# the sample shows, this can include trailing characters (``26``) behind the
# base64 padding.
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
  240. def _parse_data_images(dom):
  241. data_image_map = {}
  242. for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
  243. end_pos = data_image.rfind('=')
  244. if end_pos > 0:
  245. data_image = data_image[: end_pos + 1]
  246. data_image_map[img_id] = data_image
  247. logger.debug('data:image objects --> %s', list(data_image_map.keys()))
  248. return data_image_map
  249. def response(resp):
  250. """Get response from google's search request"""
  251. # pylint: disable=too-many-branches, too-many-statements
  252. detect_google_sorry(resp)
  253. results = []
  254. # convert the text to dom
  255. dom = html.fromstring(resp.text)
  256. data_image_map = _parse_data_images(dom)
  257. # results --> answer
  258. answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
  259. for item in answer_list:
  260. for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
  261. bubble.drop_tree()
  262. results.append(
  263. {
  264. 'answer': extract_text(item),
  265. 'url': (eval_xpath(item, '../..//a/@href') + [None])[0],
  266. }
  267. )
  268. # parse results
  269. for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
  270. # pylint: disable=too-many-nested-blocks
  271. try:
  272. title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
  273. if title_tag is None:
  274. # this not one of the common google results *section*
  275. logger.debug('ignoring item from the result_xpath list: missing title')
  276. continue
  277. title = extract_text(title_tag)
  278. url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
  279. if url is None:
  280. logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
  281. continue
  282. content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
  283. for item in content_nodes:
  284. for script in item.xpath(".//script"):
  285. script.getparent().remove(script)
  286. content = extract_text(content_nodes)
  287. if not content:
  288. logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
  289. continue
  290. thumbnail = content_nodes[0].xpath('.//img/@src')
  291. if thumbnail:
  292. thumbnail = thumbnail[0]
  293. if thumbnail.startswith('data:image'):
  294. img_id = content_nodes[0].xpath('.//img/@id')
  295. if img_id:
  296. thumbnail = data_image_map.get(img_id[0])
  297. else:
  298. thumbnail = None
  299. results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
  300. except Exception as e: # pylint: disable=broad-except
  301. logger.error(e, exc_info=True)
  302. continue
  303. # parse suggestion
  304. for suggestion in eval_xpath_list(dom, suggestion_xpath):
  305. # append suggestion
  306. results.append({'suggestion': extract_text(suggestion)})
  307. # return results
  308. return results
# get supported languages from their site

# Country codes that fetch_traits() ignores when it reads the options of the
# 'gl' select box.
skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv) // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]
  334. def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
  335. """Fetch languages from Google."""
  336. # pylint: disable=import-outside-toplevel, too-many-branches
  337. engine_traits.custom['supported_domains'] = {}
  338. resp = get('https://www.google.com/preferences')
  339. if not resp.ok: # type: ignore
  340. raise RuntimeError("Response from Google's preferences is not OK.")
  341. dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
  342. # supported language codes
  343. lang_map = {'no': 'nb'}
  344. for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
  345. eng_lang = x.get("value")
  346. try:
  347. locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
  348. except babel.UnknownLocaleError:
  349. print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
  350. continue
  351. sxng_lang = language_tag(locale)
  352. conflict = engine_traits.languages.get(sxng_lang)
  353. if conflict:
  354. if conflict != eng_lang:
  355. print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
  356. continue
  357. engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
  358. # alias languages
  359. engine_traits.languages['zh'] = 'lang_zh-CN'
  360. # supported region codes
  361. for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
  362. eng_country = x.get("value")
  363. if eng_country in skip_countries:
  364. continue
  365. if eng_country == 'ZZ':
  366. engine_traits.all_locale = 'ZZ'
  367. continue
  368. sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
  369. if not sxng_locales:
  370. print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
  371. continue
  372. for sxng_locale in sxng_locales:
  373. engine_traits.regions[region_tag(sxng_locale)] = eng_country
  374. # alias regions
  375. engine_traits.regions['zh-CN'] = 'HK'
  376. # supported domains
  377. if add_domains:
  378. resp = get('https://www.google.com/supported_domains')
  379. if not resp.ok: # type: ignore
  380. raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
  381. for domain in resp.text.split(): # type: ignore
  382. domain = domain.strip()
  383. if not domain or domain in [
  384. '.google.com',
  385. ]:
  386. continue
  387. region = domain.split('.')[-1].upper()
  388. engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
  389. if region == 'HK':
  390. # There is no google.cn, we use .com.hk for zh-CN
  391. engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore