bing.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Bing (Web)
"""

import re
from urllib.parse import urlencode

from lxml import html

from searx import logger
from searx.utils import eval_xpath, extract_text, match_language

logger = logger.getChild('bing engine')

# about
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general']
paging = True
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}

# search-url
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'


def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1
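
# Bing's "first" parameter is a 1-based result index, not a page number:
# pageno=1 -> first=1, pageno=2 -> first=11, pageno=3 -> first=21.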


# do search-request
def request(query, params):
    offset = _get_offset_from_pageno(params.get('pageno', 1))

    if params['language'] == 'all':
        lang = 'EN'
    else:
        lang = match_language(params['language'], supported_languages, language_aliases)

    query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)

    search_path = search_string.format(
        query=urlencode({'q': query}),
        offset=offset)

    params['url'] = base_url + search_path
    params['headers']['User-Agent'] = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                                       '(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36')

    return params
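
# Illustrative sketch, not executed by the engine: for the query "searx" with
# language "en-US" on page 1, request() builds roughly
#   https://www.bing.com/search?q=language%3AEN+searx&first=1
# where the "language:EN" prefix is the filter prepended to the query above.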


# get response from search-request
def response(resp):
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
        link = eval_xpath(result, './/h3/a')[0]
        url = link.attrib.get('href')
        pretty_url = extract_text(eval_xpath(result, './/cite'))
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url,
                        'pretty_url': pretty_url,
                        'title': title,
                        'content': content})

    # parse results again if nothing was found yet
    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
        link = eval_xpath(result, './/h2/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Strip the leading "from-to" range shown on paginated requests,
            # keeping only the total count that follows it.
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]

        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:
        logger.debug('result error :\n%s', e)

    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
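
# Note: the returned list mixes ordinary result dicts with a single
# {'number_of_results': ...} entry; searx reports that value as the estimated
# total instead of rendering it as a search hit.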


# get supported languages from their site
def _fetch_supported_languages(resp):
    lang_tags = set()

    setmkt = re.compile('setmkt=([^&]*)')
    dom = html.fromstring(resp.text)
    lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]")

    for a in lang_links:
        href = eval_xpath(a, './@href')[0]
        match = setmkt.search(href)
        l_tag = match.groups()[0]
        _lang, _nation = l_tag.split('-', 1)
        l_tag = _lang.lower() + '-' + _nation.upper()
        lang_tags.add(l_tag)

    return list(lang_tags)
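
# Illustrative only: the tags are normalized to "language-REGION" form, e.g.
# ['en-US', 'de-DE', 'fr-FR']; the actual set depends on the markets listed on
# the Bing settings page at fetch time.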