# bing.py — Bing (Web) search engine module
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Bing (Web)
  4. """
  5. import re
  6. from urllib.parse import urlencode
  7. from lxml import html
  8. from searx import logger
  9. from searx.utils import eval_xpath, extract_text, match_language
  10. logger = logger.getChild('bing engine')
  11. # about
  12. about = {
  13. "website": 'https://www.bing.com',
  14. "wikidata_id": 'Q182496',
  15. "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. }
  20. # engine dependent config
  21. categories = ['general']
  22. paging = True
  23. supported_languages_url = 'https://www.bing.com/account/general'
  24. language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
  25. # search-url
  26. base_url = 'https://www.bing.com/'
  27. search_string = 'search?{query}&first={offset}'
  28. def _get_offset_from_pageno(pageno):
  29. return (pageno - 1) * 10 + 1
  30. # do search-request
  31. def request(query, params):
  32. offset = _get_offset_from_pageno(params.get('pageno', 0))
  33. if params['language'] == 'all':
  34. lang = 'EN'
  35. else:
  36. lang = match_language(params['language'], supported_languages, language_aliases)
  37. query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
  38. search_path = search_string.format(
  39. query=urlencode({'q': query}),
  40. offset=offset)
  41. params['url'] = base_url + search_path
  42. params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
  43. return params
  44. # get response from search-request
  45. def response(resp):
  46. results = []
  47. result_len = 0
  48. dom = html.fromstring(resp.text)
  49. # parse results
  50. for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
  51. link = eval_xpath(result, './/h3/a')[0]
  52. url = link.attrib.get('href')
  53. pretty_url = extract_text(eval_xpath(result, './/cite'))
  54. title = extract_text(link)
  55. content = extract_text(eval_xpath(result, './/p'))
  56. # append result
  57. results.append({'url': url,
  58. 'pretty_url': pretty_url,
  59. 'title': title,
  60. 'content': content})
  61. # parse results again if nothing is found yet
  62. for result in eval_xpath(dom, '//li[@class="b_algo"]'):
  63. link = eval_xpath(result, './/h2/a')[0]
  64. url = link.attrib.get('href')
  65. pretty_url = extract_text(eval_xpath(result, './/cite'))
  66. title = extract_text(link)
  67. content = extract_text(eval_xpath(result, './/p'))
  68. # append result
  69. results.append({'url': url,
  70. 'pretty_url': pretty_url,
  71. 'title': title,
  72. 'content': content})
  73. try:
  74. result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
  75. if "-" in result_len_container:
  76. # Remove the part "from-to" for paginated request ...
  77. result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]
  78. result_len_container = re.sub('[^0-9]', '', result_len_container)
  79. if len(result_len_container) > 0:
  80. result_len = int(result_len_container)
  81. except Exception as e:
  82. logger.debug('result error :\n%s', e)
  83. if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
  84. return []
  85. results.append({'number_of_results': result_len})
  86. return results
  87. # get supported languages from their site
  88. def _fetch_supported_languages(resp):
  89. lang_tags = set()
  90. setmkt = re.compile('setmkt=([^&]*)')
  91. dom = html.fromstring(resp.text)
  92. lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]")
  93. for a in lang_links:
  94. href = eval_xpath(a, './@href')[0]
  95. match = setmkt.search(href)
  96. l_tag = match.groups()[0]
  97. _lang, _nation = l_tag.split('-', 1)
  98. l_tag = _lang.lower() + '-' + _nation.upper()
  99. lang_tags.add(l_tag)
  100. return list(lang_tags)