online.py

# SPDX-License-Identifier: AGPL-3.0-or-later
from urllib.parse import urlparse
from time import time
import threading

import requests.exceptions

import searx.poolrequests as poolrequests
from searx.engines import settings
from searx import logger
from searx.utils import gen_useragent
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
                              SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error
from searx.search.processors.abstract import EngineProcessor

logger = logger.getChild('search.processor.online')

# module-level lock shared by all search threads when they update the
# per-engine stats and suspend counters below
stats_lock = threading.RLock()


def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True,
        'auth': None
    }
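
# A minimal sketch of the engine-side request() hook that consumes these
# params (it lives in the engine modules, not here; the URL and header
# values below are illustrative only). Every engine is expected to fill in
# at least params['url']:
#
#     from urllib.parse import quote
#
#     def request(query, params):
#         params['url'] = 'https://example.org/search?q=' + quote(query)
#         params['headers']['Accept-Language'] = 'en'
#         return params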


class OnlineProcessor(EngineProcessor):

    engine_type = 'online'

    def get_params(self, search_query, engine_category):
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        # skip suspended engines
        if self.engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', self.engine_name)
            return None

        # add default params
        params.update(default_request_params())

        # add a user agent
        params['headers']['User-Agent'] = gen_useragent()

        return params
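
    # get_params() thus returns the defaults from default_request_params()
    # merged with whatever the base EngineProcessor contributed (page number,
    # language, time range, ...), plus a generated User-Agent header; the
    # engine's request() hook is expected to fill in the rest, starting with
    # params['url'].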

    def _send_http_request(self, params):
        # create a dictionary that contains all the information
        # about the request
        request_args = dict(
            headers=params['headers'],
            cookies=params['cookies'],
            verify=params['verify'],
            auth=params['auth']
        )

        # setting engine-based proxies
        if hasattr(self.engine, 'proxies'):
            request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # follow_redirects
        if 'follow_redirects' in params:
            # httpx has renamed this parameter to 'follow_redirects'
            request_args['follow_redirects'] = params['follow_redirects']

        # soft_max_redirects
        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

        # raise_for_httperror
        request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = poolrequests.get
        else:
            req = poolrequests.post
            request_args['data'] = params['data']

        # send the request
        response = req(params['url'], **request_args)

        # check the soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect: record an error,
            # but the engine might still return valid results
            status_code = str(response.status_code or '')
            reason = response.reason or ''
            hostname = str(urlparse(response.url or '').netloc)
            record_error(self.engine_name,
                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                         (status_code, reason, hostname))

        return response
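
    # Hypothetical illustration of the two redirect limits above: with
    # params = {'max_redirects': 5, 'soft_max_redirects': 2}, the HTTP client
    # gives up after 5 redirects, while 3 or 4 redirects merely record an
    # error and the response is still handed to the engine.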

    def _search_basic(self, query, params):
        # update request parameters dependent on
        # search-engine (contained in engines folder)
        self.engine.request(query, params)

        # ignore requests with a missing or empty URL
        if not params['url']:
            return None

        # send request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)
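
    # A minimal sketch of the matching engine-side response() hook (again in
    # the engine modules, not here; parse_items is a hypothetical parser). It
    # turns the raw HTTP response into the list of result dicts returned by
    # _search_basic():
    #
    #     def response(resp):
    #         results = []
    #         for item in parse_items(resp.text):
    #             results.append({'url': item.url,
    #                             'title': item.title,
    #                             'content': item.snippet})
    #         return results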

    def search(self, query, params, result_container, start_time, timeout_limit):
        # set the timeout for all HTTP requests
        poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)

        # reset the total HTTP time
        poolrequests.reset_time_for_thread()

        # enable HTTP only if explicitly enabled
        poolrequests.set_enable_http_protocol(self.engine.enable_http)

        # suppose everything will be all right
        requests_exception = False
        suspended_time = None

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)

            # check if the engine accepted the request
            if search_results is not None:
                # yes, so add results
                result_container.extend(self.engine_name, search_results)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = poolrequests.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time, page_load_time)
                with stats_lock:
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    self.engine.stats['page_load_time'] += page_load_time
                    self.engine.stats['page_load_count'] += 1
        except Exception as e:
            record_exception(self.engine_name, e)

            # timing
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)

            # record the error
            with stats_lock:
                self.engine.stats['errors'] += 1

            if isinstance(e, requests.exceptions.Timeout):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
                # requests timeout (connect or read)
                logger.error("engine {0} : HTTP requests timeout "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit,
                                     e.__class__.__name__))
                requests_exception = True
            elif isinstance(e, requests.exceptions.RequestException):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
                # other requests exception
                logger.exception("engine {0} : requests exception "
                                 "(search duration : {1} s, timeout: {2} s) : {3}"
                                 .format(self.engine_name, engine_time, timeout_limit, e))
                requests_exception = True
            elif isinstance(e, SearxEngineCaptchaException):
                result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
                logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineTooManyRequestsException):
                result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
                logger.exception('engine {0} : Too many requests'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineAccessDeniedException):
                result_container.add_unresponsive_engine(self.engine_name, 'blocked')
                logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            else:
                result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
                # other errors
                logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
        else:
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or suspended_time is defined
        with stats_lock:
            if requests_exception or suspended_time:
                # update continuous_errors / suspend_end_time
                self.engine.continuous_errors += 1
                if suspended_time is None:
                    suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                         self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
                self.engine.suspend_end_time = time() + suspended_time
            else:
                # reset the suspend variables
                self.engine.continuous_errors = 0
                self.engine.suspend_end_time = 0
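
    # The suspension backoff, worked through (assuming the stock settings.yml
    # values ban_time_on_fail = 5 s and max_ban_time_on_fail = 120 s): the
    # third consecutive error suspends the engine for min(120, 3 * 5) = 15 s,
    # and from the 24th consecutive error on the suspension is capped at 120 s.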

    def get_default_tests(self):
        tests = {}

        tests['simple'] = {
            'matrix': {'query': ('life', 'computer')},
            'result_container': ['not_empty'],
        }

        if getattr(self.engine, 'paging', False):
            tests['paging'] = {
                'matrix': {'query': 'time',
                           'pageno': (1, 2, 3)},
                'result_container': ['not_empty'],
                'test': ['unique_results']
            }
            if 'general' in self.engine.categories:
                # avoid documentation about HTML tags (<time> and <input type="time">)
                tests['paging']['matrix']['query'] = 'news'

        if getattr(self.engine, 'time_range', False):
            tests['time_range'] = {
                'matrix': {'query': 'news',
                           'time_range': (None, 'day')},
                'result_container': ['not_empty'],
                'test': ['unique_results']
            }

        if getattr(self.engine, 'supported_languages', []):
            tests['lang_fr'] = {
                'matrix': {'query': 'paris', 'lang': 'fr'},
                'result_container': ['not_empty', ('has_language', 'fr')],
            }
            tests['lang_en'] = {
                'matrix': {'query': 'paris', 'lang': 'en'},
                'result_container': ['not_empty', ('has_language', 'en')],
            }

        if getattr(self.engine, 'safesearch', False):
            tests['safesearch'] = {
                'matrix': {'query': 'porn',
                           'safesearch': (0, 2)},
                'test': ['unique_results']
            }

        return tests
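
# How a test 'matrix' is presumably expanded (a sketch, assuming it behaves
# like a CI test matrix): every combination of the listed values becomes one
# run, e.g. 'paging' above runs its query with pageno 1, 2 and 3, and the
# 'unique_results' check then asserts that the three pages do not repeat
# results.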