'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import threading

from time import time

from searx import curladapter
from searx import settings
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx.results import ResultContainer
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0

# single shared lock protecting the per-engine stats counters;
# acquiring a fresh RLock() at each update site would provide
# no mutual exclusion at all
engines_stats_lock = threading.RLock()


def search_request_wrapper(fn, url, engine_name, **kwargs):
    try:
        return fn(url, **kwargs)
    except Exception:
        # increase errors stats
        with engines_stats_lock:
            engines[engine_name].stats['errors'] += 1

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return


def threaded_requests(requests):
    # all requests share one deadline: the largest per-engine timeout,
    # measured from the moment the first request is started
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.is_alive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, callback, params, result_container):

    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        response.search_params = params

        search_duration = time() - params['started']
        # update stats with current page-load-time
        with engines_stats_lock:
            engines[engine_name].stats['page_load_time'] += search_duration

        # a response arriving this far past the engine's own timeout
        # is counted as an error
        timeout_overhead = 0.2  # seconds
        timeout_limit = engines[engine_name].timeout + timeout_overhead

        if search_duration > timeout_limit:
            with engines_stats_lock:
                engines[engine_name].stats['errors'] += 1
            return

        # callback
        try:
            search_results = callback(response)
        except Exception:
            # increase errors stats
            with engines_stats_lock:
                engines[engine_name].stats['errors'] += 1

            # print engine name and specific error message
            logger.exception('engine crash: {0}'.format(engine_name))
            return

        # add results
        for result in search_results:
            result['engine'] = engine_name

        result_container.extend(engine_name, search_results)

    return process_callback
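
# process_callback is handed to the transport layer via mr.add(...) in
# Search.search() below; the transport is assumed to invoke it with the
# finished response object once the request completes.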


class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = get_blocked_engines(engines, request.cookies)

        self.result_container = ResultContainer()
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set page number (fall back to the first page on invalid input)
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            pageno_param = 1

        self.pageno = int(pageno_param)

        # parse query, if tags are set, which change
        # the search engine or search-language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]
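
        # example of the tag syntax parse_query() handles (a sketch based on
        # the searx query parser; exact shortcuts depend on the configured
        # engines): '!ddg :fr foo' selects the duckduckgo engine, switches
        # the search language to French and leaves 'foo' as the query.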

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the selected categories to
        # calculate which engines should be used
        else:
            # set categories/engines
            load_default_categories = True
            for pd_name, pd in self.request_data.items():
                if pd_name == 'categories':
                    self.categories.extend(categ for categ
                                           in map(unicode.strip, pd.split(','))
                                           if categ in categories)
                elif pd_name == 'engines':
                    pd_engines = [{'category': engines[engine].categories[0],
                                   'name': engine}
                                  for engine
                                  in map(unicode.strip, pd.split(','))
                                  if engine in engines]
                    if pd_engines:
                        self.engines.extend(pd_engines)
                        load_default_categories = False
                elif pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is set to 'off'
                        self.categories.remove(category)

            if not load_default_categories:
                if not self.categories:
                    self.categories = list(set(engine['category']
                                               for engine in self.engines))
                return

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (stored in a cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines for this search that are
            # declared under the selected categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.blocked_engines)

    # do search-request
    def search(self, request):
        global number_of_searches

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        mr = curladapter.MultiRequest()
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(request.cookies.get('safesearch'))
            except Exception:
                request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                engine.response,
                request_params,
                self.result_container)

            # create dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                callback=callback,
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                ssl_verification=request_params['verify']
            )

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            mr.add(request_params['url'], **request_args)

        if not mr.requests:
            return self

        # send all search-requests
        mr.send_requests()

        # return results, suggestions, answers and infoboxes
        return self
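
# Typical driver-side usage (a sketch; `flask_request` stands for whatever
# werkzeug/Flask request object the web layer holds, and
# get_ordered_results() is assumed from searx.results.ResultContainer):
#
#     search = Search(flask_request)
#     search.search(flask_request)
#     results = search.result_container.get_ordered_results()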