# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

from __future__ import annotations

import warnings
import re

from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
from searx.result_types import Result, LegacyResult
from searx.result_types.answer import AnswerSet, BaseAnswer

CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0


def compare_urls(url_a, url_b):
    """Lazy compare between two URLs.

    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)
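

# A quick illustration of the comparison rules above: the "www." prefix, the
# URL scheme and a trailing slash are ignored, while a differing query string
# (or fragment) makes two URLs unequal.
#
#   compare_urls(urlparse('https://www.example.com/path/'),
#                urlparse('http://example.com/path'))        # -> True
#   compare_urls(urlparse('https://example.com/?q=a'),
#                urlparse('https://example.com/?q=b'))       # -> False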


def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branches, too-many-statements
    # get engine weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1

    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)

            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            # keep the content with more meaningful text
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
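

# How scoring works: a result's weight is the product of the weights of all
# engines that returned it, multiplied by the number of positions it occupies.
# With the default priority the weight is divided by each position; 'high'
# priority adds the full weight per position and 'low' priority adds nothing.
# Illustrative example: a result returned by two engines of weight 1.0 at
# positions 1 and 3 gets weight 1.0 * 1.0 * 2 = 2 and score 2/1 + 2/3 ≈ 2.67.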
def result_score(result, priority):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines.get(result_engine), 'weight'):
            weight *= float(engines[result_engine].weight)

    weight *= len(result['positions'])
    score = 0

    for position in result['positions']:
        if priority == 'low':
            continue
        if priority == 'high':
            score += weight
        else:
            score += weight / position

    return score


class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    error_type: str
    suspended: bool


class ResultContainer:
    """Container in which the results of a search query are collected: the
    engines' results are merged, de-duplicated, scored and ordered here."""

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results: list[LegacyResult] = []
        self.infoboxes: list[dict] = []
        self.suggestions: set[str] = set()
        self.answers = AnswerSet()
        self.corrections = set()
        self._number_of_results: list[int] = []
        self.engine_data: dict[str, str | dict] = defaultdict(dict)
        self._closed: bool = False
        self.paging: bool = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name: str | None, results):  # pylint: disable=too-many-branches
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()

        for result in list(results):
            if isinstance(result, Result):
                result.engine = result.engine or engine_name
                result.normalize_result_fields()
                if isinstance(result, BaseAnswer) and self.on_result(result):
                    self.answers.add(result)
                else:
                    # more types need to be implemented in the future ..
                    raise NotImplementedError(f"no handler implemented to process the result of type {result}")

            else:
                result['engine'] = result.get('engine') or engine_name or ""
                result = LegacyResult(result)  # for backward compatibility, will be removed one day

                if 'suggestion' in result and self.on_result(result):
                    self.suggestions.add(result['suggestion'])
                elif 'answer' in result and self.on_result(result):
                    warnings.warn(
                        f"answer results from engine {result.engine}"
                        " are without typification / migrate to Answer class.",
                        DeprecationWarning,
                    )
                    self.answers.add(result)
                elif 'correction' in result and self.on_result(result):
                    self.corrections.add(result['correction'])
                elif 'infobox' in result and self.on_result(result):
                    self._merge_infobox(result)
                elif 'number_of_results' in result and self.on_result(result):
                    self._number_of_results.append(result['number_of_results'])
                elif 'engine_data' in result and self.on_result(result):
                    self.engine_data[result.engine][result['key']] = result['engine_data']
                elif result.url:
                    # standard result (url, title, content)
                    if not self._is_valid_url_result(result, error_msgs):
                        continue
                    # normalize the result
                    result.normalize_result_fields()
                    # calling on_result calls searx.search.SearchWithPlugins._on_result,
                    # which calls the plugins
                    if not self.on_result(result):
                        continue
                    self.__merge_url_result(result, standard_result_count + 1)
                    standard_result_count += 1
                elif self.on_result(result):
                    self.__merge_result_no_url(result, standard_result_count + 1)
                    standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalids: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same URL: it's a duplicate
                    return merged_result

                # it's an image: only a duplicate if the img_src is the same as well
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # use content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # use title with more text
        if result_content_len(result.get('title', '')) > len(duplicated.get('title', '')):
            duplicated['title'] = result['title']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # use https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)
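
    # close() works in two passes: pass 1 computes each result's score, strips
    # surplus whitespace from title/content and feeds the per-engine score
    # counters; pass 2 walks the score-sorted list and regroups results that
    # share the same category and template, allowing up to 8 results per group
    # within a window of 20 positions.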
    def close(self):
        self._closed = True

        for result in self._merged_results:
            result['score'] = result_score(result, result.get('priority'))
            # removing html content and whitespace duplications
            if result.get('content'):
                result['content'] = result['content'].strip()
            if result.get('title'):
                result['title'] = ' '.join(result['title'].strip().split())

            for result_engine in result['engines']:
                counter_add(result['score'], 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update categoryPositions
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Returns the average of the result counts reported by the engines;
        returns zero if that average is smaller than the number of results
        actually collected."""
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
                return 0

            resultnum_sum = sum(self._number_of_results)
            if not resultnum_sum or not self._number_of_results:
                return 0

            average = int(resultnum_sum / len(self._number_of_results))
            if average < self.results_length():
                average = 0
            return average

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings
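

# Usage sketch (illustrative, kept as a comment so the module stays import-safe):
# how a search typically drives this container.  It assumes a fully initialised
# searx instance, so that the engine name handed to extend() exists in
# searx.engines.engines ('wikipedia' is just an example name).
#
#     container = ResultContainer()
#     container.extend('wikipedia', [
#         {'url': 'https://www.example.com/page/', 'title': 'Example', 'content': 'some text'},
#         {'url': 'http://example.com/page', 'title': 'Example', 'content': 'a bit more text'},  # merged as duplicate
#     ])
#     container.close()                       # scores, de-duplicates and groups the results
#     for res in container.get_ordered_results():
#         print(res['url'], res['score'])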