hostnames.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=too-many-branches
  3. """
  4. .. attention::
  5. The **"Hostname replace"** plugin has been replaced by **"Hostnames
  6. plugin"**, see :pull:`3463` & :pull:`3552`.
  7. The **Hostnames plugin** can be enabled by adding it to the
  8. ``enabled_plugins`` **list** in the ``settings.yml`` like so.
  9. .. code:: yaml
  10. enabled_plugins:
  11. - 'Hostnames plugin'
  12. ...
  13. - ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
  14. replaced by other hostnames.
  15. .. code:: yaml
  16. hostnames:
  17. replace:
  18. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  19. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  20. ...
  21. - ``hostnames.remove``: A **list** of regular expressions of the hostnames whose
  22. results should be removed from the results list.
  23. .. code:: yaml
  24. hostnames:
  25. remove:
  26. - '(.*\\.)?facebook.com$'
  27. - ...
  28. - ``hostnames.high_priority``: A **list** of regular expressions for hostnames
  29. whose result should be given higher priority. The results from these hosts are
  30. arranged higher in the results list.
  31. .. code:: yaml
  32. hostnames:
  33. high_priority:
  34. - '(.*\\.)?wikipedia.org$'
  35. - ...
  36. - ``hostnames.low_priority``: A **list** of regular expressions for hostnames
  37. whose result should be given lower priority. The results from these hosts are
  38. arranged lower in the results list.
  39. .. code:: yaml
  40. hostnames:
  41. low_priority:
  42. - '(.*\\.)?google(\\..*)?$'
  43. - ...
  44. If the URL matches the pattern of ``high_priority`` AND ``low_priority``, the
  45. higher priority wins over the lower priority.
  46. Alternatively, you can also specify a file name for the **mappings** or
  47. **lists** to load these from an external file:
  48. .. code:: yaml
  49. hostnames:
  50. replace: 'rewrite-hosts.yml'
  51. remove:
  52. - '(.*\\.)?facebook.com$'
  53. - ...
  54. low_priority:
  55. - '(.*\\.)?google(\\..*)?$'
  56. - ...
  57. high_priority:
  58. - '(.*\\.)?wikipedia.org$'
  59. - ...
  60. The ``rewrite-hosts.yml`` from the example above must be in the folder in which
  61. the ``settings.yml`` file is already located (``/etc/searxng``). The file then
  62. only contains the lists or the mapping tables without further information on the
  63. namespaces. In the example above, this would be a mapping table that looks
  64. something like this:
  65. .. code:: yaml
  66. '(.*\\.)?youtube\\.com$': 'invidious.example.com'
  67. '(.*\\.)?youtu\\.be$': 'invidious.example.com'
  68. """
  69. from __future__ import annotations
  70. import re
  71. from urllib.parse import urlunparse, urlparse
  72. from flask_babel import gettext
  73. from searx import settings
  74. from searx.settings_loader import get_yaml_cfg
# Plugin metadata.  ``name`` and ``description`` are passed through
# ``gettext`` so that they can be translated.
name = gettext('Hostnames plugin')
description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
# The plugin is disabled by default.
default_on = False
preference_section = 'general'
# ``plugin_id`` doubles as the top-level key under which this module reads its
# configuration from ``settings`` (see ``_load_regular_expressions``).
plugin_id = 'hostnames'
# Key under which a result stores its parsed URL (an object with a ``netloc``
# attribute that supports ``_replace``).
parsed = 'parsed_url'
# Additional result fields that hold plain URL strings; ``on_result`` applies
# the replace / remove rules to them as well.
_url_fields = ['iframe_src', 'audio_src']
  82. def _load_regular_expressions(settings_key) -> dict | set | None:
  83. setting_value = settings.get(plugin_id, {}).get(settings_key)
  84. if not setting_value:
  85. return None
  86. # load external file with configuration
  87. if isinstance(setting_value, str):
  88. setting_value = get_yaml_cfg(setting_value)
  89. if isinstance(setting_value, list):
  90. return {re.compile(r) for r in setting_value}
  91. if isinstance(setting_value, dict):
  92. return {re.compile(p): r for (p, r) in setting_value.items()}
  93. return None
# Pattern tables, compiled once at import time from the plugin settings.  A
# missing or empty option falls back to an empty container so that the loops
# in ``on_result`` simply do nothing for it.
# ``replacements`` maps compiled pattern -> replacement hostname; the other
# three are sets of compiled patterns.
replacements: dict = _load_regular_expressions('replace') or {}  # type: ignore
removables: set = _load_regular_expressions('remove') or set()  # type: ignore
high_priority: set = _load_regular_expressions('high_priority') or set()  # type: ignore
low_priority: set = _load_regular_expressions('low_priority') or set()  # type: ignore
  98. def _matches_parsed_url(result, pattern):
  99. return result[parsed] and (parsed in result and pattern.search(result[parsed].netloc))
  100. def on_result(_request, _search, result) -> bool:
  101. for pattern, replacement in replacements.items():
  102. if _matches_parsed_url(result, pattern):
  103. # logger.debug(result['url'])
  104. result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
  105. result['url'] = urlunparse(result[parsed])
  106. # logger.debug(result['url'])
  107. for url_field in _url_fields:
  108. if not getattr(result, url_field, None):
  109. continue
  110. url_src = urlparse(result[url_field])
  111. if pattern.search(url_src.netloc):
  112. url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
  113. result[url_field] = urlunparse(url_src)
  114. for pattern in removables:
  115. if _matches_parsed_url(result, pattern):
  116. return False
  117. for url_field in _url_fields:
  118. if not getattr(result, url_field, None):
  119. continue
  120. url_src = urlparse(result[url_field])
  121. if pattern.search(url_src.netloc):
  122. del result[url_field]
  123. for pattern in low_priority:
  124. if _matches_parsed_url(result, pattern):
  125. result['priority'] = 'low'
  126. for pattern in high_priority:
  127. if _matches_parsed_url(result, pattern):
  128. result['priority'] = 'high'
  129. return True