cache.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Implementations for caching favicons.
  3. :py:obj:`FaviconCacheConfig`:
  4. Configuration of the favicon cache
  5. :py:obj:`FaviconCache`:
  6. Abstract base class for the implementation of a favicon cache.
  7. :py:obj:`FaviconCacheSQLite`:
  8. Favicon cache that manages the favicon BLOBs in a SQLite DB.
  9. :py:obj:`FaviconCacheNull`:
  10. Fallback solution if the configured cache cannot be used for system reasons.
  11. ----
  12. """
  13. from __future__ import annotations
  14. from typing import Literal
  15. import os
  16. import abc
  17. import dataclasses
  18. import hashlib
  19. import logging
  20. import sqlite3
  21. import tempfile
  22. import time
  23. import typer
  24. import msgspec
  25. from searx import sqlitedb
  26. from searx import logger
  27. from searx.utils import humanize_bytes, humanize_number
# Process-wide favicon cache.  Only declared here; the instance is bound by
# init() according to the configured ``db_type``.
CACHE: "FaviconCache"

# Marker used in place of real favicon data when a resolver delivered no
# icon (``data is None``), so that the miss itself can be cached.
FALLBACK_ICON = b"FALLBACK_ICON"

# Child logger of the searx logger, dedicated to this module.
logger = logger.getChild('favicons.cache')

# Typer application the CLI commands below are registered on.
app = typer.Typer()
  32. @app.command()
  33. def state():
  34. """show state of the cache"""
  35. print(CACHE.state().report())
  36. @app.command()
  37. def maintenance(force: bool = True, debug: bool = False):
  38. """perform maintenance of the cache"""
  39. root_log = logging.getLogger()
  40. if debug:
  41. root_log.setLevel(logging.DEBUG)
  42. else:
  43. root_log.handlers = []
  44. handler = logging.StreamHandler()
  45. handler.setFormatter(logging.Formatter("%(message)s"))
  46. logger.addHandler(handler)
  47. logger.setLevel(logging.DEBUG)
  48. state_t0 = CACHE.state()
  49. CACHE.maintenance(force=force)
  50. state_t1 = CACHE.state()
  51. state_delta = state_t0 - state_t1
  52. print("The cache has been reduced by:")
  53. print(state_delta.report("\n- {descr}: {val}").lstrip("\n"))
  54. def init(cfg: "FaviconCacheConfig"):
  55. """Initialization of a global ``CACHE``"""
  56. global CACHE # pylint: disable=global-statement
  57. if cfg.db_type == "sqlite":
  58. if sqlite3.sqlite_version_info <= (3, 35):
  59. logger.critical(
  60. "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
  61. sqlite3.sqlite_version,
  62. )
  63. CACHE = FaviconCacheNull(cfg)
  64. else:
  65. CACHE = FaviconCacheSQLite(cfg)
  66. elif cfg.db_type == "mem":
  67. logger.error("Favicons are cached in memory, don't use this in production!")
  68. CACHE = FaviconCacheMEM(cfg)
  69. else:
  70. raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")
  71. class FaviconCacheConfig(msgspec.Struct): # pylint: disable=too-few-public-methods
  72. """Configuration of the favicon cache."""
  73. db_type: Literal["sqlite", "mem"] = "sqlite"
  74. """Type of the database:
  75. ``sqlite``:
  76. :py:obj:`.cache.FaviconCacheSQLite`
  77. ``mem``:
  78. :py:obj:`.cache.FaviconCacheMEM` (not recommended)
  79. """
  80. db_url: str = tempfile.gettempdir() + os.sep + "faviconcache.db"
  81. """URL of the SQLite DB, the path to the database file."""
  82. HOLD_TIME: int = 60 * 60 * 24 * 30 # 30 days
  83. """Hold time (default in sec.), after which a BLOB is removed from the cache."""
  84. LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50 # 50 MB
  85. """Maximum of bytes (default) stored in the cache of all blobs. Note: The
  86. limit is only reached at each maintenance interval after which the oldest
  87. BLOBs are deleted; the limit is exceeded during the maintenance period. If
  88. the maintenance period is *too long* or maintenance is switched off
  89. completely, the cache grows uncontrollably."""
  90. BLOB_MAX_BYTES: int = 1024 * 20 # 20 KB
  91. """The maximum BLOB size in bytes that a favicon may have so that it can be
  92. saved in the cache. If the favicon is larger, it is not saved in the cache
  93. and must be requested by the client via the proxy."""
  94. MAINTENANCE_PERIOD: int = 60 * 60
  95. """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
  96. ``auto``."""
  97. MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
  98. """Type of maintenance mode
  99. ``auto``:
  100. Maintenance is carried out automatically as part of the maintenance
  101. intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.
  102. ``off``:
  103. Maintenance is switched off and must be carried out by an external process
  104. if required.
  105. """
  106. @dataclasses.dataclass
  107. class FaviconCacheStats:
  108. """Dataclass wich provides information on the status of the cache."""
  109. favicons: int | None = None
  110. bytes: int | None = None
  111. domains: int | None = None
  112. resolvers: int | None = None
  113. field_descr = (
  114. ("favicons", "number of favicons in cache", humanize_number),
  115. ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
  116. ("domains", "total number of domains in cache", humanize_number),
  117. ("resolvers", "number of resolvers", str),
  118. )
  119. def __sub__(self, other) -> FaviconCacheStats:
  120. if not isinstance(other, self.__class__):
  121. raise TypeError(f"unsupported operand type(s) for +: '{self.__class__}' and '{type(other)}'")
  122. kwargs = {}
  123. for field, _, _ in self.field_descr:
  124. self_val, other_val = getattr(self, field), getattr(other, field)
  125. if None in (self_val, other_val):
  126. continue
  127. if isinstance(self_val, int):
  128. kwargs[field] = self_val - other_val
  129. else:
  130. kwargs[field] = self_val
  131. return self.__class__(**kwargs)
  132. def report(self, fmt: str = "{descr}: {val}\n"):
  133. s = []
  134. for field, descr, cast in self.field_descr:
  135. val = getattr(self, field)
  136. if val is None:
  137. val = "--"
  138. else:
  139. val = cast(val)
  140. s.append(fmt.format(descr=descr, val=val))
  141. return "".join(s)
  142. class FaviconCache(abc.ABC):
  143. """Abstract base class for the implementation of a favicon cache."""
  144. @abc.abstractmethod
  145. def __init__(self, cfg: FaviconCacheConfig):
  146. """An instance of the favicon cache is build up from the configuration."""
  147. @abc.abstractmethod
  148. def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
  149. """Returns ``None`` or the tuple of ``(data, mime)`` that has been
  150. registered in the cache. The ``None`` indicates that there was no entry
  151. in the cache."""
  152. @abc.abstractmethod
  153. def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
  154. """Set data and mime-type in the cache. If data is None, the
  155. :py:obj:`FALLBACK_ICON` is registered. in the cache."""
  156. @abc.abstractmethod
  157. def state(self) -> FaviconCacheStats:
  158. """Returns a :py:obj:`FaviconCacheStats` (key/values) with information
  159. on the state of the cache."""
  160. @abc.abstractmethod
  161. def maintenance(self, force=False):
  162. """Performs maintenance on the cache"""
  163. class FaviconCacheNull(FaviconCache):
  164. """A dummy favicon cache that caches nothing / a fallback solution. The
  165. NullCache is used when more efficient caches such as the
  166. :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
  167. library is only available in an old version and does not meet the
  168. requirements."""
  169. def __init__(self, cfg: FaviconCacheConfig):
  170. return None
  171. def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
  172. return None
  173. def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
  174. return False
  175. def state(self):
  176. return FaviconCacheStats(favicons=0)
  177. def maintenance(self, force=False):
  178. pass
  179. class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
  180. """Favicon cache that manages the favicon BLOBs in a SQLite DB. The DB
  181. model in the SQLite DB is implemented using the abstract class
  182. :py:obj:`sqlitedb.SQLiteAppl`.
  183. The following configurations are required / supported:
  184. - :py:obj:`FaviconCacheConfig.db_url`
  185. - :py:obj:`FaviconCacheConfig.HOLD_TIME`
  186. - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
  187. - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
  188. - :py:obj:`MAINTENANCE_PERIOD`
  189. - :py:obj:`MAINTENANCE_MODE`
  190. """
  191. DB_SCHEMA = 1
  192. DDL_BLOBS = """\
  193. CREATE TABLE IF NOT EXISTS blobs (
  194. sha256 TEXT,
  195. bytes_c INTEGER,
  196. mime TEXT NOT NULL,
  197. data BLOB NOT NULL,
  198. PRIMARY KEY (sha256))"""
  199. """Table to store BLOB objects by their sha256 hash values."""
  200. DDL_BLOB_MAP = """\
  201. CREATE TABLE IF NOT EXISTS blob_map (
  202. m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
  203. sha256 TEXT,
  204. resolver TEXT,
  205. authority TEXT,
  206. PRIMARY KEY (resolver, authority))"""
  207. """Table to map from (resolver, authority) to sha256 hash values."""
  208. DDL_CREATE_TABLES = {
  209. "blobs": DDL_BLOBS,
  210. "blob_map": DDL_BLOB_MAP,
  211. }
  212. SQL_DROP_LEFTOVER_BLOBS = (
  213. "DELETE FROM blobs WHERE sha256 IN ("
  214. " SELECT b.sha256"
  215. " FROM blobs b"
  216. " LEFT JOIN blob_map bm"
  217. " ON b.sha256 = bm.sha256"
  218. " WHERE bm.sha256 IS NULL)"
  219. )
  220. """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""
  221. SQL_ITER_BLOBS_SHA256_BYTES_C = (
  222. "SELECT b.sha256, b.bytes_c FROM blobs b"
  223. " JOIN blob_map bm "
  224. " ON b.sha256 = bm.sha256"
  225. " ORDER BY bm.m_time ASC"
  226. )
  227. SQL_INSERT_BLOBS = (
  228. "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
  229. " ON CONFLICT (sha256) DO NOTHING"
  230. ) # fmt: skip
  231. SQL_INSERT_BLOB_MAP = (
  232. "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
  233. " ON CONFLICT DO UPDATE "
  234. " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
  235. )
  236. def __init__(self, cfg: FaviconCacheConfig):
  237. """An instance of the favicon cache is build up from the configuration.""" #
  238. if cfg.db_url == ":memory:":
  239. logger.critical("don't use SQLite DB in :memory: in production!!")
  240. super().__init__(cfg.db_url)
  241. self.cfg = cfg
  242. def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
  243. sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
  244. res = self.DB.execute(sql, (resolver, authority)).fetchone()
  245. if res is None:
  246. return None
  247. data, mime = (None, None)
  248. sha256 = res[0]
  249. if sha256 == FALLBACK_ICON:
  250. return data, mime
  251. sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
  252. res = self.DB.execute(sql, (sha256,)).fetchone()
  253. if res is not None:
  254. data, mime = res
  255. return data, mime
  256. def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
  257. if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
  258. # Should automatic maintenance be moved to a new thread?
  259. self.maintenance()
  260. if data is not None and mime is None:
  261. logger.error(
  262. "favicon resolver %s tries to cache mime-type None for authority %s",
  263. resolver,
  264. authority,
  265. )
  266. return False
  267. bytes_c = len(data or b"")
  268. if bytes_c > self.cfg.BLOB_MAX_BYTES:
  269. logger.info(
  270. "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) " % (resolver, authority, bytes_c)
  271. )
  272. return False
  273. if data is None:
  274. sha256 = FALLBACK_ICON
  275. else:
  276. sha256 = hashlib.sha256(data).hexdigest()
  277. with self.connect() as conn:
  278. if sha256 != FALLBACK_ICON:
  279. conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
  280. conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))
  281. return True
  282. @property
  283. def next_maintenance_time(self) -> int:
  284. """Returns (unix epoch) time of the next maintenance."""
  285. return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")
  286. def maintenance(self, force=False):
  287. # Prevent parallel DB maintenance cycles from other DB connections
  288. # (e.g. in multi thread or process environments).
  289. if not force and int(time.time()) < self.next_maintenance_time:
  290. logger.debug("no maintenance required yet, next maintenance interval is in the future")
  291. return
  292. self.properties.set("LAST_MAINTENANCE", "") # hint: this (also) sets the m_time of the property!
  293. # do maintenance tasks
  294. with self.connect() as conn:
  295. # drop items not in HOLD time
  296. res = conn.execute(
  297. f"DELETE FROM blob_map"
  298. f" WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - {self.cfg.HOLD_TIME}"
  299. )
  300. logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
  301. res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
  302. logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)
  303. # drop old items to be in LIMIT_TOTAL_BYTES
  304. total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
  305. if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:
  306. x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
  307. c = 0
  308. sha_list = []
  309. for row in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
  310. sha256, bytes_c = row
  311. sha_list.append(sha256)
  312. c += bytes_c
  313. if c > x:
  314. break
  315. if sha_list:
  316. conn.execute("DELETE FROM blobs WHERE sha256 IN ('%s')" % "','".join(sha_list))
  317. conn.execute("DELETE FROM blob_map WHERE sha256 IN ('%s')" % "','".join(sha_list))
  318. logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c)
  319. def _query_val(self, sql, default=None):
  320. val = self.DB.execute(sql).fetchone()
  321. if val is not None:
  322. val = val[0]
  323. if val is None:
  324. val = default
  325. return val
  326. def state(self) -> FaviconCacheStats:
  327. return FaviconCacheStats(
  328. favicons=self._query_val("SELECT count(*) FROM blobs", 0),
  329. bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
  330. domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
  331. resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
  332. )
  333. class FaviconCacheMEM(FaviconCache):
  334. """Favicon cache in process' memory. Its just a POC that stores the
  335. favicons in the memory of the process.
  336. .. attention::
  337. Don't use it in production, it will blow up your memory!!
  338. """
  339. def __init__(self, cfg):
  340. self.cfg = cfg
  341. self._data = {}
  342. self._sha_mime = {}
  343. def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
  344. sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None))
  345. if sha is None:
  346. return None
  347. data = self._data.get(sha)
  348. if data == FALLBACK_ICON:
  349. data = None
  350. return data, mime
  351. def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
  352. if data is None:
  353. data = FALLBACK_ICON
  354. mime = None
  355. elif mime is None:
  356. logger.error(
  357. "favicon resolver %s tries to cache mime-type None for authority %s",
  358. resolver,
  359. authority,
  360. )
  361. return False
  362. digest = hashlib.sha256(data).hexdigest()
  363. self._data[digest] = data
  364. self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
  365. return True
  366. def state(self):
  367. return FaviconCacheStats(favicons=len(self._data.keys()))
  368. def maintenance(self, force=False):
  369. pass