123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477 |
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for caching favicons.

:py:obj:`FaviconCacheConfig`:
  Configuration of the favicon cache

:py:obj:`FaviconCache`:
  Abstract base class for the implementation of a favicon cache.

:py:obj:`FaviconCacheSQLite`:
  Favicon cache that manages the favicon BLOBs in a SQLite DB.

:py:obj:`FaviconCacheMEM`:
  Favicon cache in the memory of the process (proof of concept, not for
  production).

:py:obj:`FaviconCacheNull`:
  Fallback solution if the configured cache cannot be used for system reasons.

----

"""
- from __future__ import annotations
- from typing import Literal
- import os
- import abc
- import dataclasses
- import hashlib
- import logging
- import sqlite3
- import tempfile
- import time
- import typer
- import msgspec
- from searx import sqlitedb
- from searx import logger
- from searx.utils import humanize_bytes, humanize_number
# Process-wide favicon cache.  Only declared here; it stays unbound until
# init() assigns one of the concrete cache implementations.
CACHE: "FaviconCache"

# Marker registered instead of a real favicon when a resolver delivered no data.
FALLBACK_ICON = b"FALLBACK_ICON"

# Rebinds the imported ``logger`` name to this module's child logger.
logger = logger.getChild("favicons.cache")

# Typer application the command line functions of this module are attached to.
app = typer.Typer()
@app.command()
def state():
    """show state of the cache"""
    # The docstring above is kept terse on purpose: it is attached to a CLI
    # command.  The report itself is rendered by FaviconCacheStats.report().
    stats = CACHE.state()
    print(stats.report())
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    root = logging.getLogger()
    if not debug:
        # Quiet mode: drop the root handlers and emit only the bare messages
        # of this module's logger.
        root.handlers = []
        plain = logging.StreamHandler()
        plain.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(plain)
        logger.setLevel(logging.DEBUG)
    else:
        root.setLevel(logging.DEBUG)

    # Snapshot the cache before and after the maintenance run; the difference
    # (before - after) is what has been removed.
    before = CACHE.state()
    CACHE.maintenance(force=force)
    after = CACHE.state()
    shrunk_by = before - after

    print("The cache has been reduced by:")
    listing = shrunk_by.report("\n- {descr}: {val}")
    print(listing.lstrip("\n"))
def init(cfg: "FaviconCacheConfig"):
    """Bind the module-global ``CACHE`` to the backend selected by ``cfg.db_type``.

    ``sqlite`` selects :py:obj:`FaviconCacheSQLite`, or :py:obj:`FaviconCacheNull`
    when the SQLite library is too old.  ``mem`` selects
    :py:obj:`FaviconCacheMEM`.  Any other value raises ``NotImplementedError``.
    """
    global CACHE  # pylint: disable=global-statement

    db_type = cfg.db_type

    if db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
        return

    if db_type != "sqlite":
        raise NotImplementedError(f"favicons db_type '{db_type}' is unknown")

    # A three-part version such as (3, 35, 0) compares greater than (3, 35),
    # so 3.35.x passes this check and only older libraries are rejected.
    if sqlite3.sqlite_version_info <= (3, 35):
        logger.critical(
            "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
            sqlite3.sqlite_version,
        )
        CACHE = FaviconCacheNull(cfg)
        return

    CACHE = FaviconCacheSQLite(cfg)
class FaviconCacheConfig(msgspec.Struct):  # pylint: disable=too-few-public-methods
    """Configuration of the favicon cache."""

    db_type: Literal["sqlite", "mem"] = "sqlite"
    """Type of the database:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: str = tempfile.gettempdir() + os.sep + "faviconcache.db"
    """URL of the SQLite DB, i.e. the path to the database file.  The default
    is a file named ``faviconcache.db`` in the system's temporary folder."""

    HOLD_TIME: int = 30 * 24 * 60 * 60  # 30 days
    """Hold time in seconds (default: 30 days).  Entries older than this are
    removed from the cache by the maintenance."""

    LIMIT_TOTAL_BYTES: int = 50 * 1024 * 1024  # 50 MB
    """Maximum number of bytes (default: 50 MB) of all BLOBs stored in the
    cache.  Note: the limit is only enforced by the maintenance, which deletes
    the oldest BLOBs; between two maintenance runs the limit can be exceeded.
    If the maintenance period is *too long* or maintenance is switched off
    completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 20 * 1024  # 20 KB
    """Maximum size in bytes (default: 20 KB) a favicon may have so that it can
    be saved in the cache.  A larger favicon is not saved in the cache and must
    be requested by the client via the proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60  # 1 hour
    """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
    ``auto``."""

    MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
    """Type of maintenance mode

    ``auto``:
      Maintenance is carried out automatically as part of the maintenance
      intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and must be carried out by an external process
      if required.
    """
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    Every value is optional: a field left at ``None`` is rendered as ``--`` by
    :py:obj:`report` and is skipped when two instances are subtracted.
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # Rows of (field name, human readable description, formatter).  The name is
    # not annotated, so this is a plain class attribute and not a dataclass field.
    field_descr = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other) -> FaviconCacheStats:
        """Return the field-wise difference ``self - other``.

        Fields that are ``None`` on either side are left unset in the result.
        Integer fields are subtracted; any other value is taken over from
        ``self`` unchanged.

        :raises TypeError: if ``other`` is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            # The operator named in the message has to be the one implemented here.
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                kwargs[field] = self_val
        return self.__class__(**kwargs)

    def report(self, fmt: str = "{descr}: {val}\n") -> str:
        """Render one ``fmt`` formatted entry per field and return them joined.

        ``fmt`` may use the placeholders ``{descr}`` (description of the field)
        and ``{val}`` (formatted value, ``--`` if the field is ``None``).
        """
        s = []
        for field, descr, cast in self.field_descr:
            val = getattr(self, field)
            if val is None:
                val = "--"
            else:
                val = cast(val)
            s.append(fmt.format(descr=descr, val=val))
        return "".join(s)
class FaviconCache(abc.ABC):
    """Interface every favicon cache implementation has to provide."""

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """Build the cache instance from the configuration ``cfg``."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Look up the entry registered for ``(resolver, authority)``.

        Returns ``None`` if there is no entry in the cache, otherwise the
        tuple ``(data, mime)`` that has been registered."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` and its ``mime`` type in the cache.  If ``data``
        is ``None``, the :py:obj:`FALLBACK_ICON` is registered in the cache."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Return a :py:obj:`FaviconCacheStats` (key/values) describing the
        current state of the cache."""

    @abc.abstractmethod
    def maintenance(self, force=False):
        """Perform maintenance on the cache."""
class FaviconCacheNull(FaviconCache):
    """Dummy favicon cache that stores nothing; a fallback solution.  It is
    used when a more efficient cache such as :py:obj:`FaviconCacheSQLite`
    cannot be used, for example because the SQLite library is only available
    in an old version that does not meet the requirements."""

    def __init__(self, cfg: FaviconCacheConfig):
        """Accept the configuration for interface compatibility; it is ignored."""

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Always a cache miss."""
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Nothing is stored, hence always ``False``."""
        return False

    def state(self):
        """Report an empty cache (zero favicons, all other values unset)."""
        return FaviconCacheStats(favicons=0)

    def maintenance(self, force=False):
        """Nothing to maintain."""
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`MAINTENANCE_PERIOD`
    - :py:obj:`MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
  sha256 TEXT,
  bytes_c INTEGER,
  mime TEXT NOT NULL,
  data BLOB NOT NULL,
  PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
  m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
  sha256 TEXT,
  resolver TEXT,
  authority TEXT,
  PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        " FROM blobs b"
        " LEFT JOIN blob_map bm"
        " ON b.sha256 = bm.sha256"
        " WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        " JOIN blob_map bm "
        " ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )
    """Iterate ``(sha256, bytes_c)`` ordered by the age of the mapping, oldest
    first.  A BLOB shows up once per ``blob_map`` row that references it."""

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        " ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        " ON CONFLICT DO UPDATE "
        " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    # Upper bound of bound parameters per DELETE statement; kept well below
    # SQLite's limit on the number of host parameters in a single statement.
    _DELETE_CHUNK_SIZE = 500

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is build up from the configuration."""

        if cfg.db_url == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Look up the favicon registered for ``(resolver, authority)``.

        Returns ``None`` when no mapping exists.  Returns ``(None, None)`` when
        the :py:obj:`FALLBACK_ICON` is registered or when the mapped BLOB is no
        longer in the ``blobs`` table, otherwise ``(data, mime)``.
        """

        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` / ``mime`` for ``(resolver, authority)``.

        With ``data=None`` only the mapping to the :py:obj:`FALLBACK_ICON` is
        stored.  Returns ``False`` (nothing stored) when ``data`` comes without
        a mime type or exceeds :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`,
        otherwise ``True``.  In maintenance mode ``auto`` a due maintenance run
        is triggered first.
        """

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            # lazy logging arguments: the message is only formatted if emitted
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        with self.connect() as conn:
            # the fallback marker has no BLOB, only the mapping is stored
            if sha256 != FALLBACK_ICON:
                conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
            conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""

        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force=False):
        """Drop expired mappings, orphaned BLOBs and, if the cache is larger
        than :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`, the oldest BLOBs.

        Without ``force`` nothing is done as long as the next maintenance time
        has not been reached.
        """

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).

        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # do maintenance tasks

        with self.connect() as conn:

            # drop items not in HOLD time
            res = conn.execute(
                "DELETE FROM blob_map"
                " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                (self.cfg.HOLD_TIME,),
            )
            logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
            res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
            logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

            # drop old items to be in LIMIT_TOTAL_BYTES
            total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
            if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
                c = 0
                sha_list = []
                seen = set()
                for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                    # The query yields a BLOB once per mapping that references
                    # it.  Deleting it frees its bytes only once, so it must be
                    # counted only once, otherwise too little is evicted.
                    if sha256 in seen:
                        continue
                    seen.add(sha256)
                    sha_list.append(sha256)
                    c += bytes_c
                    if c > x:
                        break

                if sha_list:
                    step = self._DELETE_CHUNK_SIZE
                    for start in range(0, len(sha_list), step):
                        chunk = tuple(sha_list[start : start + step])
                        marks = ",".join("?" * len(chunk))
                        conn.execute(f"DELETE FROM blobs WHERE sha256 IN ({marks})", chunk)
                        conn.execute(f"DELETE FROM blob_map WHERE sha256 IN ({marks})", chunk)
                    logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c)

    def _query_val(self, sql, default=None):
        """Return the first column of the first row of ``sql``, or ``default``
        if there is no row or the value is NULL."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        """Count the favicons, bytes, domains (authorities) and resolvers
        currently stored in the DB."""
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
class FaviconCacheMEM(FaviconCache):
    """Favicon cache kept in the memory of the process.  It is only a proof of
    concept: favicons are stored in plain dictionaries and never removed.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg):
        """Keep the configuration and start with two empty lookup tables."""
        self.cfg = cfg
        self._data = {}  # sha256 hex digest --> favicon bytes
        self._sha_mime = {}  # "<resolver>:<authority>" --> (sha256 hex digest, mime)

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
        """Return ``None`` on a cache miss, else ``(data, mime)``; ``data`` is
        ``None`` if the :py:obj:`FALLBACK_ICON` has been registered."""
        sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None))
        if sha is None:
            return None

        blob = self._data.get(sha)
        return (None if blob == FALLBACK_ICON else blob), mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` / ``mime`` for ``(resolver, authority)``.  Data
        without a mime type is rejected (``False``); ``data=None`` registers
        the :py:obj:`FALLBACK_ICON`."""
        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        if data is None:
            data, mime = FALLBACK_ICON, None

        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
        return True

    def state(self):
        """Report the number of distinct BLOBs held in memory."""
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force=False):
        """No maintenance is implemented for the in-memory cache."""
|