http_user_agent.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
Method ``http_user_agent``
--------------------------

The ``http_user_agent`` method classifies a request as coming from a bot if the
User-Agent_ header is unset or if its value matches the regular expression
:py:obj:`USER_AGENT`.

.. _User-Agent:
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
  10. """
  11. # pylint: disable=unused-argument
  12. from __future__ import annotations
  13. import re
  14. from ipaddress import (
  15. IPv4Network,
  16. IPv6Network,
  17. )
  18. import flask
  19. import werkzeug
  20. from . import config
  21. from ._helpers import too_many_requests
  22. USER_AGENT = (
  23. r'('
  24. + r'unknown'
  25. + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
  26. + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
  27. + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
  28. + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
  29. + r'|ZmEu|BLEXBot|bitlybot|HeadlessChrome'
  30. # unmaintained Farside instances
  31. + r'|'
  32. + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
  33. # other bots and client to block
  34. + '|.*PetalBot.*'
  35. + r')'
  36. )
  37. """Regular expression that matches to User-Agent_ from known *bots*"""
  38. _regexp = None
  39. def regexp_user_agent():
  40. global _regexp # pylint: disable=global-statement
  41. if not _regexp:
  42. _regexp = re.compile(USER_AGENT)
  43. return _regexp
  44. def filter_request(
  45. network: IPv4Network | IPv6Network,
  46. request: flask.Request,
  47. cfg: config.Config,
  48. ) -> werkzeug.Response | None:
  49. user_agent = request.headers.get('User-Agent', 'unknown')
  50. if regexp_user_agent().match(user_agent):
  51. return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
  52. return None