12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- # SPDX-License-Identifier: AGPL-3.0-or-later
- """
- Method ``http_user_agent``
- --------------------------
- The ``http_user_agent`` method evaluates a request as the request of a bot if
- the User-Agent_ header is unset or matches the regular expression
- :py:obj:`USER_AGENT`.
- .. _User-Agent:
- https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
- """
- # pylint: disable=unused-argument
- from __future__ import annotations
- import re
- from ipaddress import (
- IPv4Network,
- IPv6Network,
- )
- import flask
- import werkzeug
- from . import config
- from ._helpers import too_many_requests
# Alternation of User-Agent values that identify known bots and HTTP client
# libraries.  ``filter_request`` applies this pattern with ``re.match``, so
# every alternative is anchored at the *start* of the header value; an entry
# that has to match anywhere in the value needs an explicit leading ``.*``
# (see the PetalBot entry).
USER_AGENT = (
    r'('
    # placeholder that ``filter_request`` substitutes for a missing header
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    # the dot in "archive.org_bot" is escaped so it matches a literal dot only
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive\.org_bot|msnbot'
    + r'|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot|HeadlessChrome'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and clients to block
    + r'|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches the User-Agent_ of known *bots*"""
# Module-level cache for the compiled USER_AGENT pattern; filled on first use.
_regexp = None


def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` pattern.

    The pattern is compiled on the first call and stored in the module-level
    ``_regexp``; later calls return that same compiled object.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp:
        return _regexp
    _regexp = re.compile(USER_AGENT)
    return _regexp
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request when its User-Agent header looks like a bot.

    :param network: network of the client; passed on to ``too_many_requests``.
    :param request: incoming request whose ``User-Agent`` header is inspected.
    :param cfg: not used by this method (NOTE(review): presumably kept so all
        detection methods share one signature -- confirm against the caller).
    :returns: the result of ``too_many_requests`` when the header value
        matches :py:obj:`USER_AGENT`, otherwise ``None``.
    """
    # A missing header is replaced by 'unknown', which the pattern matches.
    agent = request.headers.get('User-Agent', 'unknown')

    # ``match`` anchors at the start of the header value.
    if regexp_user_agent().match(agent) is None:
        return None
    return too_many_requests(network, f"bot detected, HTTP header User-Agent: {agent}")
|