adblock.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
  4. # Copyright 2014-2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
  5. #
  6. # This file is part of qutebrowser.
  7. #
  8. # qutebrowser is free software: you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation, either version 3 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # qutebrowser is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
  20. """Functions related to ad blocking."""
  21. import io
  22. import os.path
  23. import functools
  24. import posixpath
  25. import zipfile
  26. from qutebrowser.browser import downloads
  27. from qutebrowser.config import config
  28. from qutebrowser.utils import objreg, standarddir, log, message
  29. from qutebrowser.commands import cmdutils
  30. def _guess_zip_filename(zf):
  31. """Guess which file to use inside a zip file.
  32. Args:
  33. zf: A ZipFile instance.
  34. """
  35. files = zf.namelist()
  36. if len(files) == 1:
  37. return files[0]
  38. else:
  39. for e in files:
  40. if posixpath.splitext(e)[0].lower() == 'hosts':
  41. return e
  42. raise FileNotFoundError("No hosts file found in zip")
  43. def get_fileobj(byte_io):
  44. """Get a usable file object to read the hosts file from."""
  45. byte_io.seek(0) # rewind downloaded file
  46. if zipfile.is_zipfile(byte_io):
  47. byte_io.seek(0) # rewind what zipfile.is_zipfile did
  48. zf = zipfile.ZipFile(byte_io)
  49. filename = _guess_zip_filename(zf)
  50. byte_io = zf.open(filename, mode='r')
  51. else:
  52. byte_io.seek(0) # rewind what zipfile.is_zipfile did
  53. return byte_io
  54. def _is_whitelisted_url(url):
  55. """Check if the given URL is on the adblock whitelist.
  56. Args:
  57. url: The URL to check as QUrl.
  58. """
  59. for pattern in config.val.content.host_blocking.whitelist:
  60. if pattern.matches(url):
  61. return True
  62. return False
  63. class _FakeDownload:
  64. """A download stub to use on_download_finished with local files."""
  65. def __init__(self, fileobj):
  66. self.basename = os.path.basename(fileobj.name)
  67. self.fileobj = fileobj
  68. self.successful = True
  69. class HostBlocker:
  70. """Manage blocked hosts based from /etc/hosts-like files.
  71. Attributes:
  72. _blocked_hosts: A set of blocked hosts.
  73. _config_blocked_hosts: A set of blocked hosts from ~/.config.
  74. _in_progress: The DownloadItems which are currently downloading.
  75. _done_count: How many files have been read successfully.
  76. _local_hosts_file: The path to the blocked-hosts file.
  77. _config_hosts_file: The path to a blocked-hosts in ~/.config
  78. """
  79. def __init__(self):
  80. self._blocked_hosts = set()
  81. self._config_blocked_hosts = set()
  82. self._in_progress = []
  83. self._done_count = 0
  84. data_dir = standarddir.data()
  85. self._local_hosts_file = os.path.join(data_dir, 'blocked-hosts')
  86. self._update_files()
  87. config_dir = standarddir.config()
  88. self._config_hosts_file = os.path.join(config_dir, 'blocked-hosts')
  89. config.instance.changed.connect(self._update_files)
  90. def is_blocked(self, url, first_party_url=None):
  91. """Check if the given URL (as QUrl) is blocked."""
  92. if first_party_url is not None and not first_party_url.isValid():
  93. first_party_url = None
  94. if not config.instance.get('content.host_blocking.enabled',
  95. url=first_party_url):
  96. return False
  97. host = url.host()
  98. u = url.toDisplayString()
  99. if 'youtube.com/get_midroll_' in u:
  100. return True
  101. return ((host in self._blocked_hosts or
  102. host in self._config_blocked_hosts) and
  103. not _is_whitelisted_url(url))
  104. def _read_hosts_file(self, filename, target):
  105. """Read hosts from the given filename.
  106. Args:
  107. filename: The file to read.
  108. target: The set to store the hosts in.
  109. Return:
  110. True if a read was attempted, False otherwise
  111. """
  112. if not os.path.exists(filename):
  113. return False
  114. try:
  115. with open(filename, 'r', encoding='utf-8') as f:
  116. for line in f:
  117. target.add(line.strip())
  118. except (OSError, UnicodeDecodeError):
  119. log.misc.exception("Failed to read host blocklist!")
  120. return True
  121. def read_hosts(self):
  122. """Read hosts from the existing blocked-hosts file."""
  123. self._blocked_hosts = set()
  124. self._read_hosts_file(self._config_hosts_file,
  125. self._config_blocked_hosts)
  126. found = self._read_hosts_file(self._local_hosts_file,
  127. self._blocked_hosts)
  128. if not found:
  129. args = objreg.get('args')
  130. if (config.val.content.host_blocking.lists and
  131. args.basedir is None and
  132. config.val.content.host_blocking.enabled):
  133. message.info("Run :adblock-update to get adblock lists.")
  134. @cmdutils.register(instance='host-blocker')
  135. def adblock_update(self):
  136. """Update the adblock block lists.
  137. This updates `~/.local/share/qutebrowser/blocked-hosts` with downloaded
  138. host lists and re-reads `~/.config/qutebrowser/blocked-hosts`.
  139. """
  140. self._read_hosts_file(self._config_hosts_file,
  141. self._config_blocked_hosts)
  142. self._blocked_hosts = set()
  143. self._done_count = 0
  144. download_manager = objreg.get('qtnetwork-download-manager')
  145. for url in config.val.content.host_blocking.lists:
  146. if url.scheme() == 'file':
  147. filename = url.toLocalFile()
  148. if os.path.isdir(filename):
  149. for entry in os.scandir(filename):
  150. if entry.is_file():
  151. self._import_local(entry.path)
  152. else:
  153. self._import_local(filename)
  154. else:
  155. fobj = io.BytesIO()
  156. fobj.name = 'adblock: ' + url.host()
  157. target = downloads.FileObjDownloadTarget(fobj)
  158. download = download_manager.get(url, target=target,
  159. auto_remove=True)
  160. self._in_progress.append(download)
  161. download.finished.connect(
  162. functools.partial(self._on_download_finished, download))
  163. def _import_local(self, filename):
  164. """Adds the contents of a file to the blocklist.
  165. Args:
  166. filename: path to a local file to import.
  167. """
  168. try:
  169. fileobj = open(filename, 'rb')
  170. except OSError as e:
  171. message.error("adblock: Error while reading {}: {}".format(
  172. filename, e.strerror))
  173. return
  174. download = _FakeDownload(fileobj)
  175. self._in_progress.append(download)
  176. self._on_download_finished(download)
  177. def _parse_line(self, line):
  178. """Parse a line from a host file.
  179. Args:
  180. line: The bytes object to parse.
  181. Returns:
  182. True if parsing succeeded, False otherwise.
  183. """
  184. if line.startswith(b'#'):
  185. # Ignoring comments early so we don't have to care about
  186. # encoding errors in them.
  187. return True
  188. try:
  189. line = line.decode('utf-8')
  190. except UnicodeDecodeError:
  191. log.misc.error("Failed to decode: {!r}".format(line))
  192. return False
  193. # Remove comments
  194. try:
  195. hash_idx = line.index('#')
  196. line = line[:hash_idx]
  197. except ValueError:
  198. pass
  199. line = line.strip()
  200. # Skip empty lines
  201. if not line:
  202. return True
  203. parts = line.split()
  204. if len(parts) == 1:
  205. # "one host per line" format
  206. hosts = [parts[0]]
  207. else:
  208. # /etc/hosts format
  209. hosts = parts[1:]
  210. for host in hosts:
  211. if ('.' in host and
  212. not host.endswith('.localdomain') and
  213. host != '0.0.0.0'):
  214. self._blocked_hosts.add(host)
  215. return True
  216. def _merge_file(self, byte_io):
  217. """Read and merge host files.
  218. Args:
  219. byte_io: The BytesIO object of the completed download.
  220. Return:
  221. A set of the merged hosts.
  222. """
  223. error_count = 0
  224. line_count = 0
  225. try:
  226. f = get_fileobj(byte_io)
  227. except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile,
  228. LookupError) as e:
  229. message.error("adblock: Error while reading {}: {} - {}".format(
  230. byte_io.name, e.__class__.__name__, e))
  231. return
  232. for line in f:
  233. line_count += 1
  234. ok = self._parse_line(line)
  235. if not ok:
  236. error_count += 1
  237. log.misc.debug("{}: read {} lines".format(byte_io.name, line_count))
  238. if error_count > 0:
  239. message.error("adblock: {} read errors for {}".format(
  240. error_count, byte_io.name))
  241. def _on_lists_downloaded(self):
  242. """Install block lists after files have been downloaded."""
  243. with open(self._local_hosts_file, 'w', encoding='utf-8') as f:
  244. for host in sorted(self._blocked_hosts):
  245. f.write(host + '\n')
  246. message.info("adblock: Read {} hosts from {} sources.".format(
  247. len(self._blocked_hosts), self._done_count))
  248. @config.change_filter('content.host_blocking.lists')
  249. def _update_files(self):
  250. """Update files when the config changed."""
  251. if not config.val.content.host_blocking.lists:
  252. try:
  253. os.remove(self._local_hosts_file)
  254. except FileNotFoundError:
  255. pass
  256. except OSError as e:
  257. log.misc.exception("Failed to delete hosts file: {}".format(e))
  258. def _on_download_finished(self, download):
  259. """Check if all downloads are finished and if so, trigger reading.
  260. Arguments:
  261. download: The finished DownloadItem.
  262. """
  263. self._in_progress.remove(download)
  264. if download.successful:
  265. self._done_count += 1
  266. try:
  267. self._merge_file(download.fileobj)
  268. finally:
  269. download.fileobj.close()
  270. if not self._in_progress:
  271. try:
  272. self._on_lists_downloaded()
  273. except OSError:
  274. log.misc.exception("Failed to write host block list!")