# dl.py
"""
dl.py - Mythical downloader

Copyright (C) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
  15. import subprocess
  16. import urllib.parse
  17. import sys
# Don't actually download, just classify.
# Checked at the bottom of the script before dispatching to an adaptor;
# set to False to perform the real download.
dry_run = True
  20. def read_all(file):
  21. with open(file, "r") as f:
  22. return f.read()
# Known sites. Format: hashmap, where the key is the domain, and the value is
# the list of adaptors. An adaptor is a list, where the first element is the
# adaptor name (a key of ADAPTORS, defined below), and the other elements,
# if any, are parameters to that adaptor.
# TODO: Where to source these site lists?
SITES = {
    "discordapp.com": [["libpurple", "purple-discord"]],
    "discord.gg": [["libpurple", "purple-discord"]]
}
  31. def domains_from_file(file, adaptor):
  32. # All of these domains are for the site in questions
  33. fanficfare_sites = read_all(file).split("\n")
  34. for site in fanficfare_sites:
  35. SITES[site] = [[adaptor]]
# Populate SITES from the bundled one-domain-per-line list files.
# NOTE(review): these run at import time and raise FileNotFoundError if the
# files are not in the working directory — confirm that is intended.
domains_from_file("fanficfare_sites.txt", "fanficfare")
domains_from_file("ytdl_sites.txt", "youtube-dl")
domains_from_file("drm_sites.txt", "drm")
  39. # Functions to download a given URL with a given adaptor
  40. def ytdl(url):
  41. # TODO
  42. subprocess.run(["mpv", "--vo=x11", url])
  43. def fanficfare(url):
  44. # TODO
  45. subprocess.run(["fanficfare", url])
  46. def libpurple(url):
  47. # TODO
  48. print("libpurple'ing " + url)
  49. def drm(url):
  50. print("This site requires the use of Digital Restrictions Mangement.")
  51. print("To learn more, see https://defectivebydesign.org/")
  52. sys.exit(1)
# Dispatch table: adaptor name -> implementation function. Keys match the
# first element of the adaptor lists stored in SITES.
ADAPTORS = {
    "youtube-dl": ytdl,
    "fanficfare": fanficfare,
    "libpurple": libpurple,
    "drm": drm
}
# Fast track certain extensions: paths ending in these audio/video extensions
# are probed with youtube-dl even when bruteforce is disallowed
# (see find_adaptors).
AV = ["ogg", "ogv", "webm", "mp3", "mp4", "mkv"]
  61. # If it's not a known site, we can just test individual downloaders over subprocess
  62. def test_url_sub(args):
  63. proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  64. return proc.returncode == 0
  65. # Parsing a URL
  66. def url_parts(url):
  67. return urllib.parse.urlparse(url if "://" in url else "https://" + url)
  68. def domain_from_url(parts):
  69. domain = parts.netloc
  70. # Remove subdomain
  71. if domain.startswith("www."):
  72. domain = domain[len("www."):]
  73. return domain
  74. # Find adaptors for a URL, first by consulting our dictionary, then by
  75. # bruteforce
  76. def find_adaptors(parts, allow_bruteforce=True):
  77. domain = domain_from_url(parts)
  78. ext = parts.path.split(".")[-1]
  79. # These "crude" techniques merely check the hashes
  80. if domain in SITES:
  81. return SITES[domain]
  82. # These run the full regexs stack but is slow and sometimes I/O bound
  83. if allow_bruteforce or ext in AV:
  84. if test_url_sub(["youtube-dl", "--simulate", url]):
  85. return [["youtube-dl"]]
  86. return []
  87. # Actualise download
  88. def download_with_adaptor(url, adaptor):
  89. fn = ADAPTORS[adaptor[0]]
  90. args = [url] + adaptors[1:]
  91. if fn is not None:
  92. fn(*args)
  93. else:
  94. print("Unknwon adaptor " + adaptor[0])
  95. # Simple test with the cmdline
  96. url = sys.argv[1]
  97. parts = url_parts(url)
  98. adaptors = find_adaptors(parts, allow_bruteforce=True)
  99. print(adaptors)
  100. if len(adaptors) > 1:
  101. print("Warning: ambiguous, using first:")
  102. print(adaptors)
  103. elif len(adaptors) == 0:
  104. print("No adaptors found")
  105. sys.exit(1)
  106. if not dry_run:
  107. download_with_adaptor(url, adaptors[0])