crawlers.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. const fs = require('fs')
  2. const path = require('path')
  3. const expandHomeDir = require('expand-home-dir')
  4. const fetch = require('node-fetch')
  5. const url = require('url')
  6. const { downloadPlaylistFromOptionValue, promisifyProcess } = require('./general-util')
  7. const { spawn } = require('child_process')
  8. const { orderBy } = require('natural-orderby')
  9. const { promisify } = require('util')
  10. const readDir = promisify(fs.readdir)
  11. const stat = promisify(fs.stat)
// Registry of every crawler in this module. Each value is a function with
// these additional properties:
// * crawlerName: The name of the crawler, such as "crawl-http". Used by
//   getCrawlerByName.
// * isAppropriateForArg: A function returning whether an argument is valid for
//   the crawler. For example, crawlHTTP.isAppropriateForArg returns whether or
//   not the passed argument is a valid URL of the HTTP/HTTPS protocol. Used by
//   getAllCrawlersForArg.
const allCrawlers = {}
  20. /* TODO: Removed cheerio, so crawl-http no longer works.
  21. function crawlHTTP(absURL, opts = {}, internals = {}) {
  22. // Recursively crawls a given URL, following every link to a deeper path and
  23. // recording all links in a tree (in the same format playlists use). Makes
  24. // multiple attempts to download failed paths.
  25. const {
  26. verbose = false,
  27. maxAttempts = 5,
  28. allowedExternalHostRegex = null,
  29. stayInSameDirectory = true,
  30. keepAnyFileType = false,
  31. fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
  32. forceGroupRegex = null,
  33. filterRegex = null
  34. } = opts
  35. if (!internals.attempts) internals.attempts = 0
  36. // TODO: Should absURL initially be added into this array? I'd like to
  37. // re-program this entire crawl function to make more sense - "internal"
  38. // dictionaries aren't quite easy to reason about!
  39. if (!internals.allURLs) internals.allURLs = []
  40. const verboseLog = text => {
  41. if (verbose) {
  42. console.error(text)
  43. }
  44. }
  45. const absURLObj = new url.URL(absURL)
  46. return fetch(absURL)
  47. .then(
  48. res => res.text().then(async text => {
  49. const links = getHTMLLinks(text)
  50. console.log(links)
  51. const items = []
  52. for (const link of links) {
  53. let [ name, href ] = link
  54. if (!href) {
  55. continue
  56. }
  57. // If the name (that's the content inside of <a>..</a>) ends with a
  58. // slash, that's probably just an artifact of a directory lister;
  59. // not actually part of the intended content. So we remove it!
  60. if (name.endsWith('/')) {
  61. name = name.slice(0, -1)
  62. }
  63. name = name.trim()
  64. let base
  65. if (path.extname(absURL)) {
  66. base = path.dirname(absURL) + '/'
  67. console.log('extname:', path.extname(absURL), 'so base:', base)
  68. } else {
  69. base = absURL
  70. }
  71. const urlObj = new url.URL(href, base)
  72. const linkURL = url.format(urlObj)
  73. if (internals.allURLs.includes(linkURL)) {
  74. verboseLog("[Ignored] Already done this URL: " + linkURL)
  75. continue
  76. }
  77. internals.allURLs.push(linkURL)
  78. if (filterRegex && !(filterRegex.test(linkURL))) {
  79. verboseLog("[Ignored] Failed regex: " + linkURL)
  80. continue
  81. }
  82. if (urlObj.host !== absURLObj.host && !(
  83. allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
  84. .test(urlObj.host))) {
  85. verboseLog("[Ignored] Inconsistent host: " + linkURL)
  86. continue
  87. }
  88. if (stayInSameDirectory) sameDir: {
  89. // Don't bother with staying in the same directory if it's on a
  90. // different host.
  91. if (urlObj.host !== absURLObj.host) {
  92. break sameDir
  93. }
  94. const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
  95. if (relative.startsWith('..') || path.isAbsolute(relative)) {
  96. verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
  97. continue
  98. }
  99. }
  100. if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
  101. // It's a directory!
  102. verboseLog("[Dir] " + linkURL)
  103. items.push(await (
  104. crawlHTTP(linkURL, opts, Object.assign({}, internals))
  105. .then(({ items }) => ({name, items}))
  106. ))
  107. } else {
  108. // It's a file!
  109. const extensions = fileTypes.map(t => '.' + t)
  110. if (
  111. !keepAnyFileType &&
  112. !(extensions.includes(path.extname(href)))
  113. ) {
  114. verboseLog("[Ignored] Bad extension: " + linkURL)
  115. continue
  116. }
  117. verboseLog("[File] " + linkURL)
  118. items.push({name, downloaderArg: linkURL})
  119. }
  120. }
  121. return {items}
  122. }),
  123. err => {
  124. console.warn("Failed to download: " + absURL)
  125. if (internals.attempts < maxAttempts) {
  126. console.warn(
  127. `Trying again. Attempt ${internals.attempts + 1}/${maxAttempts}...`
  128. )
  129. return crawlHTTP(absURL, opts, Object.assign({}, internals, {
  130. attempts: internals.attempts + 1
  131. }))
  132. } else {
  133. console.error(
  134. "We've hit the download attempt limit (" + maxAttempts + "). " +
  135. "Giving up on this path."
  136. )
  137. throw 'FAILED_DOWNLOAD'
  138. }
  139. }
  140. )
  141. .catch(error => {
  142. if (error === 'FAILED_DOWNLOAD') {
  143. // Debug logging for this is already handled above.
  144. return []
  145. } else {
  146. throw error
  147. }
  148. })
  149. }
  150. crawlHTTP.crawlerName = 'crawl-http'
  151. crawlHTTP.isAppropriateForArg = function(arg) {
  152. // It is only used for HTTP(S) servers:
  153. if (!(arg.startsWith('http://') || arg.startsWith('https://'))) {
  154. return false
  155. }
  156. // It will definitely only work for valid URLs:
  157. let url
  158. try {
  159. url = new URL(arg)
  160. } catch (error) {
  161. return false
  162. }
  163. // If the URL ends with a .json, it is probably meant to be used for a direct
  164. // playlist download, not to be crawled.
  165. if (path.extname(url.pathname) === '.json') {
  166. return false
  167. }
  168. // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
  169. // for URLs on YouTube:
  170. if (crawlYouTube.isAppropriateForArg(arg)) {
  171. return false
  172. }
  173. return true
  174. }
  175. allCrawlers.crawlHTTP = crawlHTTP
  176. function getHTMLLinks(text) {
  177. // Never parse HTML with a regex!
  178. // const $ = cheerio.load(text)
  179. return $('a').get().map(el => {
  180. const $el = $(el)
  181. return [$el.text(), $el.attr('href')]
  182. })
  183. }
  184. */
  185. function crawlLocal(dirPath, extensions = [
  186. 'ogg', 'oga',
  187. 'wav', 'mp3', 'm4a', 'aac', 'flac', 'opus',
  188. 'mp4', 'mov', 'mkv',
  189. 'mod'
  190. ], isTop = true) {
  191. // If the passed path is a file:// URL, try to decode it:
  192. try {
  193. const url = new URL(dirPath)
  194. if (url.protocol === 'file:') {
  195. dirPath = decodeURIComponent(url.pathname)
  196. }
  197. } catch (error) {
  198. // If it's not a URL, it's (assumedly) an ordinary path ("/path/to/the directory").
  199. // In this case we'll expand any ~ in the path (e.g. ~/Music -> /home/.../Music).
  200. dirPath = expandHomeDir(dirPath)
  201. }
  202. return readDir(dirPath).then(items => {
  203. items = orderBy(items)
  204. return Promise.all(items.map(item => {
  205. const itemPath = path.join(dirPath, item)
  206. const itemURL = url.pathToFileURL(itemPath).href
  207. return stat(itemPath).then(stats => {
  208. if (stats.isDirectory()) {
  209. return crawlLocal(itemPath, extensions, false)
  210. .then(group => Object.assign({name: item, url: itemURL}, group))
  211. } else if (stats.isFile()) {
  212. // Extname returns a string starting with a dot; we don't want the
  213. // dot, so we slice it off of the front.
  214. const ext = path.extname(item).slice(1)
  215. if (extensions.includes(ext)) {
  216. // The name of the track doesn't include the file extension; a user
  217. // probably wouldn't add the file extensions to a hand-written
  218. // playlist, or want them in an auto-generated one.
  219. const basename = path.basename(item, path.extname(item))
  220. return {name: basename, downloaderArg: itemPath, url: itemURL}
  221. } else {
  222. return {name: item, url: itemURL}
  223. }
  224. }
  225. }, statErr => null)
  226. }))
  227. }, err => {
  228. if (err.code === 'ENOENT') {
  229. if (isTop) {
  230. throw 'That directory path does not exist!'
  231. } else {
  232. return []
  233. }
  234. } else if (err.code === 'EACCES') {
  235. if (isTop) {
  236. throw 'You do not have permission to open that directory.'
  237. } else {
  238. return []
  239. }
  240. } else {
  241. throw err
  242. }
  243. }).then(items => items.filter(Boolean))
  244. .then(filteredItems => ({
  245. name: path.basename(dirPath),
  246. items: filteredItems
  247. }))
  248. }
  249. crawlLocal.crawlerName = 'crawl-local'
  250. crawlLocal.isAppropriateForArg = function(arg) {
  251. // When the passed argument is a valid URL, it is only used for file://
  252. // URLs:
  253. try {
  254. const url = new URL(arg)
  255. if (url.protocol !== 'file:') {
  256. return false
  257. }
  258. } catch (error) {}
  259. // If the passed argument ends with .json, it is probably not a directory.
  260. if (path.extname(arg) === '.json') {
  261. return false
  262. }
  263. return true
  264. }
  265. allCrawlers.crawlLocal = crawlLocal
  266. async function crawlYouTube(url) {
  267. const ytdl = spawn('youtube-dl', [
  268. '-j', // Output as JSON
  269. '--flat-playlist',
  270. url
  271. ])
  272. const items = []
  273. ytdl.stdout.on('data', data => {
  274. const lines = data.toString().trim().split('\n')
  275. items.push(...lines.map(JSON.parse))
  276. })
  277. // Pass false so it doesn't show logging.
  278. try {
  279. await promisifyProcess(ytdl, false)
  280. } catch (error) {
  281. // Yeow.
  282. throw 'Youtube-dl failed.'
  283. }
  284. return {
  285. name: 'A YouTube playlist',
  286. items: items.map(item => {
  287. return {
  288. name: item.title,
  289. downloaderArg: 'https://youtube.com/watch?v=' + item.id
  290. }
  291. })
  292. }
  293. }
  294. crawlYouTube.crawlerName = 'crawl-youtube'
  295. crawlYouTube.isAppropriateForArg = function(arg) {
  296. // It is definitely not used for arguments that are not URLs:
  297. let url
  298. try {
  299. url = new URL(arg)
  300. } catch (error) {
  301. return false
  302. }
  303. // It is only used for URLs on the YouTube domain:
  304. if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
  305. return false
  306. }
  307. // It is only used for playlist pages:
  308. if (url.pathname !== '/playlist') {
  309. return false
  310. }
  311. return true
  312. }
  313. allCrawlers.crawlYouTube = crawlYouTube
  314. async function openFile(input) {
  315. return JSON.parse(await downloadPlaylistFromOptionValue(input))
  316. }
  317. openFile.crawlerName = 'open-file'
  318. openFile.isAppropriateForArg = function(arg) {
  319. // It is only valid for arguments that end with .json:
  320. return path.extname(arg) === '.json'
  321. }
// Register openFile so the lookup helpers exported by this module can find it.
allCrawlers.openFile = openFile
  323. // Actual module.exports stuff:
  324. Object.assign(module.exports, allCrawlers)
  325. module.exports.getCrawlerByName = function(name) {
  326. return Object.values(allCrawlers).find(fn => fn.crawlerName === name)
  327. }
  328. module.exports.getAllCrawlersForArg = function(arg) {
  329. return Object.values(allCrawlers).filter(fn => fn.isAppropriateForArg(arg))
  330. }