crawlers.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. const fs = require('fs')
  2. const path = require('path')
  3. const expandHomeDir = require('expand-home-dir')
  4. const fetch = require('node-fetch')
  5. const url = require('url')
  6. const { downloadPlaylistFromOptionValue, promisifyProcess } = require('./general-util')
  7. const { spawn } = require('child_process')
  8. const { orderBy } = require('natural-orderby')
  9. const { promisify } = require('util')
  10. const readDir = promisify(fs.readdir)
  11. const stat = promisify(fs.stat)
// Registry of every crawler in this module. Each value is a function with
// these additional properties:
// * crawlerName: The name of the crawler, such as "crawl-http". Used by
//   getCrawlerByName.
// * isAppropriateForArg: A function returning whether an argument is valid for
//   the crawler. For example, crawlHTTP.isAppropriateForArg returns whether or
//   not the passed argument is a valid URL of the HTTP/HTTPS protocol. Used by
//   getAllCrawlersForArg.
const allCrawlers = {}
  20. /* TODO: Removed cheerio, so crawl-http no longer works.
  21. function crawlHTTP(absURL, opts = {}, internals = {}) {
  22. // Recursively crawls a given URL, following every link to a deeper path and
  23. // recording all links in a tree (in the same format playlists use). Makes
  24. // multiple attempts to download failed paths.
  25. const {
  26. verbose = false,
  27. maxAttempts = 5,
  28. allowedExternalHostRegex = null,
  29. stayInSameDirectory = true,
  30. keepAnyFileType = false,
  31. fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
  32. forceGroupRegex = null,
  33. filterRegex = null
  34. } = opts
  35. if (!internals.attempts) internals.attempts = 0
  36. // TODO: Should absURL initially be added into this array? I'd like to
  37. // re-program this entire crawl function to make more sense - "internal"
  38. // dictionaries aren't quite easy to reason about!
  39. if (!internals.allURLs) internals.allURLs = []
  40. const verboseLog = text => {
  41. if (verbose) {
  42. console.error(text)
  43. }
  44. }
  45. const absURLObj = new url.URL(absURL)
  46. return fetch(absURL)
  47. .then(
  48. res => res.text().then(async text => {
  49. const links = getHTMLLinks(text)
  50. console.log(links)
  51. const items = []
  52. for (const link of links) {
  53. let [ name, href ] = link
  54. if (!href) {
  55. continue
  56. }
  57. // If the name (that's the content inside of <a>..</a>) ends with a
  58. // slash, that's probably just an artifact of a directory lister;
  59. // not actually part of the intended content. So we remove it!
  60. if (name.endsWith('/')) {
  61. name = name.slice(0, -1)
  62. }
  63. name = name.trim()
  64. let base
  65. if (path.extname(absURL)) {
  66. base = path.dirname(absURL) + '/'
  67. console.log('extname:', path.extname(absURL), 'so base:', base)
  68. } else {
  69. base = absURL
  70. }
  71. const urlObj = new url.URL(href, base)
  72. const linkURL = url.format(urlObj)
  73. if (internals.allURLs.includes(linkURL)) {
  74. verboseLog("[Ignored] Already done this URL: " + linkURL)
  75. continue
  76. }
  77. internals.allURLs.push(linkURL)
  78. if (filterRegex && !(filterRegex.test(linkURL))) {
  79. verboseLog("[Ignored] Failed regex: " + linkURL)
  80. continue
  81. }
  82. if (urlObj.host !== absURLObj.host && !(
  83. allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
  84. .test(urlObj.host))) {
  85. verboseLog("[Ignored] Inconsistent host: " + linkURL)
  86. continue
  87. }
  88. if (stayInSameDirectory) sameDir: {
  89. // Don't bother with staying in the same directory if it's on a
  90. // different host.
  91. if (urlObj.host !== absURLObj.host) {
  92. break sameDir
  93. }
  94. const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
  95. if (relative.startsWith('..') || path.isAbsolute(relative)) {
  96. verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
  97. continue
  98. }
  99. }
  100. if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
  101. // It's a directory!
  102. verboseLog("[Dir] " + linkURL)
  103. items.push(await (
  104. crawlHTTP(linkURL, opts, Object.assign({}, internals))
  105. .then(({ items }) => ({name, items}))
  106. ))
  107. } else {
  108. // It's a file!
  109. const extensions = fileTypes.map(t => '.' + t)
  110. if (
  111. !keepAnyFileType &&
  112. !(extensions.includes(path.extname(href)))
  113. ) {
  114. verboseLog("[Ignored] Bad extension: " + linkURL)
  115. continue
  116. }
  117. verboseLog("[File] " + linkURL)
  118. items.push({name, downloaderArg: linkURL})
  119. }
  120. }
  121. return {items}
  122. }),
  123. err => {
  124. console.warn("Failed to download: " + absURL)
  125. if (internals.attempts < maxAttempts) {
  126. console.warn(
  127. `Trying again. Attempt ${internals.attempts + 1}/${maxAttempts}...`
  128. )
  129. return crawlHTTP(absURL, opts, Object.assign({}, internals, {
  130. attempts: internals.attempts + 1
  131. }))
  132. } else {
  133. console.error(
  134. "We've hit the download attempt limit (" + maxAttempts + "). " +
  135. "Giving up on this path."
  136. )
  137. throw 'FAILED_DOWNLOAD'
  138. }
  139. }
  140. )
  141. .catch(error => {
  142. if (error === 'FAILED_DOWNLOAD') {
  143. // Debug logging for this is already handled above.
  144. return []
  145. } else {
  146. throw error
  147. }
  148. })
  149. }
  150. crawlHTTP.crawlerName = 'crawl-http'
  151. crawlHTTP.isAppropriateForArg = function(arg) {
  152. // It is only used for HTTP(S) servers:
  153. if (!(arg.startsWith('http://') || arg.startsWith('https://'))) {
  154. return false
  155. }
  156. // It will definitely only work for valid URLs:
  157. let url
  158. try {
  159. url = new URL(arg)
  160. } catch (error) {
  161. return false
  162. }
  163. // If the URL ends with a .json, it is probably meant to be used for a direct
  164. // playlist download, not to be crawled.
  165. if (path.extname(url.pathname) === '.json') {
  166. return false
  167. }
  168. // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
  169. // for URLs on YouTube:
  170. if (crawlYouTube.isAppropriateForArg(arg)) {
  171. return false
  172. }
  173. return true
  174. }
  175. allCrawlers.crawlHTTP = crawlHTTP
  176. function getHTMLLinks(text) {
  177. // Never parse HTML with a regex!
  178. // const $ = cheerio.load(text)
  179. return $('a').get().map(el => {
  180. const $el = $(el)
  181. return [$el.text(), $el.attr('href')]
  182. })
  183. }
  184. */
  185. function crawlLocal(dirPath, extensions = [
  186. 'ogg', 'oga',
  187. 'wav', 'mp3', 'm4a', 'aac', 'flac', 'opus',
  188. 'mp4', 'mov', 'mkv',
  189. 'mod'
  190. ], isTop = true) {
  191. // If the passed path is a file:// URL, try to decode it:
  192. try {
  193. const url = new URL(dirPath)
  194. if (url.protocol === 'file:') {
  195. dirPath = decodeURIComponent(url.pathname)
  196. }
  197. } catch (error) {
  198. // If it's not a URL, it's (assumedly) an ordinary path ("/path/to/the directory").
  199. // In this case we'll expand any ~ in the path (e.g. ~/Music -> /home/.../Music).
  200. dirPath = expandHomeDir(dirPath)
  201. }
  202. return readDir(dirPath).then(items => {
  203. items = orderBy(items)
  204. return Promise.all(items.map(item => {
  205. const itemPath = path.join(dirPath, item)
  206. const itemURL = url.pathToFileURL(itemPath).href
  207. return stat(itemPath).then(stats => {
  208. if (stats.isDirectory()) {
  209. return crawlLocal(itemPath, extensions, false)
  210. .then(group => Object.assign({name: item, url: itemURL}, group))
  211. } else if (stats.isFile()) {
  212. // Extname returns a string starting with a dot; we don't want the
  213. // dot, so we slice it off of the front.
  214. const ext = path.extname(item).slice(1)
  215. if (extensions.includes(ext)) {
  216. // The name of the track doesn't include the file extension; a user
  217. // probably wouldn't add the file extensions to a hand-written
  218. // playlist, or want them in an auto-generated one.
  219. const basename = path.basename(item, path.extname(item))
  220. return {name: basename, downloaderArg: itemPath, url: itemURL}
  221. } else {
  222. return {name: item, url: itemURL}
  223. }
  224. }
  225. }, statErr => null)
  226. }))
  227. }, err => {
  228. if (err.code === 'ENOENT') {
  229. if (isTop) {
  230. throw 'That directory path does not exist!'
  231. } else {
  232. return []
  233. }
  234. } else if (err.code === 'EACCES') {
  235. if (isTop) {
  236. throw 'You do not have permission to open that directory.'
  237. } else {
  238. return []
  239. }
  240. } else {
  241. throw err
  242. }
  243. }).then(items => items.filter(Boolean))
  244. .then(filteredItems => ({
  245. name: path.basename(dirPath),
  246. items: filteredItems
  247. }))
  248. }
  249. crawlLocal.crawlerName = 'crawl-local'
  250. crawlLocal.isAppropriateForArg = function(arg) {
  251. // When the passed argument is a valid URL, it is only used for file://
  252. // URLs:
  253. try {
  254. const url = new URL(arg)
  255. if (url.protocol !== 'file:') {
  256. return false
  257. }
  258. } catch (error) {}
  259. // If the passed argument ends with .json, it is probably not a directory.
  260. if (path.extname(arg) === '.json') {
  261. return false
  262. }
  263. return true
  264. }
  265. allCrawlers.crawlLocal = crawlLocal
  266. async function crawlYouTube(url) {
  267. const ytdl = spawn('youtube-dl', [
  268. '-j', // Output as JSON
  269. '--flat-playlist',
  270. url
  271. ])
  272. const items = []
  273. ytdl.stdout.on('data', data => {
  274. const lines = data.toString().trim().split('\n')
  275. items.push(...lines.map(JSON.parse))
  276. })
  277. // Pass false so it doesn't show logging.
  278. try {
  279. await promisifyProcess(ytdl, false)
  280. } catch (error) {
  281. // Yeow.
  282. throw 'Youtube-dl failed.'
  283. }
  284. return {
  285. name: 'A YouTube playlist',
  286. items: items.map(item => {
  287. return {
  288. name: item.title,
  289. downloaderArg: 'https://youtube.com/watch?v=' + item.id
  290. }
  291. })
  292. }
  293. }
  294. crawlYouTube.crawlerName = 'crawl-youtube'
  295. crawlYouTube.isAppropriateForArg = function(arg) {
  296. // It is definitely not used for arguments that are not URLs:
  297. let url
  298. try {
  299. url = new URL(arg)
  300. } catch (error) {
  301. return false
  302. }
  303. // It is only used for URLs on the YouTube domain:
  304. if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
  305. return false
  306. }
  307. // It is only used for playlist pages:
  308. if (url.pathname !== '/playlist') {
  309. return false
  310. }
  311. return true
  312. }
  313. allCrawlers.crawlYouTube = crawlYouTube
  314. async function openFile(input) {
  315. return JSON.parse(await downloadPlaylistFromOptionValue(input))
  316. }
  317. openFile.crawlerName = 'open-file'
  318. openFile.isAppropriateForArg = function(arg) {
  319. // It is only valid for arguments that end with .json:
  320. return path.extname(arg) === '.json'
  321. }
// Register openFile so the lookup helpers exported by this module can find it.
allCrawlers.openFile = openFile
  323. // Actual module.exports stuff:
  324. Object.assign(module.exports, allCrawlers)
  325. module.exports.getCrawlerByName = function(name) {
  326. return Object.values(allCrawlers).find(fn => fn.crawlerName === name)
  327. }
  328. module.exports.getAllCrawlersForArg = function(arg) {
  329. return Object.values(allCrawlers).filter(fn => fn.isAppropriateForArg(arg))
  330. }