// crawlers.js

import {spawn} from 'node:child_process'
import {readdir, stat} from 'node:fs/promises'
import url from 'node:url'
import path from 'node:path'

import {orderBy} from 'natural-orderby'
import expandHomeDir from 'expand-home-dir'
// import fetch from 'node-fetch'

import {downloadPlaylistFromOptionValue, promisifyProcess} from './general-util.js'

export const musicExtensions = [
  'ogg', 'oga',
  'wav', 'mp3', 'm4a', 'aac', 'flac', 'opus',
  'mp4', 'mov', 'mkv',
  'mod'
]

export const skipNames = [
  '.DS_Store',
  '.git',
]

// Each value stored in allCrawlers is a function with these additional
// properties:
// * crawlerName: The name of the crawler, such as "crawl-http". Used by
//   getCrawlerByName.
// * isAppropriateForArg: A function returning whether an argument is valid
//   for the crawler. For example, crawlHTTP.isAppropriateForArg returns
//   whether or not the passed argument is a valid URL of the HTTP/HTTPS
//   protocol. Used by getAllCrawlersForArg.
const allCrawlers = {}
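
// As an illustrative sketch (not a crawler this module registers), a value
// stored in allCrawlers follows this contract:
//
//   async function crawlExample(arg) {
//     return {name: 'Example', items: []}
//   }
//   crawlExample.crawlerName = 'crawl-example'
//   crawlExample.isAppropriateForArg = arg => arg.startsWith('example://')
//   allCrawlers.crawlExample = crawlExample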

/* TODO: Removed cheerio, so crawl-http no longer works.
export function crawlHTTP(absURL, opts = {}, internals = {}) {
  // Recursively crawls a given URL, following every link to a deeper path and
  // recording all links in a tree (in the same format playlists use). Makes
  // multiple attempts to download failed paths.

  const {
    verbose = false,
    maxAttempts = 5,
    allowedExternalHostRegex = null,
    stayInSameDirectory = true,
    keepAnyFileType = false,
    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
    forceGroupRegex = null,
    filterRegex = null
  } = opts

  if (!internals.attempts) internals.attempts = 0

  // TODO: Should absURL initially be added into this array? I'd like to
  // re-program this entire crawl function to make more sense - "internal"
  // dictionaries aren't quite easy to reason about!
  if (!internals.allURLs) internals.allURLs = []

  const verboseLog = text => {
    if (verbose) {
      console.error(text)
    }
  }

  const absURLObj = new url.URL(absURL)

  return fetch(absURL)
    .then(
      res => res.text().then(async text => {
        const links = getHTMLLinks(text)
        console.log(links)

        const items = []

        for (const link of links) {
          let [ name, href ] = link

          if (!href) {
            continue
          }

          // If the name (that's the content inside of <a>..</a>) ends with a
          // slash, that's probably just an artifact of a directory lister;
          // not actually part of the intended content. So we remove it!
          if (name.endsWith('/')) {
            name = name.slice(0, -1)
          }

          name = name.trim()

          let base
          if (path.extname(absURL)) {
            base = path.dirname(absURL) + '/'
            console.log('extname:', path.extname(absURL), 'so base:', base)
          } else {
            base = absURL
          }

          const urlObj = new url.URL(href, base)
          const linkURL = url.format(urlObj)

          if (internals.allURLs.includes(linkURL)) {
            verboseLog("[Ignored] Already done this URL: " + linkURL)
            continue
          }

          internals.allURLs.push(linkURL)

          if (filterRegex && !(filterRegex.test(linkURL))) {
            verboseLog("[Ignored] Failed regex: " + linkURL)
            continue
          }

          if (urlObj.host !== absURLObj.host && !(
            allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
              .test(urlObj.host))) {
            verboseLog("[Ignored] Inconsistent host: " + linkURL)
            continue
          }

          if (stayInSameDirectory) sameDir: {
            // Don't bother with staying in the same directory if it's on a
            // different host.
            if (urlObj.host !== absURLObj.host) {
              break sameDir
            }

            const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
            if (relative.startsWith('..') || path.isAbsolute(relative)) {
              verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
              continue
            }
          }

          if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
            // It's a directory!
            verboseLog("[Dir] " + linkURL)
            items.push(await (
              crawlHTTP(linkURL, opts, Object.assign({}, internals))
                .then(({ items }) => ({name, items}))
            ))
          } else {
            // It's a file!
            const extensions = fileTypes.map(t => '.' + t)

            if (
              !keepAnyFileType &&
              !(extensions.includes(path.extname(href)))
            ) {
              verboseLog("[Ignored] Bad extension: " + linkURL)
              continue
            }

            verboseLog("[File] " + linkURL)
            items.push({name, downloaderArg: linkURL})
          }
        }

        return {items}
      }),

      err => {
        console.warn("Failed to download: " + absURL)
        if (internals.attempts < maxAttempts) {
          console.warn(
            `Trying again. Attempt ${internals.attempts + 1}/${maxAttempts}...`
          )
          return crawlHTTP(absURL, opts, Object.assign({}, internals, {
            attempts: internals.attempts + 1
          }))
        } else {
          console.error(
            "We've hit the download attempt limit (" + maxAttempts + "). " +
            "Giving up on this path."
          )
          throw 'FAILED_DOWNLOAD'
        }
      }
    )
    .catch(error => {
      if (error === 'FAILED_DOWNLOAD') {
        // Debug logging for this is already handled above.
        return []
      } else {
        throw error
      }
    })
}

crawlHTTP.crawlerName = 'crawl-http'

crawlHTTP.isAppropriateForArg = function(arg) {
  // It is only used for HTTP(S) servers:
  if (!(arg.startsWith('http://') || arg.startsWith('https://'))) {
    return false
  }

  // It will definitely only work for valid URLs:
  let url
  try {
    url = new URL(arg)
  } catch (error) {
    return false
  }

  // If the URL ends with a .json, it is probably meant to be used for a direct
  // playlist download, not to be crawled.
  if (path.extname(url.pathname) === '.json') {
    return false
  }

  // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
  // for URLs on YouTube:
  if (crawlYouTube.isAppropriateForArg(arg)) {
    return false
  }

  return true
}

allCrawlers.crawlHTTP = crawlHTTP

function getHTMLLinks(text) {
  // Never parse HTML with a regex!
  // const $ = cheerio.load(text)
  return $('a').get().map(el => {
    const $el = $(el)
    return [$el.text(), $el.attr('href')]
  })
}
*/

function crawlLocal(dirPath, extensions = musicExtensions, isTop = true) {
  // If the passed path is a file:// URL, try to decode it:
  try {
    const url = new URL(dirPath)
    if (url.protocol === 'file:') {
      dirPath = decodeURIComponent(url.pathname)
    }
  } catch (error) {
    // If it's not a URL, it's (assumedly) an ordinary path
    // ("/path/to/the directory"). In this case we'll expand any ~ in the path
    // (e.g. ~/Music -> /home/.../Music).
    dirPath = expandHomeDir(dirPath)
  }
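
  // Note: constructing a URL from a plain path like "/path/to/the directory"
  // throws, which is what routes ordinary paths into the catch branch above.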

  return readdir(dirPath).then(items => {
    items = orderBy(items)

    return Promise.all(items.map(item => {
      // There are a few files which are just never what we're looking for.
      // We skip including or searching under these altogether.
      if (skipNames.includes(item)) {
        return null
      }

      const itemPath = path.join(dirPath, item)
      const itemURL = url.pathToFileURL(itemPath).href

      return stat(itemPath).then(stats => {
        if (stats.isDirectory()) {
          return crawlLocal(itemPath, extensions, false)
            .then(group => Object.assign({name: item, url: itemURL}, group))
        } else if (stats.isFile()) {
          // Extname returns a string starting with a dot; we don't want the
          // dot, so we slice it off of the front.
          const ext = path.extname(item).slice(1)

          if (extensions.includes(ext)) {
            // The name of the track doesn't include the file extension; a user
            // probably wouldn't add the file extensions to a hand-written
            // playlist, or want them in an auto-generated one.
            const basename = path.basename(item, path.extname(item))
            return {name: basename, downloaderArg: itemPath, url: itemURL}
          } else {
            return {name: item, url: itemURL}
          }
        }
      }, _statErr => null)
    }))
  }, err => {
    if (err.code === 'ENOENT') {
      if (isTop) {
        throw 'That directory path does not exist!'
      } else {
        return []
      }
    } else if (err.code === 'EACCES') {
      if (isTop) {
        throw 'You do not have permission to open that directory.'
      } else {
        return []
      }
    } else {
      throw err
    }
  }).then(items => items.filter(Boolean))
    .then(filteredItems => ({
      name: path.basename(dirPath),
      items: filteredItems
    }))
}
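
// As a hypothetical example, crawling '~/Music' might resolve to a playlist
// tree shaped like this (paths invented for illustration):
//
//   {name: 'Music', items: [
//     {name: 'Some Album',
//      url: 'file:///home/user/Music/Some%20Album',
//      items: [
//        {name: 'Track 1',
//         downloaderArg: '/home/user/Music/Some Album/Track 1.mp3',
//         url: 'file:///home/user/Music/Some%20Album/Track%201.mp3'}
//      ]}
//   ]}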

crawlLocal.crawlerName = 'crawl-local'

crawlLocal.isAppropriateForArg = function(arg) {
  // When the passed argument is a valid URL, it is only used for file://
  // URLs:
  try {
    const url = new URL(arg)
    if (url.protocol !== 'file:') {
      return false
    }
  } catch (error) {}

  // If the passed argument ends with .json, it is probably not a directory.
  if (path.extname(arg) === '.json') {
    return false
  }

  return true
}

allCrawlers.crawlLocal = crawlLocal
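
// With -j and --flat-playlist, youtube-dl prints one JSON object per playlist
// entry, one object per line, roughly {"id": "...", "title": "...", ...}
// (fields abbreviated here); crawlYouTube only relies on id and title.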
export async function crawlYouTube(url) {
  const ytdl = spawn('youtube-dl', [
    '-j', // Output as JSON
    '--flat-playlist',
    url
  ])

  const items = []
  let buffer = ''
  ytdl.stdout.on('data', data => {
    // A chunk isn't guaranteed to end on a line boundary, so parse only
    // complete lines, carrying any trailing partial line into the next chunk.
    buffer += data.toString()
    const lines = buffer.split('\n')
    buffer = lines.pop()
    items.push(...lines.filter(Boolean).map(line => JSON.parse(line)))
  })

  // Pass false so it doesn't show logging.
  try {
    await promisifyProcess(ytdl, false)
  } catch (error) {
    // Yeow.
    throw 'Youtube-dl failed.'
  }

  // If the output didn't end with a newline, the last line is still buffered.
  if (buffer.trim()) {
    items.push(JSON.parse(buffer))
  }

  return {
    name: 'A YouTube playlist',
    items: items.map(item => {
      return {
        name: item.title,
        downloaderArg: 'https://youtube.com/watch?v=' + item.id
      }
    })
  }
}

crawlYouTube.crawlerName = 'crawl-youtube'

crawlYouTube.isAppropriateForArg = function(arg) {
  // It is definitely not used for arguments that are not URLs:
  let url
  try {
    url = new URL(arg)
  } catch (error) {
    return false
  }

  // It is only used for URLs on the YouTube domain:
  if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
    return false
  }

  // It is only used for playlist pages:
  if (url.pathname !== '/playlist') {
    return false
  }

  return true
}

allCrawlers.crawlYouTube = crawlYouTube
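
// openFile loads an existing playlist file rather than crawling anything.
// downloadPlaylistFromOptionValue (from general-util.js) is assumed here to
// resolve its argument, whether a local path or a URL, to the playlist text.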
export async function openFile(input) {
  return JSON.parse(await downloadPlaylistFromOptionValue(input))
}

openFile.crawlerName = 'open-file'

openFile.isAppropriateForArg = function(arg) {
  // It is only valid for arguments that end with .json:
  return path.extname(arg) === '.json'
}

allCrawlers.openFile = openFile

export function getCrawlerByName(name) {
  return Object.values(allCrawlers).find(fn => fn.crawlerName === name)
}

export function getAllCrawlersForArg(arg) {
  return Object.values(allCrawlers).filter(fn => fn.isAppropriateForArg(arg))
}
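
// Illustrative usage of the two lookups above (the argument is hypothetical):
//
//   const arg = '/home/user/Music'
//   const crawl = getCrawlerByName('crawl-local') || getAllCrawlersForArg(arg)[0]
//   const playlist = await crawl(arg)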