- import sys
- import subprocess
- import json
- import re
- # TODO verify that links containing kanji are processed correctly!!!
- # TODO make content-isolating find() targets more accurate
- # EXIT CODES:
- # 0 : success
- # 101-108 : wget error (1-8)
- # 211 : one failed item
- # 212 : two to nine failed items
- # 213 : ten or more failed items
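- #
- # Overall flow: fetch the RecentChanges feed with wget, classify each entry
- # (upload log, article, thread or comment), pull supported media links
- # (YouTube, Vimeo, Dailymotion, SoundCloud) out of the relevant page, and
- # dump the collected link -> source-page pairs to links.json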
- linkPairs = {}
- linksToRequest = 100
- fails = 0
- # recent changes feed to scrape
- recentURL = "/wiki/Special:RecentChanges?hideenhanced=1&days=90&limit=" + str(linksToRequest)
- lostMediaDomain = "https://lostmediaarchive.fandom.com"
- ###
- ### Regex patterns for recognising links
- ###
- # recognise items in RecentChanges feed
- itemRegex = re.compile('<li class="mw-line-')
- # note: youtube-dl does a great job of processing IDs and handling the '&' and '?' URL parameters which sometimes denote the playlist a video is in;
- # because of this, no attempt is made to remove or parse them in this script
- nonSeparatorCharacter = "[^\s\"'`()[\]{}!.,;<>|~%*\\\\+]"
- # File page detection regex (currently unused)
- fileRegex = re.compile("https?://lostmediaarchive.wikia.com/wiki/File:.*")
- # the (?:youtu.be)? component is to allow theoretically possible URLs such as: https://www.youtube.com/watch?v=SORO_rZiy9E&feature=youtu.be&list=PL2vA6ImrBrtctt-hj1mtvqypVpEKjqdEg ,
- # since '.' is considered a separating character, such a link would otherwise be interrupted before the playlist ID, so only the single video would be archived.
- youtubeNormalRegex = re.compile("youtube\.com/(?:" + nonSeparatorCharacter + "(?:youtu.be)?)+", re.IGNORECASE)
- youtubeShortenedRegex = re.compile("youtu\.be/(?:" + nonSeparatorCharacter + "(?:youtu.be)?)+", re.IGNORECASE)
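- # bare playlist IDs ('PL' followed by 32 ID characters), matched independently of the full-URL patterns above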
- youtubePlaylistRegex = re.compile("PL[0-9A-Za-z_-]{32}")
- # assumption: valid characters for vimeo channels, video and album IDs are a subset of the valid characters for a youtube ID
- # basis: video and album IDs are numeric, channel URLs containing the separator characters were all unavailable
- vimeoRegex = re.compile("vimeo\.com/" + nonSeparatorCharacter + "+", re.IGNORECASE)
- # assumption: valid characters for dailymotion channels, video and album IDs are a subset of the valid characters for a youtube ID
- # basis: none
- dailymotionNormalRegex = re.compile("dailymotion\.com/" + nonSeparatorCharacter + "+", re.IGNORECASE)
- dailymotionShortenedRegex = re.compile("dai\.ly/" + nonSeparatorCharacter + "+", re.IGNORECASE)
- # assumption: valid characters for soundcloud channels, video and album IDs are a subset of the valid characters for a youtube ID
- # basis: song titles may only contain letters, numbers, hyphens or underscores; assuming the same applies to account URLs
- soundcloudRegex = re.compile("soundcloud\.com/" + nonSeparatorCharacter + "+", re.IGNORECASE)
- def wget_fetch(url, ID):
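- # fetch lostMediaDomain + url with wget; ID is only used to label error messages.
- # returns the page body as text, or False when wget exits with code 8 (server
- # error response) so the caller can skip the item; any other non-zero wget exit
- # code aborts the whole run with exit status 100 + code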
- # download the page to wget's stdout and capture it
- proc = subprocess.run(["wget", "-O-" , lostMediaDomain + url], stdout=subprocess.PIPE)
-
- # if wget returned an error code, exit with code 101-108 corresponding to wget error code
- if (proc.returncode != 0):
- print("ERROR: " + ID + " wget returned " + str(proc.returncode) + " for " + lostMediaDomain + url + ".")
- if(proc.returncode != 8):
- print("Aborting.")
- exit(100 + proc.returncode)
- else:
- return False
- return proc.stdout.decode("utf-8")
- def find_and_add_links(url, content):
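- # scan page content for supported media links and record each one in linkPairs,
- # keyed by the media link with the source wiki page URL as the value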
- itemLinksList = []
-
- # substitute XML predefined entities which may interrupt links
- # quantifier is to fix double-encodings such as "&amp;amp;"
- content = re.sub('&(amp;)+','&',content)
- content = re.sub('&lt;','<',content)
- content = re.sub('&gt;','>',content)
- content = re.sub('&quot;','"',content)
- content = re.sub('&apos;','\'',content)
- content = re.sub('&#39;','\'',content)
-
- # remove all tags which may interrupt links
- # TODO BUG does this commented line remove all hrefs??? horribly destructive!
- # not much more computing power to just check both with and without
- #content = re.sub('<[^>]*>','',content)
- content += re.sub('<[^>]*>','',content)
-
- # use compiled regex to catch media links
-
- # TODO detect and fix or add warning when limitations are triggered
- # limitation: malformed urls over multiple lines aren't matched. not fixed as removing newlines can corrupt actual links at the end of a line
- # limitation: non-separated links such as "youtube.com/watch?aaaaaaaaaaayoutube.com/watch?bbbbbbbbbbb"
- # will be misinterpreted as "youtube.com/watch?aaaaaaaaaaayoutube" only
-
- itemLinksList += re.findall(youtubeNormalRegex, content) # youtube.com/...
- itemLinksList += re.findall(youtubeShortenedRegex, content) # youtu.be/...
- itemLinksList += re.findall(vimeoRegex, content) # vimeo.com/...
- itemLinksList += re.findall(dailymotionNormalRegex, content) # dailymotion.com/...
- itemLinksList += re.findall(dailymotionShortenedRegex, content) # dai.ly/...
- itemLinksList += re.findall(soundcloudRegex, content) # soundcloud.com/...
- itemLinksList += re.findall(youtubePlaylistRegex, content) # PL#######...
- # TODO find potential vIDs, solves above issue.
- for link in itemLinksList:
- linkPairs[link] = lostMediaDomain + url
- def find_and_add_upload_file(url, content):
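- # look for a video embedded in an uploaded file's page: a bare "videoId" value
- # first, then Vimeo/Dailymotion player sources; record any match in linkPairs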
- # try and find an embedded video link
- # note: could theoretically capture videos from other sites not specified here
- # and falsely treat them as youtube links
- videoId = re.search('"videoId":"([^"]*)"', filePage)
- if (videoId != None):
- fileURL = str(videoId.group(1))
- else:
- # try to find an embedded vimeo link
- videoId = re.search('"src="[^"]*(vimeo.com/[^"]+)"', filePage)
- # try to find an embedded dailymotion link
- if (videoId == None):
- videoId = re.search('"src="[^"]*(dailymotion.com/[^"]+)"', filePage)
- if (videoId == None):
- videoId = re.search('"src="[^"]*(dai\.ly/[^"]+)"', filePage)
- if (videoId != None):
- # extract the link from the regex match
- fileURL = str(videoId.group(1))
-
- # TODO add support for audio files: http://community.wikia.com/wiki/Help:Audio
-
- # if an embedded video link found, add it to list of links
- if (videoId != None):
- linkPairs[fileURL] = lostMediaDomain + url
- else:
- print("No video link found in file '" + url + "'.")
- def process_article(url):
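- # fetch an article page, trim it to the main content area and harvest links from it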
- contentPage = wget_fetch(url, "content")
- if not contentPage:
- print("error getting content " + url)
- global fails
- fails += 1
- return
- # isolate general content so advertisements aren't included
- contentStart = contentPage.find("WikiaMainContent")
- contentEnd = contentPage.find("NewPP limit report")
- content = contentPage[contentStart:contentEnd]
- find_and_add_links(url, content)
- def process_comment(url):
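- # url here is the page-history RSS feed for a comment (see the @comment branch below)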
- contentPage = wget_fetch(url, "comment")
- if not contentPage:
- print("error getting content " + url)
- global fails
- fails += 1
- return
- find_and_add_links(url, contentPage)
- def process_thread(url):
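- # fetch a forum/wall thread page, trim it to the thread body and harvest links from it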
- contentPage = wget_fetch(url, "content")
- if not contentPage:
- print("error getting content " + url)
- global fails
- fails += 1
- return
- # isolate general content so advertisements aren't included
- contentStart = contentPage.find('<div class="Wall Thread" id="Wall">')
- contentEnd = contentPage.find("<!-- WikiaMainContent -->")
- content = contentPage[contentStart:contentEnd]
- find_and_add_links(url, content)
- def end_crawl():
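- # print summary statistics, write the collected links to links.json and exit;
- # the exit status encodes how many items failed (see the exit code table above)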
- # print statistics
- print(str(content_count) + " entries had a content link")
- print(str(upload_count) + " entries were upload logs")
- print(str(ignore_count) + " entries were ignored log types")
- print(str(none_count) + " entries had no link")
- print(str(len(linkPairs)) + " links recognised.")
-
- # dump list of links to a .json file
- try:
- with open("links.json", 'w') as outFile:
- json.dump(linkPairs, outFile)
- except:
- print("ERROR: Unexpected error opening 'links.json':", sys.exc_info()[0])
- raise
- if fails > 0:
- print("failure*" + str(fails))
- if fails == 1:
- exit(211)
- if fails < 10:
- exit(212)
- exit(213)
-
- # exit without error
- exit(0)
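- ###
- ### Main crawl
- ###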
- last = ""
- # read file listing the last entry processed to prevent redundant checks
- try:
- with open("last", "r") as lastFileR:
- last = lastFileR.read()
- except FileNotFoundError:
- # first run: no 'last' file exists yet, so nothing has been processed before
- pass
- except:
- print("ERROR: Unexpected error opening 'last' for read:", sys.exc_info()[0])
- raise
- # download recent changes page
- recent = wget_fetch(recentURL, "recent")
- # abort if the feed itself could not be fetched (wget error 8 makes wget_fetch return False)
- if not recent:
- print("ERROR: could not fetch the RecentChanges feed. Aborting.")
- exit(108)
- #TODO information leak on error?
- #TODO catch exceptions!!!
- # process downloaded page
- firstRun = True
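- # per-category entry counters reported by end_crawl()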
- ignore_count = 0
- upload_count = 0
- content_count = 0
- none_count = 0
- # get all lines which contain a recent change entry
- for line in recent.splitlines():
- hasItem = re.search(itemRegex, line)
- if hasItem:
- # capture the content page URL if one exists
- reg = re.search(' \. \. .*?href="([^"]*)"[^>]*>[^>]*</a>', line)
- if (reg):
- # remove the unpredictable elements (odd-even prefix, tab index) from line when checking
- check = line[line.find(" | "):]
- # if this exact entry was checked last run, end the run early to
- # avoid redundant requests
- if check == last:
- print("ending crawl early: already checked " + line[:256] + "...")
- end_crawl()
- # if this is the first entry this run, store as the last entry that
- # needs to be checked next run
- if (firstRun):
- try:
- with open("last", "w") as lastFileW:
- lastFileW.write(check)
- except:
- print("ERROR: Unexpected error opening 'last' for write:", sys.exc_info()[0])
- raise
- firstRun = False
- avatar = re.search('\(<a href="/wiki/Special:Log/useravatar" title="Special:Log/useravatar">User avatar log</a>\)', line)
- block = re.search('\(<a href="/wiki/Special:Log/block" title="Special:Log/block">Block log</a>\)', line)
- move = re.search('\(<a href="/wiki/Special:Log/move" title="Special:Log/move">Move log</a>\).*</a> to <a .*href="([^"]*)"', line)
- upload = re.search('\(<a href="/wiki/Special:Log/upload" title="Special:Log/upload">Upload log</a>\).* uploaded "<a href="(/wiki/File:[^"]*)"', line)
- # if this entry is an avatar, block or move log, ignore
- if avatar or block or move:
- ignore_count += 1
- # if this entry is an upload log, check for and capture uploaded file
- elif upload:
- upload_count += 1
- url = str(upload.group(1))
- # download uploaded file page
- filePage = wget_fetch(url, "upload")
- find_and_add_upload_file(url, filePage)
- #TODO isolate actual description before passing into function
- find_and_add_links(url, filePage)
-
- # else use the captured target page
- else:
- content_count += 1
- url = str(reg.group(1))
- # if needed, get the link to the whole article or talk page instead of
- # only a single comment or revision
- a = url.find("/wiki/")
- if(a > 0):
- url = url[a:]
- print(url)
- if url.find("/Thread:") > 0:
- process_thread(url)
- elif url.find("/@comment-") > 0:
- # capture this entry's hist page URL
- hist = re.search('\| <a href="([^"]*)".*?>hist</a>', line).group(1)
- # convert & to & and get this page as an RSS feed
- hist = re.sub('&(amp;)+','&', hist)
- hist += "&feed=rss"
- process_comment(hist)
- else:
- process_article(url)
-
- else:
- none_count += 1
- print(line)
- end_crawl()