1234567891011121314151617181920212223242526272829303132333435 |
- from monitor import Monitor
- from threadinfo import ThreadInfo
- from api import ChanAPI
- from utils import getThreadList
- from utils import getThreadObjects
- from utils import getComments
- from bs4 import BeautifulSoup
- import pickle
- from sys import argv
- import sys
- import re
- '''
- Download all 4chan posts and parse them for links
- '''
- # Everything below this line needs to be cleaned
# Matches http(s) URLs that have a "/" after the host part. The pattern text is
# unchanged from the original; it is now a raw string so the backslashes reach
# the regex engine without relying on Python's handling of invalid escapes.
LINK_PATTERN = re.compile(
    r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\/[-a-zA-Z0-9@:%._\+~#=\/\?]*"
)


def extract_links(comment):
    """Return every URL found in one comment's HTML, in order of appearance.

    ``<wbr>`` tags are stripped so that a URL containing one is matched whole,
    and ``<br>`` tags become newlines, which end a URL match.

    Args:
        comment: The comment body as a string (may be empty).

    Returns:
        A list of matched URL strings; empty when the comment has no links.
    """
    text = comment.replace("<wbr>", "").replace("<br>", "\n")
    # finditer + group(0): the pattern contains a capturing group, so
    # findall() would return only the optional "www." part of each URL.
    # Scanning the whole text (instead of match()) finds links that do not
    # sit at the very start of the comment, and more than one per comment.
    return [found.group(0) for found in LINK_PATTERN.finditer(text)]


def main():
    """Print every link found in the comments of all threads on a board.

    The board name is taken from the first command-line argument; the script
    exits with a usage message when it is missing.
    """
    if len(argv) < 2:
        sys.exit("usage: {} <board>".format(argv[0]))
    board = argv[1]

    fchan = ChanAPI(board)
    thread_ids = getThreadList(fchan.getCatalog())
    for thread_id in thread_ids:
        for comment in getComments(fchan.getThread(thread_id)):
            for link in extract_links(comment):
                print(link)


if __name__ == "__main__":
    main()
|