main.py 971 B

1234567891011121314151617181920212223242526272829303132333435
  1. from monitor import Monitor
  2. from threadinfo import ThreadInfo
  3. from api import ChanAPI
  4. from utils import getThreadList
  5. from utils import getThreadObjects
  6. from utils import getComments
  7. from bs4 import BeautifulSoup
  8. import pickle
  9. from sys import argv
  10. import sys
  11. import re
  12. '''
  13. Download all 4chan posts and parse them for links
  14. '''
  15. # Everything below this line needs to be cleaned
  16. board = argv[1:][0]
  17. fchan = ChanAPI(board)
  18. catalog = fchan.getCatalog()
  19. threadIds = getThreadList(catalog)
  20. prog=re.compile("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\/[-a-zA-Z0-9@:%._\+~#=\/\?]*")
  21. threadComments = []
  22. for thread in threadIds:
  23. threadData = fchan.getThread(thread)
  24. comments = getComments( threadData )
  25. for comment in comments:
  26. comment=comment.replace("<wbr>","")
  27. comment=comment.replace("<br>","\n")
  28. result = prog.match(comment)
  29. if(result):
  30. tempstr=result.group(0)
  31. print(tempstr)