# ArchieBot.py
  1. from selenium import webdriver
  2. from selenium.webdriver.common.keys import Keys
  3. from selenium.webdriver.support.ui import WebDriverWait
  4. from selenium.webdriver.support import expected_conditions as EC
  5. from selenium.webdriver.common.by import By
  6. from selenium.common.exceptions import TimeoutException
  7. from selenium.webdriver.firefox.options import Options
  8. import dateparser
  9. import dateparser.utils.strptime
  10. import datetime
  11. import archiveis
  12. import pytz
  13. import re
  14. utc=pytz.UTC
  15. def login(driver, username, password):
  16. driver.get("http://www.raddle.me/login")
  17. elem = driver.find_element_by_id("login-username")
  18. elem.clear()
  19. elem.send_keys(username)
  20. elem = driver.find_element_by_id("login-password")
  21. elem.clear()
  22. elem.send_keys(password)
  23. elem.send_keys(Keys.RETURN)
  24. wait = WebDriverWait(driver, 30)
  25. wait.until(EC.title_is("Raddle"))
  26. def get_new_posts(driver, link):
  27. driver.get(link)
  28. elements = driver.find_elements_by_css_selector('div.submission-inner')
  29. posts = []
  30. for e in elements:
  31. post = {}
  32. time = e.find_element_by_css_selector("time").get_attribute("datetime")
  33. sub_link = e.find_element_by_css_selector("a.submission-link").get_attribute("href")
  34. comm_link = e.find_element_by_css_selector("nav.submission-nav").find_elements_by_css_selector("a")[0].get_attribute("href")
  35. edit_link = '' # e.find_element_by_css_selector("nav.submission-nav").find_elements_by_css_selector("a")[1].get_attribute("href")
  36. post = {
  37. "sub": sub_link,
  38. "edit": edit_link,
  39. "comm": comm_link,
  40. "time": time,
  41. }
  42. posts.append(post)
  43. return posts
  44. def post_comment(driver, comm_link, comm_text):
  45. driver.get(comm_link)
  46. elem = driver.find_element_by_id("comment_comment")
  47. append_text = comm_text
  48. elem.send_keys(append_text)
  49. #submit change
  50. elem = driver.find_element_by_id("comment_submit")
  51. elem.click()
  52. def check_domain(url):
  53. domains = ["nytimes.com", "wsj.com", "cnn.com", "thetimes.co.uk", "vice.com"
  54. , "newsweek.com", "kyivpost.com", "ft.com", "latimes.com", "nypost.com"
  55. , "telegraph.co.uk", "independent.co.uk", "scmp.com", "nationalpost.com"
  56. , "haaretz.com", "bostonglobe.com", "washingtonpost.com", "theaustralian.com.au"
  57. , "wsj.com", "nytimes.com", "theglobeandmail.com", "theage.com.au", "smh.com.au"
  58. , "www.economist.com", "reuters.com", "afp.com", "rt.com", "huffingtonpost.com"
  59. , "aljazeera.com", "cnbc.com", "chicagotribune.com", "buzzfeed.com"
  60. , "theguardian.com", "reddit.com", "cbc.ca", "bbc.co.uk", "cnet.com"
  61. , "bloomberg.com", "bbc.com", "suntimes.com", "foxnews.com", "jpost.com"
  62. , "voat.co" ]
  63. for d in domains:
  64. match = re.search(d, url)
  65. if match and match.group() == d:
  66. return True #archive it
  67. return False #dont archive it
  68. def check_whitelist(url):
  69. domains = ["raddle.me", "coinsh.red", "youtube.com", "youtu.be" ]
  70. for d in domains:
  71. match = re.search(d, url)
  72. if match and match.group() == d:
  73. return False #don't archive it
  74. return True #archive it
  75. def append_to_body(driver, edit_link, append_text):
  76. driver.get(edit_link)
  77. elem = driver.find_element_by_id("submission_body")
  78. append_text = '\n \n' + append_text
  79. elem.send_keys(append_text)
  80. #submit change
  81. elem = driver.find_element_by_id("submission_submit")
  82. elem.click()
  83. def main():
  84. opts = Options()
  85. opts.add_argument("--headless")
  86. opts.add_argument("--no-sandbox")
  87. ff_driver = PATH_TO_GECKODRIVER #"driver/geckodriver"
  88. driver = webdriver.Firefox(firefox_options=opts, executable_path=ff_driver)
  89. login(driver, "Archie", "******")
  90. posts = get_new_posts(driver, "https://raddle.me/all/new/")
  91. if len(posts) > 0:
  92. date = posts[0]["time"] # output the first (most recent) time
  93. time_file = PATH_TO_time.txt # "/time.txt"
  94. with open(time_file, 'r') as the_file:
  95. data = the_file.read()
  96. the_file.close
  97. with open(time_file, 'w') as the_file:
  98. the_file.write(date)
  99. the_file.close
  100. log_time = (dateparser.parse(data)).replace(tzinfo=utc)
  101. for p in posts:
  102. p_time = (dateparser.parse(p["time"])).replace(tzinfo=utc)
  103. p_url = p["sub"]
  104. if p_time > log_time and check_whitelist(p_url): #if time after last logged time & url is in archive list
  105. print("output this", p["time"])
  106. archive_url = archiveis.capture(p_url) #returns http link
  107. archive_url = archive_url.replace("http://","https://")
  108. post_comment(driver, p["comm"], archive_url)
  109. driver.close()
  110. if __name__ == "__main__":
  111. main()