# Third-party dependencies: selenium (browser automation), dateparser
# (timestamp parsing), archiveis (archive.today capture client), pytz
# (timezone objects).  Grouped stdlib / third-party per PEP 8.
import datetime
import re

import archiveis
import dateparser
# NOTE(review): importing the submodule explicitly looks like a workaround
# for dateparser's lazy strptime loading — confirm it is still needed.
import dateparser.utils.strptime
import pytz
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared UTC tzinfo used to normalise every parsed timestamp before comparing.
utc = pytz.UTC
def login(driver, username, password):
    """Log in to raddle.me with the given credentials.

    Fills the username and password fields, submits the form, then blocks
    (up to 30 s) until the page title becomes "Raddle", i.e. login finished.
    """
    driver.get("http://www.raddle.me/login")
    # Fill each form field in turn, clearing any prefilled text first.
    for field_id, value in (("login-username", username),
                            ("login-password", password)):
        field = driver.find_element_by_id(field_id)
        field.clear()
        field.send_keys(value)
    # Submit by pressing Enter in the last field filled (the password box).
    field.send_keys(Keys.RETURN)
    # Wait for the post-login landing page before returning.
    WebDriverWait(driver, 30).until(EC.title_is("Raddle"))
def get_new_posts(driver, link):
    """Scrape the submission listing at *link*.

    Returns a list of dicts, one per submission, each with keys:
      "sub"  - the submission's target URL,
      "edit" - edit-page URL (currently disabled, always ""),
      "comm" - the comments-page URL (first link in the submission nav),
      "time" - the post's timestamp string (the <time> element's
               ``datetime`` attribute).
    """
    driver.get(link)
    posts = []
    for element in driver.find_elements_by_css_selector('div.submission-inner'):
        nav_links = (element.find_element_by_css_selector("nav.submission-nav")
                            .find_elements_by_css_selector("a"))
        # Build the record in one expression (the original created an empty
        # dict that was immediately overwritten - dead assignment removed).
        posts.append({
            "sub": element.find_element_by_css_selector(
                "a.submission-link").get_attribute("href"),
            # Edit link deliberately disabled (was nav_links[1]); the bot now
            # posts comments instead of editing submission bodies.
            "edit": '',
            "comm": nav_links[0].get_attribute("href"),
            "time": element.find_element_by_css_selector(
                "time").get_attribute("datetime"),
        })
    return posts
def post_comment(driver, comm_link, comm_text):
    """Open the comments page at *comm_link* and submit *comm_text* as a
    new top-level comment.

    (Removed the pointless ``append_text = comm_text`` alias from the
    original - it was a copy/paste leftover from append_to_body.)
    """
    driver.get(comm_link)
    # Type the comment into the comment box, then submit the form.
    driver.find_element_by_id("comment_comment").send_keys(comm_text)
    driver.find_element_by_id("comment_submit").click()
def check_domain(url):
    """Return True when *url* contains one of the (typically paywalled)
    news domains that should be archived, False otherwise.

    Matching is a plain substring test, so subdomains and full article
    URLs match too (e.g. "https://www.nytimes.com/2020/...").
    """
    # NOTE: the original used re.search(d, url) with unescaped patterns, so
    # "." acted as a single-char wildcard, and only the FIRST regex hit was
    # compared to the literal domain - a URL where a wildcard hit preceded
    # the literal occurrence was wrongly rejected.  A literal substring test
    # is both simpler and correct.  Duplicate entries removed.
    domains = (
        "nytimes.com", "wsj.com", "cnn.com", "thetimes.co.uk", "vice.com",
        "newsweek.com", "kyivpost.com", "ft.com", "latimes.com", "nypost.com",
        "telegraph.co.uk", "independent.co.uk", "scmp.com", "nationalpost.com",
        "haaretz.com", "bostonglobe.com", "washingtonpost.com",
        "theaustralian.com.au", "theglobeandmail.com", "theage.com.au",
        "smh.com.au", "www.economist.com", "reuters.com", "afp.com", "rt.com",
        "huffingtonpost.com", "aljazeera.com", "cnbc.com",
        "chicagotribune.com", "buzzfeed.com", "theguardian.com", "reddit.com",
        "cbc.ca", "bbc.co.uk", "cnet.com", "bloomberg.com", "bbc.com",
        "suntimes.com", "foxnews.com", "jpost.com", "voat.co",
    )
    return any(d in url for d in domains)
def check_whitelist(url):
    """Return False when *url* contains a whitelisted domain that must NOT
    be archived (our own site plus media hosts), True otherwise.

    Uses a plain substring test: the original's re.search with unescaped
    patterns treated "." as a wildcard and compared only the first regex
    hit to the literal domain (same defect as check_domain).
    """
    domains = ("raddle.me", "coinsh.red", "youtube.com", "youtu.be")
    return not any(d in url for d in domains)
def append_to_body(driver, edit_link, append_text):
    """Open the submission edit page at *edit_link*, append *append_text*
    (preceded by a near-blank separator line) to the submission body, and
    save the change."""
    driver.get(edit_link)
    body_field = driver.find_element_by_id("submission_body")
    # Separate the appended text from the existing body.
    body_field.send_keys('\n \n' + append_text)
    # Submit the edit form.
    driver.find_element_by_id("submission_submit").click()
def main():
    """Poll raddle.me/all/new and post an archive link as a comment on every
    post that is newer than the last run and not on the no-archive whitelist.

    The timestamp of the newest post seen is persisted to a file between
    runs so each post is processed at most once.
    """
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    # Placeholder: replace with the actual geckodriver path before running.
    ff_driver = PATH_TO_GECKODRIVER  # e.g. "driver/geckodriver"
    # NOTE(review): the firefox_options= keyword is deprecated in Selenium 4
    # (use options=) - confirm which selenium version this targets.
    driver = webdriver.Firefox(firefox_options=opts, executable_path=ff_driver)
    login(driver, "Archie", "******")

    posts = get_new_posts(driver, "https://raddle.me/all/new/")
    if len(posts) > 0:
        date = posts[0]["time"]  # first entry is the most recent post
        # Placeholder: replace with the actual timestamp-file path.  (As
        # written, PATH_TO_time.txt is an attribute access on an undefined
        # name and will raise NameError until substituted.)
        time_file = PATH_TO_time.txt  # e.g. "/time.txt"
        # Read the previously logged timestamp, then overwrite the file with
        # the newest one.  The original also had bare "the_file.close" lines:
        # missing call parentheses (a no-op) and redundant inside "with".
        with open(time_file, 'r') as the_file:
            data = the_file.read()
        with open(time_file, 'w') as the_file:
            the_file.write(date)
        # Normalise both sides to UTC-aware datetimes before comparing.
        log_time = dateparser.parse(data).replace(tzinfo=utc)

        for p in posts:
            p_time = dateparser.parse(p["time"]).replace(tzinfo=utc)
            p_url = p["sub"]
            # Only posts newer than the last logged time whose URL is not on
            # the no-archive whitelist get archived.
            if p_time > log_time and check_whitelist(p_url):
                print("output this", p["time"])
                archive_url = archiveis.capture(p_url)  # returns an http link
                archive_url = archive_url.replace("http://", "https://")
                post_comment(driver, p["comm"], archive_url)
    # NOTE(review): driver.close() closes the window only; driver.quit()
    # would also terminate the geckodriver process - confirm intent.
    driver.close()


if __name__ == "__main__":
    main()