# Third-party dependencies: selenium (browser automation), dateparser
# (timestamp parsing), archiveis (archive.today capture client), pytz
# (timezone objects).  Grouped stdlib / third-party per PEP 8.
import datetime
import re

import archiveis
import dateparser
# NOTE(review): importing the submodule explicitly looks like a workaround
# for dateparser's lazy strptime loading — confirm it is still needed.
import dateparser.utils.strptime
import pytz
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared UTC tzinfo used to normalise every parsed timestamp before comparing.
utc = pytz.UTC
def login(driver, username, password):
    """Log in to raddle.me with the given credentials.

    Fills the username and password fields, submits the form, then blocks
    (up to 30 s) until the page title becomes "Raddle", i.e. login finished.
    """
    driver.get("http://www.raddle.me/login")
    # Fill each form field in turn, clearing any prefilled text first.
    for field_id, value in (("login-username", username),
                            ("login-password", password)):
        field = driver.find_element_by_id(field_id)
        field.clear()
        field.send_keys(value)
    # Submit by pressing Enter in the last field filled (the password box).
    field.send_keys(Keys.RETURN)
    # Wait for the post-login landing page before returning.
    WebDriverWait(driver, 30).until(EC.title_is("Raddle"))
def get_new_posts(driver, link):
    """Scrape the submission listing at *link*.

    Returns a list of dicts, one per submission, each with keys:
      "sub"  - the submission's target URL,
      "edit" - edit-page URL (currently disabled, always ""),
      "comm" - the comments-page URL (first link in the submission nav),
      "time" - the post's timestamp string (the <time> element's
               ``datetime`` attribute).
    """
    driver.get(link)
    posts = []
    for element in driver.find_elements_by_css_selector('div.submission-inner'):
        nav_links = (element.find_element_by_css_selector("nav.submission-nav")
                            .find_elements_by_css_selector("a"))
        # Build the record in one expression (the original created an empty
        # dict that was immediately overwritten - dead assignment removed).
        posts.append({
            "sub": element.find_element_by_css_selector(
                "a.submission-link").get_attribute("href"),
            # Edit link deliberately disabled (was nav_links[1]); the bot now
            # posts comments instead of editing submission bodies.
            "edit": '',
            "comm": nav_links[0].get_attribute("href"),
            "time": element.find_element_by_css_selector(
                "time").get_attribute("datetime"),
        })
    return posts
def post_comment(driver, comm_link, comm_text):
    """Open the comments page at *comm_link* and submit *comm_text* as a
    new top-level comment.

    (Removed the pointless ``append_text = comm_text`` alias from the
    original - it was a copy/paste leftover from append_to_body.)
    """
    driver.get(comm_link)
    # Type the comment into the comment box, then submit the form.
    driver.find_element_by_id("comment_comment").send_keys(comm_text)
    driver.find_element_by_id("comment_submit").click()
def check_domain(url):
    """Return True when *url* contains one of the (typically paywalled)
    news domains that should be archived, False otherwise.

    Matching is a plain substring test, so subdomains and full article
    URLs match too (e.g. "https://www.nytimes.com/2020/...").
    """
    # NOTE: the original used re.search(d, url) with unescaped patterns, so
    # "." acted as a single-char wildcard, and only the FIRST regex hit was
    # compared to the literal domain - a URL where a wildcard hit preceded
    # the literal occurrence was wrongly rejected.  A literal substring test
    # is both simpler and correct.  Duplicate entries removed.
    domains = (
        "nytimes.com", "wsj.com", "cnn.com", "thetimes.co.uk", "vice.com",
        "newsweek.com", "kyivpost.com", "ft.com", "latimes.com", "nypost.com",
        "telegraph.co.uk", "independent.co.uk", "scmp.com", "nationalpost.com",
        "haaretz.com", "bostonglobe.com", "washingtonpost.com",
        "theaustralian.com.au", "theglobeandmail.com", "theage.com.au",
        "smh.com.au", "www.economist.com", "reuters.com", "afp.com", "rt.com",
        "huffingtonpost.com", "aljazeera.com", "cnbc.com",
        "chicagotribune.com", "buzzfeed.com", "theguardian.com", "reddit.com",
        "cbc.ca", "bbc.co.uk", "cnet.com", "bloomberg.com", "bbc.com",
        "suntimes.com", "foxnews.com", "jpost.com", "voat.co",
    )
    return any(d in url for d in domains)
def check_whitelist(url):
    """Return False when *url* contains a whitelisted domain that must NOT
    be archived (our own site plus media hosts), True otherwise.

    Uses a plain substring test: the original's re.search with unescaped
    patterns treated "." as a wildcard and compared only the first regex
    hit to the literal domain (same defect as check_domain).
    """
    domains = ("raddle.me", "coinsh.red", "youtube.com", "youtu.be")
    return not any(d in url for d in domains)
def append_to_body(driver, edit_link, append_text):
    """Open the submission edit page at *edit_link*, append *append_text*
    (preceded by a near-blank separator line) to the submission body, and
    save the change."""
    driver.get(edit_link)
    body_field = driver.find_element_by_id("submission_body")
    # Separate the appended text from the existing body.
    body_field.send_keys('\n \n' + append_text)
    # Submit the edit form.
    driver.find_element_by_id("submission_submit").click()
def main():
    """Poll raddle.me/all/new and post an archive link as a comment on every
    post that is newer than the last run and not on the no-archive whitelist.

    The timestamp of the newest post seen is persisted to a file between
    runs so each post is processed at most once.
    """
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    # Placeholder: replace with the actual geckodriver path before running.
    ff_driver = PATH_TO_GECKODRIVER  # e.g. "driver/geckodriver"
    # NOTE(review): the firefox_options= keyword is deprecated in Selenium 4
    # (use options=) - confirm which selenium version this targets.
    driver = webdriver.Firefox(firefox_options=opts, executable_path=ff_driver)
    login(driver, "Archie", "******")

    posts = get_new_posts(driver, "https://raddle.me/all/new/")
    if len(posts) > 0:
        date = posts[0]["time"]  # first entry is the most recent post
        # Placeholder: replace with the actual timestamp-file path.  (As
        # written, PATH_TO_time.txt is an attribute access on an undefined
        # name and will raise NameError until substituted.)
        time_file = PATH_TO_time.txt  # e.g. "/time.txt"
        # Read the previously logged timestamp, then overwrite the file with
        # the newest one.  The original also had bare "the_file.close" lines:
        # missing call parentheses (a no-op) and redundant inside "with".
        with open(time_file, 'r') as the_file:
            data = the_file.read()
        with open(time_file, 'w') as the_file:
            the_file.write(date)
        # Normalise both sides to UTC-aware datetimes before comparing.
        log_time = dateparser.parse(data).replace(tzinfo=utc)

        for p in posts:
            p_time = dateparser.parse(p["time"]).replace(tzinfo=utc)
            p_url = p["sub"]
            # Only posts newer than the last logged time whose URL is not on
            # the no-archive whitelist get archived.
            if p_time > log_time and check_whitelist(p_url):
                print("output this", p["time"])
                archive_url = archiveis.capture(p_url)  # returns an http link
                archive_url = archive_url.replace("http://", "https://")
                post_comment(driver, p["comm"], archive_url)
    # NOTE(review): driver.close() closes the window only; driver.quit()
    # would also terminate the geckodriver process - confirm intent.
    driver.close()


if __name__ == "__main__":
    main()