vectorize_text.py

#!/usr/bin/python3

import os
import re
import sys

import joblib

sys.path.append(os.path.abspath("../tools/"))
from parse_out_email_text import parseOutText
from sklearn.feature_extraction.text import TfidfVectorizer

"""
Starter code to process the emails from Sara and Chris to extract
the features and get the documents ready for classification.

The list of all the emails from Sara is in the from_sara list, and
likewise for emails from Chris (from_chris).

The actual documents are in the Enron email dataset, which you
downloaded/unpacked in Part 0 of the first mini-project. If you have
not obtained the Enron email corpus, run startup.py in the tools folder.

The data is stored in lists and packed away in pickle files at the end.
"""
from_sara = open("from_sara.txt", "r")
from_chris = open("from_chris.txt", "r")

from_data = []
word_data = []

### temp_counter is a way to speed up development -- there are thousands of
### emails from Sara and Chris, so running over all of them can take a long
### time. temp_counter lets you look at only the first 200 emails in each list
### so you can iterate on your modifications more quickly.
# temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at the first 200 emails when developing;
        ### once everything is working, remove this check to run over the full dataset
        # temp_counter += 1
        # if temp_counter < 200:

        path = os.path.join('..', path[:-1])   # strip the trailing newline from the listed path
        # print(path)
        email = open(path, "r")

        ### use parseOutText to extract the (stemmed) text from the opened email
        stemmed = parseOutText(email)

        ### use str.replace() to remove any instances of the signature words
        blacklist = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
        for word in blacklist:
            stemmed = stemmed.replace(word, "")

        ### append the cleaned text to word_data
        word_data.append(stemmed)

        ### append a 0 to from_data if the email is from Sara, and a 1 if it is from Chris
        from_data.append(0 if name == "sara" else 1)

        email.close()

print("Emails Processed")
from_sara.close()
from_chris.close()
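
### Optional sanity check (not part of the starter code): every processed email
### should contribute one entry to word_data and one label to from_data.
# assert len(word_data) == len(from_data)
# print(len(word_data), "emails loaded into word_data")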
joblib.dump(word_data, "your_word_data.pkl")
joblib.dump(from_data, "your_email_authors.pkl")

print("word_data[152] =", word_data[152])
print("word_data[124] =", word_data[124])
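
### The pickles written above are meant to be reloaded in the later text-learning
### mini-projects; a minimal reload sketch (assuming the file names stay unchanged):
# word_data = joblib.load("your_word_data.pkl")
# from_data = joblib.load("your_email_authors.pkl")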
### in Part 4, do TfIdf vectorization here
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(word_data)

feat_names = vectorizer.get_feature_names_out()
vocab = list(vectorizer.vocabulary_)   # note: vocabulary_ keys are in insertion order, unlike the sorted feat_names
print("lengths (feature_names, vocabulary) = (%i, %i)" % (len(feat_names), len(vocab)))

QUIZ_INDEX = 34597
print("items at position %i = (%s, %s)" % (QUIZ_INDEX, feat_names[QUIZ_INDEX], vocab[QUIZ_INDEX]))
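
### Not a required step here -- a minimal sketch of how the tf-idf matrix and the
### labels could feed a classifier once vectorization works. The train/test split
### and the MultinomialNB model are illustrative choices, not something prescribed
### by the starter code:
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
#
# features_train, features_test, labels_train, labels_test = train_test_split(
#     matrix, from_data, test_size=0.1, random_state=42)
# clf = MultinomialNB().fit(features_train, labels_train)
# print("test accuracy =", clf.score(features_test, labels_test))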