#!/usr/bin/python3
import os
import sys
import joblib
sys.path.append(os.path.abspath("../tools/"))
from parse_out_email_text import parseOutText
from sklearn.feature_extraction.text import TfidfVectorizer
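# parseOutText (defined in ../tools/parse_out_email_text.py) takes an open
# email file and returns the stemmed body text as a single space-separated
# string. If the tools folder is unavailable, a rough stand-in -- an
# assumption, not the course's exact implementation -- could look like:
#
#     import string
#     from nltk.stem.snowball import SnowballStemmer
#
#     def parseOutText(f):
#         f.seek(0)
#         parts = f.read().split("X-FileName:")  # body follows this header line
#         body = parts[1] if len(parts) > 1 else ""
#         body = body.translate(str.maketrans("", "", string.punctuation))
#         stemmer = SnowballStemmer("english")
#         return " ".join(stemmer.stem(w) for w in body.split())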
- """
- Starter code to process the emails from Sara and Chris to extract
- the features and get the documents ready for classification.
- The list of all the emails from Sara are in the from_sara list
- likewise for emails from Chris (from_chris)
- The actual documents are in the Enron email dataset, which
- you downloaded/unpacked in Part 0 of the first mini-project. If you have
- not obtained the Enron email corpus, run startup.py in the tools folder.
- The data is stored in lists and packed away in pickle files at the end.
- """
- from_sara = open("from_sara.txt", "r")
- from_chris = open("from_chris.txt", "r")
- from_data = []
- word_data = []
- ### temp_counter is a way to speed up the development--there are
- ### thousands of emails from Sara and Chris, so running over all of them
- ### can take a long time
- ### temp_counter helps you only look at the first 200 emails in the list so you
- ### can iterate your modifications quicker
- # temp_counter = 0
### signature words to scrub so the classifier can't cheat by keying on them
blacklist = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at the first 200 emails when developing;
        ### once everything works, remove this check to run over the full dataset
        # temp_counter += 1
        # if temp_counter < 200:
        path = os.path.join('..', path[:-1])  # [:-1] strips the trailing newline
        email = open(path, "r")

        ### use parseOutText to extract the (stemmed) text from the opened email
        text = parseOutText(email)

        ### use str.replace() to remove any instances of the signature words
        for word in blacklist:
            text = text.replace(word, "")

        ### append the cleaned text to word_data
        word_data.append(text)

        ### append a 0 to from_data if the email is from Sara, 1 if from Chris
        from_data.append(0 if name == "sara" else 1)

        email.close()
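# Sanity check (an addition to the starter code): word_data and from_data
# should line up one-to-one, and from_data should contain only 0s (Sara)
# and 1s (Chris).
assert len(word_data) == len(from_data)
print("Sara:", from_data.count(0), "Chris:", from_data.count(1))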
- print("Emails Processed")
- from_sara.close()
- from_chris.close()
- joblib.dump( word_data, open("your_word_data.pkl", "wb") )
- joblib.dump( from_data, open("your_email_authors.pkl", "wb") )
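# The pickles can be reloaded in later mini-projects with joblib.load,
# the counterpart of joblib.dump:
#
#     word_data = joblib.load("your_word_data.pkl")
#     from_data = joblib.load("your_email_authors.pkl")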
- print("word_data[152] =", word_data[152])
- print("word_data[124] =", word_data[124])
### in Part 4, do TfIdf vectorization here
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(word_data)

feat_names = vectorizer.get_feature_names_out()  # terms sorted by column index
vocab = list(vectorizer.vocabulary_)  # dict keys in first-seen order, NOT sorted
print("lengths (feature_names, vocabulary) = (%i, %i)" % (len(feat_names), len(vocab)))

QUIZ_INDEX = 34597
print("items at position %i = (%s, %s)" % (QUIZ_INDEX, feat_names[QUIZ_INDEX], vocab[QUIZ_INDEX]))