vectorize_text.py

#!/usr/bin/python3

import os
import re
import sys

import joblib

sys.path.append(os.path.abspath("../tools/"))
from parse_out_email_text import parseOutText
from sklearn.feature_extraction.text import TfidfVectorizer

"""
Starter code to process the emails from Sara and Chris to extract
the features and get the documents ready for classification.

The list of all the emails from Sara is in the from_sara list, and
likewise for emails from Chris (from_chris).

The actual documents are in the Enron email dataset, which you
downloaded/unpacked in Part 0 of the first mini-project. If you have
not obtained the Enron email corpus, run startup.py in the tools folder.

The data is stored in lists and packed away in pickle files at the end.
"""
from_sara = open("from_sara.txt", "r")
from_chris = open("from_chris.txt", "r")

from_data = []
word_data = []

### temp_counter is a way to speed up development -- there are thousands of
### emails from Sara and Chris, so running over all of them can take a long
### time. temp_counter lets you look at only the first 200 emails in each list
### so you can iterate on your modifications more quickly.
# temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at the first 200 emails when developing;
        ### once everything is working, remove this check to run over the full dataset
        # temp_counter += 1
        # if temp_counter < 200:

        path = os.path.join('..', path[:-1])   # strip the trailing newline from the listed path
        # print(path)
        email = open(path, "r")

        ### use parseOutText to extract the (stemmed) text from the opened email
        stemmed = parseOutText(email)

        ### use str.replace() to remove any instances of the signature words
        blacklist = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
        for word in blacklist:
            stemmed = stemmed.replace(word, "")

        ### append the cleaned text to word_data
        word_data.append(stemmed)

        ### append a 0 to from_data if the email is from Sara, and a 1 if it is from Chris
        from_data.append(0 if name == "sara" else 1)

        email.close()

print("Emails Processed")
from_sara.close()
from_chris.close()
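
### Optional sanity check (not part of the starter code): every processed email
### should contribute one entry to word_data and one label to from_data.
# assert len(word_data) == len(from_data)
# print(len(word_data), "emails loaded into word_data")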
joblib.dump(word_data, "your_word_data.pkl")
joblib.dump(from_data, "your_email_authors.pkl")

print("word_data[152] =", word_data[152])
print("word_data[124] =", word_data[124])
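
### The pickles written above are meant to be reloaded in the later text-learning
### mini-projects; a minimal reload sketch (assuming the file names stay unchanged):
# word_data = joblib.load("your_word_data.pkl")
# from_data = joblib.load("your_email_authors.pkl")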
### in Part 4, do TfIdf vectorization here
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(word_data)

feat_names = vectorizer.get_feature_names_out()
vocab = list(vectorizer.vocabulary_)   # note: vocabulary_ keys are in insertion order, unlike the sorted feat_names
print("lengths (feature_names, vocabulary) = (%i, %i)" % (len(feat_names), len(vocab)))

QUIZ_INDEX = 34597
print("items at position %i = (%s, %s)" % (QUIZ_INDEX, feat_names[QUIZ_INDEX], vocab[QUIZ_INDEX]))
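
### Not a required step here -- a minimal sketch of how the tf-idf matrix and the
### labels could feed a classifier once vectorization works. The train/test split
### and the MultinomialNB model are illustrative choices, not something prescribed
### by the starter code:
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
#
# features_train, features_test, labels_train, labels_test = train_test_split(
#     matrix, from_data, test_size=0.1, random_state=42)
# clf = MultinomialNB().fit(features_train, labels_train)
# print("test accuracy =", clf.score(features_test, labels_test))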