# parse_out_email_text.py
  1. #!/usr/bin/python3
  2. from nltk.stem.snowball import SnowballStemmer
  3. import string
  4. def parseOutText(f):
  5. """ given an opened email file f, parse out all text below the
  6. metadata block at the top
  7. (in Part 2, you will also add stemming capabilities)
  8. and return a string that contains all the words
  9. in the email (space-separated)
  10. example use case:
  11. f = open("email_file_name.txt", "r")
  12. text = parseOutText(f)
  13. """
  14. f.seek(0) ### go back to beginning of file (annoying)
  15. all_text = f.read()
  16. ### split off metadata
  17. content = all_text.split("X-FileName:")
  18. words = ""
  19. if len(content) > 1:
  20. ### remove punctuation
  21. text_string = content[1].translate(str.maketrans('','',string.punctuation))
  22. ### project part 2: comment out the line below
  23. # words = text_string
  24. ### split the text string into individual words, stem each word,
  25. ### and append the stemmed word to words (make sure there's a single
  26. ### space between each stemmed word)
  27. word_list = []
  28. stemmer = SnowballStemmer("english")
  29. for word in text_string.split():
  30. word_list.append(stemmer.stem(word))
  31. words = " ".join(word_list)
  32. return words
  33. def main():
  34. ff = open("../text_learning/test_email.txt", "r")
  35. text = parseOutText(ff)
  36. print(text)
  37. if __name__ == '__main__':
  38. main()