wordcount.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #!/usr/bin/env python3 -tt
  2. # Copyright 2010 Google Inc.
  3. # Licensed under the Apache License, Version 2.0
  4. # http://www.apache.org/licenses/LICENSE-2.0
  5. # Google's Python Class
  6. # http://code.google.com/edu/languages/google-python-class/
  7. """Wordcount exercise
  8. Google's Python class
  9. The main() below is already defined and complete. It calls print_words()
  10. and print_top() functions which you write.
  11. 1. For the --count flag, implement a print_words(filename) function that counts
  12. how often each word appears in the text and prints:
  13. word1 count1
  14. word2 count2
  15. ...
  16. Print the above list in order sorted by word (python will sort punctuation to
  17. come before letters -- that's fine). Store all the words as lowercase,
  18. so 'The' and 'the' count as the same word.
  19. 2. For the --topcount flag, implement a print_top(filename) which is similar
  20. to print_words() but which prints just the top 20 most common words sorted
  21. so the most common word is first, then the next most common, and so on.
  22. Use str.split() (no arguments) to split on all whitespace.
  23. Workflow: don't build the whole program at once. Get it to an intermediate
  24. milestone and print your data structure and sys.exit(0).
  25. When that's working, try for the next milestone.
  26. Optional: define a helper function to avoid code duplication inside
  27. print_words() and print_top().
  28. """
  29. import sys
  30. import re
  31. from collections import Counter
  32. # +++your code here+++
  33. # Define print_words(filename) and print_top(filename) functions.
  34. # You could write a helper utility function that reads a file
  35. # and builds and returns a word/count dict for it.
  36. # Then print_words() and print_top() can just call the utility function.
  37. def print_words(filename: str) -> None:
  38. all_words = sorted(Counter(re.findall(r'\w+', open(filename).read().lower())).items())
  39. for word, count in all_words:
  40. print("%s : %s" % (word, count))
  41. def print_top(filename: str) -> None:
  42. for word, count in Counter(
  43. re.findall(r'\w+', open(filename).read().lower())
  44. ).most_common(20):
  45. print("%s : %s" % (word, count))
  46. # This basic command line argument parsing code is provided and
  47. # calls the print_words() and print_top() functions which you must define.
  48. def main():
  49. if len(sys.argv) != 3:
  50. print('usage: ./wordcount.py {--count | --topcount} file')
  51. sys.exit(1)
  52. option = sys.argv[1]
  53. filename = sys.argv[2]
  54. if option == '--count':
  55. print_words(filename)
  56. elif option == '--topcount':
  57. print_top(filename)
  58. else:
  59. print('unknown option: ' + option)
  60. sys.exit(1)
  61. if __name__ == '__main__':
  62. main()