12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- """Wordcount exercise
- Google's Python class
- The main() below is already defined and complete. It calls print_words()
- and print_top() functions which you write.
- 1. For the --count flag, implement a print_words(filename) function that counts
- how often each word appears in the text and prints:
- word1 count1
- word2 count2
- ...
- Print the above list in order sorted by word (python will sort punctuation to
- come before letters -- that's fine). Store all the words as lowercase,
- so 'The' and 'the' count as the same word.
- 2. For the --topcount flag, implement a print_top(filename) which is similar
- to print_words() but which prints just the top 20 most common words sorted
- so the most common word is first, then the next most common, and so on.
- Use str.split() (no arguments) to split on all whitespace.
- Workflow: don't build the whole program at once. Get it to an intermediate
- milestone and print your data structure and sys.exit(0).
- When that's working, try for the next milestone.
- Optional: define a helper function to avoid code duplication inside
- print_words() and print_top().
- """
- import sys
- import re
- from collections import Counter
def print_words(filename: str) -> None:
    """Print every distinct word in the file with its count, sorted by word.

    The file's text is lowercased first, so 'The' and 'the' count as the
    same word. Words are extracted as runs of regex word characters, which
    means punctuation is dropped rather than kept attached to a word.

    Each output line has the form ``word : count``.

    Args:
        filename: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read inside a `with` block so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(filename) as source:
        text = source.read()

    counts = Counter(re.findall(r'\w+', text.lower()))
    for word, count in sorted(counts.items()):
        print("%s : %s" % (word, count))
def print_top(filename: str) -> None:
    """Print the 20 most common words in the file, most frequent first.

    The file's text is lowercased first, so 'The' and 'the' count as the
    same word. Words are extracted as runs of regex word characters, which
    means punctuation is dropped rather than kept attached to a word.

    Each output line has the form ``word : count``. Files with fewer than
    20 distinct words print all of them.

    Args:
        filename: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read inside a `with` block so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(filename) as source:
        text = source.read()

    counts = Counter(re.findall(r'\w+', text.lower()))
    for word, count in counts.most_common(20):
        print("%s : %s" % (word, count))
def main():
    """Parse the command line and run the requested word-count report.

    Expects exactly two arguments after the script name: an option
    (``--count`` or ``--topcount``) followed by a filename. Prints a usage
    or unknown-option message and exits with status 1 otherwise.
    """
    argv = sys.argv
    if len(argv) != 3:
        print('usage: ./wordcount.py {--count | --topcount} file')
        sys.exit(1)

    option, filename = argv[1], argv[2]

    # Guard clauses: handle each recognized option and return early.
    if option == '--count':
        print_words(filename)
        return
    if option == '--topcount':
        print_top(filename)
        return

    print('unknown option: ' + option)
    sys.exit(1)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|