#!/usr/bin/env python3
#
# Copyright 2012 B. J. Potter from https://github.com/bjcubsfan/passphrases
#
# Modified 2016 LE van Braam-Stewart for Idiolectalyzer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. """Process Google ngram data.
  20. Google ngram data can be found at
  21. http://books.google.com/ngrams/datasets
  22. This was used to generate the word list included with this code. It
  23. generates a list of the most commonly used words in the chosen set by
  24. inputing 1-gram files from Google. This can be used to generate other
  25. word lists suitable for use in making passphrases.
  26. Example usage:
  27. google_ngram.py -o word_list.txt googlebooks-eng-fiction-all-1gram-*
  28. processing: googlebooks-eng-fiction-all-1gram-20120701-a
  29. Reading line 1 million
  30. Reading line 2 million
  31. Reading line 3 million
  32. Reading line 4 million
  33. Reading line 5 million
  34. Reading line 6 million
  35. Reading line 7 million
  36. Reading line 8 million
  37. Reading line 9 million
  38. Reading line 10 million
  39. processing: googlebooks-eng-fiction-all-1gram-20120701-b
  40. Reading line 1 million
  41. Reading line 2 million
  42. Reading line 3 million
  43. Reading line 4 million
  44. Reading line 5 million
  45. Reading line 6 million
  46. Reading line 7 million
  47. Reading line 8 million
  48. Reading line 9 million
  49. This will write 'ngram_processed.txt' with all of the words from the
  50. input ngram files and the 'word_list.txt' with only the top 10,000
  51. words, excluding words less than three letters and those with
  52. punctuation.
  53. """

import os
import sys
import string
from optparse import OptionParser
from operator import itemgetter

# Don't analyze words from books before this year (default for --startyear)
START_YEAR = 1965
# How many words in the output file (default for --maxwords)
MAX_WORDS = 100000
# Exclude words shorter than this from the final list (default for --minlength)
MIN_WORD_LENGTH = 1


def unwanted_characters_in_word(word):
    """Return True if the word contains unwanted characters."""
    for letter in word:
        # Allow only ASCII letters and apostrophes; digits, punctuation,
        # and anything else mark the word as unwanted.
        if letter not in string.ascii_letters and letter != "'":
            return True
    return False
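

# Illustrative behaviour (hypothetical inputs, shown for clarity):
#
#   unwanted_characters_in_word("hello")     -> False
#   unwanted_characters_in_word("don't")     -> False  (apostrophes allowed)
#   unwanted_characters_in_word("re-elect")  -> True   (hyphen rejected)
#   unwanted_characters_in_word("caf3")      -> True   (digit rejected)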


def process_ngram_file(file_name, word_list, start_year=START_YEAR):
    """Accumulate word counts from a Google ngram file into word_list.

    The ngram data comes from http://books.google.com/ngrams/datasets
    """
    line_count = 0
    with open(file_name) as ngram_file:
        for line in ngram_file:
            line_count += 1
            if line_count % 1000000 == 0:
                print("Reading line {0:3d} million".format(line_count // 1000000))
            fields = line.split()
            word = fields[0].lower()
            year = int(fields[1])
            occurrences = int(fields[2])
            if year < start_year or unwanted_characters_in_word(word):
                continue
            word_list[word] = word_list.get(word, 0) + occurrences
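

# Counts for the same word are summed across years and across files, so a
# run over one file might leave (illustrative numbers, not real data):
#
#   word_list = {}
#   process_ngram_file('googlebooks-eng-fiction-all-1gram-20120701-a', word_list)
#   # word_list == {'and': 48123901, 'apple': 153420, ...}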


def build_word_list(input_word_list, word_list_file_name, max_words, min_word_length):
    """Write the final list of words from the processed ngram counts.

    input_word_list: (word, count) pairs, most frequent first.
    word_list_file_name: the file to write. It will contain at most
        max_words words, each at least min_word_length letters long.
    """
    final_word_list = []
    for word, _count in input_word_list:
        if len(final_word_list) >= max_words:
            break
        if len(word) < min_word_length:
            continue
        final_word_list.append(word)
    with open(word_list_file_name, 'w') as out_file:
        for word in final_word_list:
            out_file.write('{0}\n'.format(word))
    return final_word_list
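

# A small worked example (hypothetical data): with max_words=3 and
# min_word_length=2,
#
#   build_word_list([('the', 100), ('of', 90), ('a', 80), ('cat', 70)],
#                   'words.txt', 3, 2)
#
# skips 'a' (too short), writes 'the', 'of', and 'cat' to words.txt,
# and returns ['the', 'of', 'cat'].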


if __name__ == '__main__':
    usage = "usage: %prog -o word_list.txt input_ngram_file[s]"
    parser = OptionParser(usage)
    parser.add_option("-o", "--out", default='',
                      help="Output file name")
    parser.add_option("-y", "--startyear", type="int", default=START_YEAR,
                      help="Don't use books earlier than this year")
    parser.add_option("-l", "--minlength", type="int", default=MIN_WORD_LENGTH,
                      help="Minimum word length")
    parser.add_option("-w", "--maxwords", type="int", default=MAX_WORDS,
                      help="Maximum number of words to add to the list")
    (options, args) = parser.parse_args()
    if len(args) == 0:
        print('ERROR: Must have an input file')
        sys.exit(1)
    if options.out == '':
        print('ERROR: Must have an output file "-o word_list.txt"')
        sys.exit(1)
    word_list = {}
    for file_name in args:
        print('processing:', file_name)
        assert os.path.isfile(file_name)
        process_ngram_file(file_name, word_list, options.startyear)
    sorted_list = sorted(word_list.items(), key=itemgetter(1), reverse=True)
    build_word_list(sorted_list, options.out, options.maxwords, options.minlength)