# markov-words.py (3.9 KB)
  1. #!/usr/bin/env python3
  2. import random, argparse, re, sys
  3. from collections import defaultdict
  4. clamp = lambda x, y, z: max(x, min(y, z))
  5. def main():
  6. args = get_arguments()
  7. words = []
  8. with open(args.dictionary_file, encoding=args.encoding) as f:
  9. for line in f.read().split('\n'):
  10. if len(line) == 0: continue
  11. if args.no_apostrophes and "'" in line: continue
  12. if args.no_capitals and re.match('[A-Z]', line[0]): continue
  13. words.append(line)
  14. successors = get_successors(words, args.n)
  15. for i in range(args.count):
  16. word = random_word(
  17. successors, args.n,
  18. max_length=args.max_length,
  19. end_bias=args.end_bias
  20. )
  21. print(word)
  22. def get_successors(words, n):
  23. successors = defaultdict(lambda: defaultdict(lambda: 0))
  24. for word in words:
  25. for ci, char in enumerate(list(word) + ['$']):
  26. head = ''
  27. if ci == 0:
  28. head = '^'
  29. elif ci < n:
  30. head = '^' + word[0:clamp(1, ci, len(word))]
  31. elif ci >= n:
  32. head = word[ci - n:ci]
  33. successors[head][char] += 1
  34. return successors
  35. def random_successor(successors, substring, end_bias = 1):
  36. choices = []
  37. for key, value in successors[substring].items():
  38. if value == '$':
  39. choices += [key] * (end_bias * value)
  40. else:
  41. choices += [key] * value
  42. return random.choice(choices)
  43. def random_word(successors, n, max_length = sys.maxsize, end_bias = 1):
  44. word = ""
  45. while len(word) < max_length - 1:
  46. head = word[max(0, len(word) - n):len(word)]
  47. if len(head) < n:
  48. head = '^' + head
  49. successor = random_successor(successors, head, end_bias)
  50. if successor == '$':
  51. break
  52. word += successor
  53. return word
  54. def get_arguments():
  55. parser = argparse.ArgumentParser(prog='markdov-words')
  56. parser.add_argument(
  57. '-d', '--dictionary-file', default="/usr/share/dict/words",
  58. help=(
  59. "Path to dictionary file -- "
  60. "A dictionary file is just one containing "
  61. "a list of words separated by line-breaks. "
  62. "On Unix systems these can usually be found in "
  63. "/usr/share/dict/."
  64. )
  65. )
  66. parser.add_argument(
  67. '--no-apostrophes', action='store_true',
  68. help="Exclude words with apostrophes from the dictionary"
  69. )
  70. parser.add_argument(
  71. '--no-capitals', action='store_true',
  72. help="Exclude words starting with A-Z capital letters"
  73. )
  74. parser.add_argument(
  75. '--encoding', default="utf-8",
  76. help="Number of words to print"
  77. )
  78. parser.add_argument(
  79. '-c', '--count', type=int, default=1,
  80. help="Number of words to print"
  81. )
  82. parser.add_argument(
  83. '-e', '--end-bias', type=int, default=100,
  84. help=(
  85. "Multiplier for the probability "
  86. "that a word will end at a given point -- "
  87. "Note that sometimes the probability is zero, "
  88. "so setting this very high does not guarantee "
  89. "that words will not be abnormally long. "
  90. )
  91. )
  92. parser.add_argument(
  93. '-n', '--n', type=int, default=3,
  94. help=(
  95. "The number of previous letters to take into account "
  96. "in selecting the next one -- "
  97. "For high values of n, "
  98. "the results are likely "
  99. "to exactly reproduce words in the dictionary, "
  100. "whereas for lower values, "
  101. "they are likely to sound implausible."
  102. )
  103. )
  104. parser.add_argument(
  105. '-l', '--max-length', type=int, default=16,
  106. help=(
  107. "Maximum length of a word, "
  108. "which if reached will simply terminate the word, "
  109. "even if the ending is not a probable one."
  110. )
  111. )
  112. return parser.parse_args()
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()