123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
- #!/usr/bin/env python3
- import random, argparse, re, sys
- from collections import defaultdict
def clamp(x, y, z):
    """Return ``max(x, min(y, z))``.

    ``y`` is first capped at ``z`` and the result is then raised to at
    least ``x``; when ``x`` exceeds ``z`` the lower bound ``x`` wins.
    """
    capped = min(y, z)
    return max(x, capped)
def main():
    """Load the dictionary, build the successor table and print random words.

    Reads the file named by the ``dictionary_file`` option using the
    requested encoding, drops empty lines (plus, when the matching flags
    are set, entries containing an apostrophe or starting with a capital
    A-Z letter), and prints ``count`` generated words, one per line.
    """
    options = get_arguments()

    def is_wanted(entry):
        # Blank lines (including the trailing one after the final
        # newline) never count as words.
        if len(entry) == 0:
            return False
        if options.no_apostrophes and "'" in entry:
            return False
        if options.no_capitals and re.match('[A-Z]', entry[0]):
            return False
        return True

    with open(options.dictionary_file, encoding=options.encoding) as handle:
        vocabulary = [
            entry for entry in handle.read().split('\n') if is_wanted(entry)
        ]

    table = get_successors(vocabulary, options.n)
    for _ in range(options.count):
        generated = random_word(
            table,
            options.n,
            max_length=options.max_length,
            end_bias=options.end_bias,
        )
        print(generated)
def get_successors(words, n):
    """Count which character follows each context of up to ``n`` letters.

    Every word is scanned with a trailing ``'$'`` end-of-word marker.  For
    each position the context ("head") is the ``n`` characters that
    precede it; when fewer than ``n`` characters are available the head is
    the whole prefix with a leading ``'^'`` start-of-word marker.  This is
    the same rule ``random_word`` uses to build its lookup key, so every
    head recorded here is one the generator can actually query (the
    previous branch chain stored the first character under ``'^'`` even
    for ``n == 0``, where the generator looks up ``''``).

    Args:
        words: Iterable of strings to learn from.
        n: Number of preceding characters that make up a context.

    Returns:
        A ``defaultdict`` mapping head -> ``defaultdict`` mapping
        successor character (or ``'$'``) -> occurrence count.
    """
    successors = defaultdict(lambda: defaultdict(int))
    for word in words:
        for ci, char in enumerate(word + '$'):
            head = word[max(0, ci - n):ci]
            if len(head) < n:
                # Not enough history yet: anchor the context at word start.
                head = '^' + head
            successors[head][char] += 1
    return successors
def random_successor(successors, substring, end_bias=1):
    """Pick a random character to follow ``substring``.

    Candidates are drawn with probability proportional to their recorded
    counts.  The count of the ``'$'`` end-of-word marker is multiplied by
    ``end_bias`` so callers can make words more (or less) likely to stop.

    Args:
        successors: Mapping head -> mapping successor -> count.
        substring: The head whose successors should be sampled.
        end_bias: Multiplier applied to the weight of ``'$'``.

    Returns:
        The chosen successor character, or ``'$'`` when ``substring`` has
        no successor with a positive weight (so the caller ends the word
        instead of crashing on an empty choice).
    """
    candidates = []
    weights = []
    for key, value in successors.get(substring, {}).items():
        # The bias belongs to the end marker itself, i.e. the *key*; the
        # value is only its occurrence count.
        weight = value * end_bias if key == '$' else value
        if weight > 0:
            candidates.append(key)
            weights.append(weight)
    if not candidates:
        return '$'
    # Weighted sampling avoids materialising one list entry per occurrence.
    return random.choices(candidates, weights=weights)[0]
def random_word(successors, n, max_length=sys.maxsize, end_bias=1):
    """Generate one random word from a successor table.

    Starting from the empty string, repeatedly looks up the last ``n``
    characters generated so far (prefixed with ``'^'`` while fewer than
    ``n`` exist), asks ``random_successor`` for the next character and
    appends it, until the ``'$'`` end marker is drawn or the word is
    ``max_length`` characters long.

    Args:
        successors: Table as produced by ``get_successors`` for the same ``n``.
        n: Number of preceding characters used as context.
        max_length: Upper bound on the length of the returned word.
        end_bias: Multiplier for the end-of-word weight, passed through
            to ``random_successor``.

    Returns:
        The generated word (possibly empty).
    """
    word = ""
    # Stop exactly at max_length; the former "max_length - 1" bound made
    # the longest possible word one character shorter than requested.
    while len(word) < max_length:
        head = word[max(0, len(word) - n):len(word)]
        if len(head) < n:
            head = '^' + head
        successor = random_successor(successors, head, end_bias)
        if successor == '$':
            break
        word += successor
    return word
def get_arguments():
    """Parse the command-line options of the word generator.

    Returns:
        An ``argparse.Namespace`` with the attributes ``dictionary_file``,
        ``no_apostrophes``, ``no_capitals``, ``encoding``, ``count``,
        ``end_bias``, ``n`` and ``max_length``.
    """
    parser = argparse.ArgumentParser(prog='markdov-words')
    parser.add_argument(
        '-d', '--dictionary-file', default="/usr/share/dict/words",
        help=(
            "Path to dictionary file -- "
            "A dictionary file is just one containing "
            "a list of words separated by line-breaks. "
            "On Unix systems these can usually be found in "
            "/usr/share/dict/."
        )
    )
    parser.add_argument(
        '--no-apostrophes', action='store_true',
        help="Exclude words with apostrophes from the dictionary"
    )
    parser.add_argument(
        '--no-capitals', action='store_true',
        help="Exclude words starting with A-Z capital letters"
    )
    parser.add_argument(
        '--encoding', default="utf-8",
        # This help text used to be a copy of the one for --count.
        help="Text encoding of the dictionary file"
    )
    parser.add_argument(
        '-c', '--count', type=int, default=1,
        help="Number of words to print"
    )
    parser.add_argument(
        '-e', '--end-bias', type=int, default=100,
        help=(
            "Multiplier for the probability "
            "that a word will end at a given point -- "
            "Note that sometimes the probability is zero, "
            "so setting this very high does not guarantee "
            "that words will not be abnormally long. "
        )
    )
    parser.add_argument(
        '-n', '--n', type=int, default=3,
        help=(
            "The number of previous letters to take into account "
            "in selecting the next one -- "
            "For high values of n, "
            "the results are likely "
            "to exactly reproduce words in the dictionary, "
            "whereas for lower values, "
            "they are likely to sound implausible."
        )
    )
    parser.add_argument(
        '-l', '--max-length', type=int, default=16,
        help=(
            "Maximum length of a word, "
            "which if reached will simply terminate the word, "
            "even if the ending is not a probable one."
        )
    )
    return parser.parse_args()
# Entry point: run main() only when this file is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()
|