make_corpus.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
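"""
Merge the `sequences-30-*.jsonl` files into a single corpus file for gensim.

Target gensim LineSentence format:
`one line = one sentence. Words must be already preprocessed and
separated by whitespace.`

NOTE(review): rows are written with `jsonlines` (one JSON value per line),
not as whitespace-separated words -- confirm the downstream reader expects
this.
"""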
  7. import os
  8. import jsonlines
  9. from glob import glob
  10. from typing import Any, List
  11. def main(args: Any):
  12. corpus_file: str = os.path.join(args.out_dir, 'corpus-30.jsonl')
  13. with jsonlines.open(corpus_file, 'w') as out_fp:
  14. sequence_files: List[str] = glob(
  15. os.path.join(args.data_dir, 'sequences-30-*.jsonl'))
  16. sizes: List[int] = [0] * len(sequence_files)
  17. count: int = 0
  18. for i, sequence_file in enumerate(sequence_files):
  19. sequence_file: str = sequence_file
  20. with jsonlines.open(sequence_file) as in_fp:
  21. for row in in_fp:
  22. row: List[int] = row
  23. out_fp.write(row)
  24. sizes[i] += 1
  25. count += 1
  26. if count % 1000000 == 0:
  27. print(f'Written {count} files.')
  28. print(sizes)
  29. if __name__ == "__main__":
  30. from argparse import ArgumentParser
  31. parser: ArgumentParser = ArgumentParser()
  32. parser.add_argument('data_dir', type=str, help='path to data_dir')
  33. parser.add_argument('out_dir', type=str, help='path to out_dir')
  34. args: Any = parser.parse_args()
  35. main(args)