split_csv.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. """
  2. Split a giant CSV file into many smaller CSV files, repeating the header row in each.
  3. """
  4. import os, sys, csv
  5. from typing import List
  6. csv.field_size_limit(sys.maxsize)
  7. def split(file_loc: str, out_dir: str, file_size=100000):
  8. with open(file_loc) as fp:
  9. count: int = 0
  10. curr_split: int = 0
  11. reader = csv.reader(fp)
  12. for header in reader: break
  13. writer = None
  14. for row in reader:
  15. if count % file_size == 0:
  16. print(f'parsed {count} rows.')
  17. split_filename: str = os.path.join(out_dir, f'edges-{curr_split}.csv')
  18. writer = csv.writer(open(split_filename, 'w'))
  19. writer.writerow(header)
  20. curr_split += 1
  21. writer.writerow(row)
  22. count += 1
  23. if __name__ == "__main__":
  24. from argparse import ArgumentParser
  25. parser: ArgumentParser = ArgumentParser()
  26. parser.add_argument('edges_file', type=str, help='path to csv file containing edges')
  27. parser.add_argument('split_dir', type=str, help='path to dump split files')
  28. parser.add_argument('--file-size', type=int, default=100000)
  29. args = parser.parse_args()
  30. split(args.edges_file, args.split_dir, file_size=args.file_size)