merge_csv.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import os
  2. import pandas as pd
  3. from glob import glob
  4. from tqdm import tqdm
  5. from typing import List, Any
  6. def main(args: Any):
  7. processed_dir: str = os.path.join(args.root, 'processed')
  8. if not os.path.isdir(processed_dir): os.makedirs(processed_dir)
  9. merge_file: str = os.path.join(processed_dir, args.merge_name)
  10. if os.path.isfile(merge_file):
  11. print('Merge file already exists. Remove before running.')
  12. return
  13. paths: List[str] = sorted(glob(os.path.join(args.root, '*.csv')))
  14. print(f'Creating merged file: {args.merge_name}.')
  15. for i in tqdm(range(len(paths))):
  16. path: str = paths[i]
  17. df = pd.read_csv(path)
  18. df.to_csv(merge_file, index=False, header=(i==0), mode='a')
  19. del df
  20. if not args.no_sort:
  21. # I ran this part manually
  22. df: pd.DataFrame = pd.read_csv(merge_file)
  23. df: pd.DataFramef = df.sort_values(by=args.sort_column)
  24. out_name: str = os.path.join(
  25. args.root,
  26. 'processed',
  27. args.merge_name.replace('merged', 'sorted'),
  28. )
  29. df.to_csv(out_name, index=False)
  30. if __name__ == "__main__":
  31. import argparse
  32. parser = argparse.ArgumentParser()
  33. parser.add_argument(
  34. '--root',
  35. type=str,
  36. default='./data/bigquery/ethereum-block-data',
  37. help='path to data root (default: ./data/bigquery/ethereum-block-data)',
  38. )
  39. parser.add_argument(
  40. '--merge-name',
  41. type=str,
  42. default='blocks-merged.csv',
  43. help='name of merged file (default: blocks-merged.csv)',
  44. )
  45. parser.add_argument(
  46. '--sort-column',
  47. type=str,
  48. default='number',
  49. help='name of column to sort by (default: number)',
  50. )
  51. parser.add_argument('--no-sort', action='store_true', default=False)
  52. args: Any = parser.parse_args()
  53. main(args)