create_splits.py

  1. """
  2. Once we have all the block data, create splits of
  3. - 1 week (toy debugging set)
  4. - 1 month (cheaper test set)
  5. - 1 year (more expensive test set)
  6. """
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import Any


def main(args: Any):
    root_1week: str = os.path.join(
        args.root, 'processed', 'blocks-1week.csv')
    root_1month: str = os.path.join(
        args.root, 'processed', 'blocks-1month.csv')
    root_1year: str = os.path.join(
        args.root, 'processed', 'blocks-1year.csv')
    root_all: str = os.path.join(
        args.root, 'processed', 'blocks-sorted.csv')

    # start of the month that we scraped at
    end_date: datetime = datetime.strptime(
        '2021-10-01 00:00:00 UTC',
        '%Y-%m-%d %H:%M:%S UTC',
    )
    start_date_1week: datetime = end_date - timedelta(days=7)
    start_date_1month: datetime = end_date - timedelta(days=30)
    start_date_1year: datetime = end_date - timedelta(days=365)

    print('Reading large file...')
    df = pd.read_csv(root_all, index_col=[0])
    df.timestamp = pd.to_datetime(
        df.timestamp,
        format='%Y-%m-%d %H:%M:%S UTC',
    )

    # boolean masks selecting the blocks inside each time window
    slice_1week: pd.Series = np.logical_and(
        df.timestamp >= start_date_1week,
        df.timestamp <= end_date,
    )
    slice_1month: pd.Series = np.logical_and(
        df.timestamp >= start_date_1month,
        df.timestamp <= end_date,
    )
    slice_1year: pd.Series = np.logical_and(
        df.timestamp >= start_date_1year,
        df.timestamp <= end_date,
    )
    df_1week: pd.DataFrame = df[slice_1week]
    df_1month: pd.DataFrame = df[slice_1month]
    df_1year: pd.DataFrame = df[slice_1year]

    print('Saving splits...')
    df_1week.to_csv(root_1week)
    df_1month.to_csv(root_1month)
    df_1year.to_csv(root_1year)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--root',
        type=str,
        default='./data/bigquery/ethereum-block-data',
        help='path to data root (default: ./data/bigquery/ethereum-block-data)',
    )
    args: Any = parser.parse_args()
    main(args)
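

# Example usage (hypothetical local paths; assumes <root>/processed/blocks-sorted.csv
# already exists from the earlier block-data processing step):
#
#   python create_splits.py --root ./data/bigquery/ethereum-block-data
#
# This writes blocks-1week.csv, blocks-1month.csv, and blocks-1year.csv into the
# same 'processed' directory.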