prune_data.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """
  2. The file `data.csv` is produced by running `src/cluster/deposit.py` but these
  3. results could be noisy. We apply a set of post-processing rules to at least ensure
  4. consistency.
  5. RULE #1: It is possible for A -> B -> Ex, and C -> A -> Ex to both appear. We
  6. don't want to consider `A` an eoa in one setting, and a deposit in another setting.
  7. It is very unlikely for `A` to be a deposit if we see it do `A -> B -> Ex`. Delete
  8. these entries from `data.csv`.
  9. RULE #2: We are always certain about exchange addresses and so any lines with
  10. them as EOAs or deposits can be removed.
  11. """
  12. import numpy as np
  13. import pandas as pd
  14. from typing import Any
  15. def main(args: Any):
  16. df: pd.DataFrame = pd.read_csv(args.data_csv)
  17. exchanges: np.array = df.exchange.unique()
  18. print(f'init: {len(df)} rows.')
  19. # Exchanges cannot be users or deposits
  20. df = df[~df.user.isin(exchanges)]
  21. df = df[~df.deposit.isin(exchanges)]
  22. print(f'after removing exchanges as eoa/deposits: {len(df)} rows.')
  23. # Find all users and make sure they cannot be deposits since
  24. # deposits cannot send to A -> B -> Exchange.
  25. users: np.array = df.user.unique()
  26. df = df[~df.deposit.isin(users)]
  27. print(f'after removing deposits who are also eoa\'s: {len(df)} rows.')
  28. print('saving to file...')
  29. df.to_csv(args.out_csv, index=False)
  30. if __name__ == "__main__":
  31. import argparse
  32. parser = argparse.ArgumentParser()
  33. parser.add_argument('data_csv', type=str)
  34. parser.add_argument('out_csv', type=str)
  35. args = parser.parse_args()
  36. main(args)