prune_metadata.py 911 B

123456789101112131415161718192021222324252627282930313233
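"""
Prune duplicate address rows from the metadata CSV.

As the metadata is stored, a deposit address might be associated with
multiple exchanges, and an EOA address might appear multiple times as well.
The metadata was also stored in chunks, so the same address can show up in
more than one chunk. For each address we keep only the row with the highest
confidence (the `conf` column).

Input: the metadata CSV (e.g. data.csv), passed as the first argument.
Output: the pruned CSV (e.g. data-pruned.csv), passed as the second argument.
"""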
  9. import pandas as pd
  10. from typing import Any
  11. def main(args: Any):
  12. df: pd.DataFrame = pd.read_csv(args.metadata_csv)
  13. print(f'init: {len(df)} rows.')
  14. print('Running large groupby job...')
  15. pruned_df: pd.DataFrame = df.loc[df.groupby(['address'])['conf'].idxmax()]
  16. print(f'after pruning: {len(pruned_df)} rows.')
  17. pruned_df.to_csv(args.out_csv, index=False)
  18. if __name__ == "__main__":
  19. import argparse
  20. parser = argparse.ArgumentParser()
  21. parser.add_argument('metadata_csv', type=str)
  22. parser.add_argument('out_csv', type=str)
  23. args = parser.parse_args()
  24. main(args)