run_deposit.py

from typing import Any, Dict, List, Set
from src.utils.loader import DataframeLoader
from src.cluster.deposit import DepositCluster


def build_response(
    user_clusters: List[Set[str]],      # user -> deposit
    exchange_clusters: List[Set[str]],  # deposit -> exchange
    metadata: Dict[str, Dict[str, Any]],
) -> Dict[str, Dict[str, Any]]:
    """
    Construct a mapping from an address to an index. That index
    picks a set within the corresponding clusters list. Also store
    any extra metadata for the address.
    """
    response = dict()
    for name, clusters in zip(
        ['user_cluster', 'exchange_cluster'],
        [user_clusters, exchange_clusters],
    ):
        for i, cluster in enumerate(clusters):
            cluster: Set[str] = cluster  # each cluster is a set of addresses
            for address in cluster:
                assert address in metadata, "error: unknown address"
                if address in response:
                    assert name not in response[address], \
                        "error: each element only allowed to be in one cluster."
                    response[address][name] = i
                else:
                    response[address] = {name: i, **metadata[address]}
    return response
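
# Worked example of the response shape (hypothetical addresses and empty
# metadata, purely illustrative):
#
#   user_clusters     = [{'0xaaa', '0xbbb'}]
#   exchange_clusters = [{'0xccc'}]
#   metadata          = {'0xaaa': {}, '0xbbb': {}, '0xccc': {}}
#
#   build_response(user_clusters, exchange_clusters, metadata)
#   => {'0xaaa': {'user_cluster': 0},
#       '0xbbb': {'user_cluster': 0},
#       '0xccc': {'exchange_cluster': 0}}
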
def main(args: Any):
    # both supported datasets read the same CSV inputs, so they share a loader
    if args.dataset in ('mini_bionic', 'bigquery'):
        loader: DataframeLoader = DataframeLoader(
            args.blocks_csv,
            args.known_addresses_csv,
            args.transactions_csv,
            args.save_dir,
        )
    else:
        raise Exception(f'Dataset {args.dataset} not supported.')
    algo = DepositCluster(
        loader,
        a_max=args.a_max,
        t_max=args.t_max,
        save_dir=args.save_dir,
    )
    # this saves user/deposit/exchange columns but does not
    # compute weakly connected components. See run_nx.py.
    algo.make_clusters()
    print('done.')
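
# For context, run_nx.py (not shown here) presumably takes the saved
# user/deposit pairs and merges them into final clusters via weakly
# connected components. A minimal sketch with networkx, assuming `pairs`
# is an iterable of (user_address, deposit_address) tuples:
#
#   import networkx as nx
#   G = nx.DiGraph()
#   G.add_edges_from(pairs)
#   clusters = [set(c) for c in nx.weakly_connected_components(G)]
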
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser: ArgumentParser = ArgumentParser()
    parser.add_argument('blocks_csv', type=str, help='path to block data')
    parser.add_argument('transactions_csv', type=str, help='path to transaction data')
    parser.add_argument('known_addresses_csv', type=str, help='path to known address data')
    parser.add_argument('save_dir', type=str, help='path to save output')
    parser.add_argument('--dataset', type=str, default='bigquery',
                        choices=['mini_bionic', 'bigquery'],
                        help='dataset name (default: bigquery)')
    parser.add_argument('--a-max', type=float, default=0.01,
                        help='maximum amount difference (default: 0.01)')
    parser.add_argument('--t-max', type=float, default=3200,
                        help='maximum time difference (default: 3200)')
    args: Any = parser.parse_args()
    main(args)
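
# Example invocation (hypothetical file paths):
#
#   python run_deposit.py blocks.csv transactions.csv known_addresses.csv ./out \
#       --dataset bigquery --a-max 0.01 --t-max 3200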