make_h5.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
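"""
Build an undirected graph from a CSV of address pairs and write it to disk
as ``nodes.json`` and ``edges.h5``.

The graph is huge: we need to break it up into chunks.
"""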
  4. import os
  5. import h5py
  6. import pandas as pd
  7. from typing import Any, List, Tuple
  8. from src.diff2vec.graph import UndirectedGraph
  9. def main(args: Any):
  10. print('Loading data from CSV')
  11. data: pd.DataFrame = pd.read_csv(args.data_csv)
  12. print('Building graph')
  13. graph: UndirectedGraph = build_graph(data)
  14. print(f'Made graph with {len(graph)} nodes')
  15. node_file: str = os.path.join(args.cache_dir, f'nodes.json')
  16. edge_file: str = os.path.join(args.cache_dir, f'edges.h5')
  17. graph.to_h5(node_file, edge_file)
  18. def build_graph(data: pd.DataFrame) -> UndirectedGraph:
  19. node_a: List[int] = data.from_address.to_numpy().tolist()
  20. node_b: List[int] = data.to_address.to_numpy().tolist()
  21. edge_ab: List[Tuple[int, int]] = list(zip(node_a, node_b))
  22. graph: UndirectedGraph = UndirectedGraph()
  23. graph.add_nodes_from(node_a)
  24. graph.add_nodes_from(node_b)
  25. graph.add_edges_from(edge_ab)
  26. return graph
  27. if __name__ == "__main__":
  28. from argparse import ArgumentParser
  29. parser: ArgumentParser = ArgumentParser()
  30. parser.add_argument('data_csv', type=str, help='path to save data')
  31. parser.add_argument('cache_dir', type=str, help='path to cache')
  32. args: Any = parser.parse_args()
  33. main(args)