make_pickle.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
"""
The graph is huge, so we need to break it up into chunks.

This script builds the full graph from a CSV file and pickles it into the
cache directory.
"""
  4. import os
  5. import pandas as pd
  6. from typing import Any, List, Tuple
  7. from src.diff2vec.graph import UndirectedGraph
  8. def main(args: Any):
  9. print('Loading data from CSV')
  10. data: pd.DataFrame = pd.read_csv(args.data_csv)
  11. print('Building graph')
  12. graph: UndirectedGraph = build_graph(data)
  13. print(f'Made graph with {len(graph)} nodes')
  14. edge_file: str = os.path.join(args.cache_dir, f'edges-raw.pickle')
  15. graph.to_pickle(edge_file)
  16. def build_graph(data: pd.DataFrame) -> UndirectedGraph:
  17. node_a: List[int] = data.from_address.to_numpy().tolist()
  18. node_b: List[int] = data.to_address.to_numpy().tolist()
  19. edge_ab: List[Tuple[int, int]] = list(zip(node_a, node_b))
  20. graph: UndirectedGraph = UndirectedGraph()
  21. graph.add_nodes_from(node_a)
  22. graph.add_nodes_from(node_b)
  23. graph.add_edges_from(edge_ab)
  24. return graph
  25. if __name__ == "__main__":
  26. from argparse import ArgumentParser
  27. parser: ArgumentParser = ArgumentParser()
  28. parser.add_argument('data_csv', type=str, help='path to save data')
  29. parser.add_argument('cache_dir', type=str, help='path to cache')
  30. args: Any = parser.parse_args()
  31. main(args)