run_torn_mine_heuristic.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. import os, json
  2. import pandas as pd
  3. from tqdm import tqdm
  4. import networkx as nx
  5. from typing import Any, Tuple, Dict, List, Set
  6. from src.utils.utils import to_json, from_json, Entity, Heuristic
  7. MINE_POOL_RATES: Dict[str, int] ={
  8. '0.1 ETH': 4,
  9. '1 ETH': 20,
  10. '10 ETH': 50,
  11. '100 ETH': 400,
  12. '100 DAI': 2,
  13. '1000 DAI': 10,
  14. '10000 DAI': 40,
  15. '100000 DAI': 250,
  16. '5000 cDAI': 2,
  17. '50000 cDAI': 10,
  18. '500000 cDAI': 40,
  19. '5000000 cDAI': 250,
  20. '0.1 WBTC': 15,
  21. '1 WBTC': 120,
  22. '10 WBTC': 1000,
  23. }
  24. MINE_POOL: List[str] = list(MINE_POOL_RATES.keys())
  25. def main(args: Any):
  26. if not os.path.isdir(args.save_dir): os.makedirs(args.save_dir)
  27. cache_file: str = os.path.join(args.data_dir, 'heuristic_5_linked_txs.json')
  28. if os.path.isfile(cache_file):
  29. total_linked_txs: Dict[str, Dict[str, Any]] = from_json(cache_file)
  30. else:
  31. deposit_df, withdraw_df, miner_df = load_data(args.data_dir)
  32. deposit_df: pd.DataFrame = deposit_df[deposit_df['tcash_pool'].isin(MINE_POOL)]
  33. withdraw_df: pd.DataFrame = withdraw_df[withdraw_df['tcash_pool'].isin(MINE_POOL)]
  34. unique_deposits: Set[str] = set(deposit_df['from_address'])
  35. unique_withdraws: Set[str] = set(withdraw_df['recipient_address'])
  36. addr2deposits: Dict[str, Any] = address_to_txs_and_blocks(deposit_df, 'deposit')
  37. addr2withdraws: Dict[str, Any] = address_to_txs_and_blocks(withdraw_df, 'withdraw')
  38. total_linked_txs: Dict[str, Dict[str, Any]] = get_total_linked_txs(
  39. miner_df, unique_deposits, unique_withdraws, addr2deposits, addr2withdraws)
  40. w2d: Dict[Tuple[str], List[Tuple[str]]] = \
  41. apply_anonymity_mining_heuristic(total_linked_txs)
  42. clusters, tx2addr = build_clusters(w2d)
  43. tx2block, tx2ts = get_transaction_info(withdraw_df, deposit_df)
  44. address_sets: List[Set[str]] = get_address_sets(clusters, tx2addr)
  45. address_metadata: List[Dict[str, Any]] = get_metadata(address_sets)
  46. clusters_file: str = os.path.join(args.save_dir, 'torn_mine_clusters.json')
  47. tx2addr_file: str = os.path.join(args.save_dir, 'torn_mine_tx2addr.json')
  48. tx2block_file: str = os.path.join(args.save_dir, 'torn_mine_tx2block.json')
  49. tx2ts_file: str = os.path.join(args.save_dir, 'torn_mine_tx2ts.json')
  50. address_file: str = os.path.join(args.save_dir, 'torn_mine_address_set.json')
  51. metadata_file: str = os.path.join(args.save_dir, 'torn_mine_metadata.csv')
  52. to_json(clusters, clusters_file)
  53. to_json(tx2addr, tx2addr_file)
  54. to_json(tx2block, tx2block_file)
  55. to_json(tx2ts, tx2ts_file)
  56. to_json(address_sets, address_file)
  57. address_metadata.to_csv(metadata_file, index=False)
  58. def build_clusters(links: Any) -> Tuple[List[Set[str]], Dict[str, str]]:
  59. graph: nx.DiGraph = nx.DiGraph()
  60. tx2addr: Dict[str, str] = {}
  61. for withdraw_tuple, deposit_tuples in links.items():
  62. withdraw_tx, withdraw_addr, _ = withdraw_tuple
  63. graph.add_node(withdraw_tx)
  64. tx2addr[withdraw_tx] = withdraw_addr
  65. for deposit_tuple in deposit_tuples:
  66. deposit_tx, deposit_addr = deposit_tuple
  67. graph.add_node(deposit_tx)
  68. graph.add_edge(withdraw_tx, deposit_tx)
  69. tx2addr[deposit_tx] = deposit_addr
  70. clusters: List[Set[str]] = [ # ignore singletons
  71. c for c in nx.weakly_connected_components(graph) if len(c) > 1]
  72. return clusters, tx2addr
  73. def get_transaction_info(
  74. withdraw_df: pd.DataFrame,
  75. deposit_df: pd.DataFrame
  76. ) -> Tuple[Dict[str, int], Dict[str, Any]]:
  77. hashes: pd.DataFrame = pd.concat([withdraw_df.hash, deposit_df.hash])
  78. block_numbers: pd.DataFrame = \
  79. pd.concat([withdraw_df.block_number, deposit_df.block_number])
  80. block_timestamps: pd.DataFrame = \
  81. pd.concat([withdraw_df.block_timestamp, deposit_df.block_timestamp])
  82. tx2block = dict(zip(hashes, block_numbers))
  83. tx2ts = dict(zip(hashes, block_timestamps))
  84. return tx2block, tx2ts
  85. def get_address_sets(
  86. tx_clusters: List[Set[str]],
  87. tx2addr: Dict[str, str],
  88. ) -> List[Set[str]]:
  89. """
  90. Stores pairs of addresses that are related to each other. Don't
  91. apply graphs on this because we are going to join this into the
  92. other clusters.
  93. """
  94. address_sets: List[Set[str]] = []
  95. for cluster in tx_clusters:
  96. addr_set: Set[str] = set([tx2addr[tx] for tx in cluster])
  97. addr_set: List[str] = list(addr_set)
  98. if len(addr_set) > 1: # make sure not singleton
  99. for addr1 in addr_set:
  100. for addr2 in addr_set:
  101. if addr1 != addr2:
  102. address_sets.append({addr1, addr2})
  103. return address_sets
  104. def get_metadata(address_sets: List[Set[str]]) -> pd.DataFrame:
  105. """
  106. Stores metadata about addresses to add to db.
  107. """
  108. unique_addresses: Set[str] = set().union(*address_sets)
  109. address: List[str] = []
  110. entity: List[int] = []
  111. conf: List[float] = []
  112. meta_data: List[str] = []
  113. heuristic: List[int] = []
  114. for member in unique_addresses:
  115. address.append(member)
  116. entity.append(Entity.EOA.value)
  117. conf.append(1)
  118. meta_data.append(json.dumps({}))
  119. heuristic.append(Heuristic.TORN_MINE.value)
  120. response: Dict[str, List[Any]] = dict(
  121. address = address,
  122. entity = entity,
  123. conf = conf,
  124. meta_data = meta_data,
  125. heuristic = heuristic,
  126. )
  127. response: pd.DataFrame = pd.DataFrame.from_dict(response)
  128. return response
  129. def load_data(root) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
  130. tornado_addrs: Dict[str, str] = from_json(os.path.join(root, 'tornado_pools.json'))
  131. deposit_df: pd.DataFrame = pd.read_csv(
  132. os.path.join(root, 'lighter_complete_deposit_txs.csv'))
  133. deposit_df['tcash_pool'] = deposit_df['tornado_cash_address'].apply(
  134. lambda addr: tornado_addrs[addr])
  135. withdraw_df: pd.DataFrame = pd.read_csv(
  136. os.path.join(root, 'lighter_complete_withdraw_txs.csv'))
  137. withdraw_df['tcash_pool'] = withdraw_df['tornado_cash_address'].apply(
  138. lambda addr: tornado_addrs[addr])
  139. miner_df: pd.DataFrame = pd.read_csv(os.path.join(root, 'lighter_miner_txs.csv'))
  140. miner_df: pd.DataFrame = miner_df[miner_df['function_call'] == 'w']
  141. return deposit_df, withdraw_df, miner_df
  142. def address_to_txs_and_blocks(txs_df: pd.DataFrame, tx_type: str) -> Dict[str, Any]:
  143. assert tx_type in ['deposit', 'withdraw'], 'Transaction type error'
  144. address_field: str = 'from_address' if tx_type == 'deposit' else 'recipient_address'
  145. addr_to_txs_and_blocks: Dict[str, Any] = {}
  146. for _, row in tqdm(txs_df.iterrows(), total=len(txs_df)):
  147. if row[address_field] not in addr_to_txs_and_blocks.keys():
  148. addr_to_txs_and_blocks[row[address_field]] = \
  149. {row.tcash_pool: [(row.hash, row.block_number)]}
  150. elif row.tcash_pool not in addr_to_txs_and_blocks[row[address_field]].keys():
  151. addr_to_txs_and_blocks[row[address_field]].update(
  152. {row.tcash_pool: [(row.hash, row.block_number)]})
  153. else:
  154. addr_to_txs_and_blocks[row[address_field]][row.tcash_pool].append(
  155. (row.hash, row.block_number))
  156. return addr_to_txs_and_blocks
'''
The helpers below classify an address according to whether it appears in the
set of unique deposit addresses, the set of unique withdraw addresses, or both.
'''
  161. def is_D_type(address: str, deposits: Set[str], withdraws: Set[str]):
  162. return (address in deposits) and (address not in withdraws)
  163. def is_W_type(address: str, deposits: Set[str], withdraws: Set[str]):
  164. return (address not in deposits) and (address in withdraws)
  165. def is_DW_type(address: str, deposits: Set[str], withdraws: Set[str]):
  166. return (address in deposits) and (address in withdraws)
  167. def ap2blocks(anonymity_points: int, pool: str) -> float:
  168. rate = MINE_POOL_RATES[pool]
  169. return anonymity_points / float(rate)
  170. def D_type_anonymity_heuristic(
  171. miner_tx: pd.Series,
  172. addr2deposits: Dict[str, Any],
  173. addr2withdraws: Dict[str, Any],
  174. ) -> Dict[str, Dict[str, Any]]:
  175. d_addr: str = miner_tx.recipient_address
  176. d_addr2w: Dict[str, Dict[str, Any]] = {d_addr: {}}
  177. breakpoint()
  178. for d_pool in addr2deposits[d_addr]:
  179. for (d_hash, d_blocks) in addr2deposits[d_addr][d_pool]:
  180. delta_blocks: float = ap2blocks(miner_tx.anonimity_points, d_pool)
  181. for w_addr in addr2withdraws.keys():
  182. if d_pool in addr2withdraws[w_addr].keys():
  183. for (w_hash, w_blocks) in addr2withdraws[w_addr][d_pool]:
  184. if d_blocks + delta_blocks == w_blocks:
  185. if d_hash not in d_addr2w[d_addr].keys():
  186. d_addr2w[d_addr][d_hash] = [(w_hash, w_addr, delta_blocks)]
  187. else:
  188. d_addr2w[d_addr][d_hash].append((w_hash, w_addr, delta_blocks))
  189. return d_addr2w
  190. def W_type_anonymity_heuristic(
  191. miner_tx: pd.Series,
  192. addr2deposits: Dict[str, Any],
  193. addr2withdraws: Dict[str, Any],
  194. ) -> Dict[str, Dict[str, Any]]:
  195. w_addr: str = miner_tx.recipient_address
  196. w_addr2d: Dict[str, Dict[str, Any]] = {w_addr: {}}
  197. for w_pool in addr2withdraws[w_addr]:
  198. for (w_hash, w_blocks) in addr2withdraws[w_addr][w_pool]:
  199. delta_blocks: float = ap2blocks(miner_tx.anonimity_points, w_pool)
  200. for d_addr in addr2deposits.keys():
  201. if w_pool in addr2deposits[d_addr].keys():
  202. for (d_hash, d_blocks) in addr2deposits[d_addr][w_pool]:
  203. if d_blocks + delta_blocks == w_blocks:
  204. if w_hash not in w_addr2d[w_addr].keys():
  205. w_addr2d[w_addr][w_hash] = [(d_hash, d_addr, delta_blocks)]
  206. else:
  207. w_addr2d[w_addr][w_hash].append((d_hash, d_addr, delta_blocks))
  208. return w_addr2d
  209. def anonymity_mining_heuristic(
  210. miner_tx: pd.Series,
  211. unique_deposits: Set[str],
  212. unique_withdraws: Set[str],
  213. addr2deposits: Dict[str, Any],
  214. addr2withdraws: Dict[str, Any],
  215. ) -> Dict[str, Dict[str, Any]]:
  216. linked_txs: Dict[str, Dict[str, Any]] = {}
  217. if is_D_type(miner_tx.recipient_address, unique_deposits, unique_withdraws):
  218. d_dict: Dict[str, Any] = D_type_anonymity_heuristic(
  219. miner_tx, addr2deposits, addr2withdraws)
  220. if len(d_dict[miner_tx.recipient_address]) != 0:
  221. linked_txs['D'] = d_dict
  222. return linked_txs
  223. elif is_W_type(miner_tx.recipient_address, unique_deposits, unique_withdraws):
  224. w_dict: Dict[str, Any] = W_type_anonymity_heuristic(
  225. miner_tx, addr2deposits, addr2withdraws)
  226. if len(w_dict[miner_tx.recipient_address]) != 0:
  227. linked_txs['W'] = w_dict
  228. return linked_txs
  229. elif is_DW_type(miner_tx.recipient_address, unique_deposits, unique_withdraws):
  230. d_dict: Dict[str, Any] = D_type_anonymity_heuristic(
  231. miner_tx, addr2deposits, addr2withdraws)
  232. if len(d_dict[miner_tx.recipient_address]) != 0:
  233. linked_txs['D'] = d_dict
  234. w_dict: Dict[str, Any] = W_type_anonymity_heuristic(
  235. miner_tx, addr2deposits, addr2withdraws)
  236. if len(w_dict[miner_tx.recipient_address]) != 0:
  237. linked_txs['W'] = w_dict
  238. return linked_txs
  239. return linked_txs
  240. def get_total_linked_txs(
  241. miner_txs: pd.Series,
  242. unique_deposits: Set[str],
  243. unique_withdraws: Set[str],
  244. addr2deposits: Dict[str, Any],
  245. addr2withdraws: Dict[str, Any],
  246. ) -> Dict[str, Dict[str, Any]]:
  247. total_linked_txs: Dict[str, Dict[str, Any]] = {'D': {}, 'W': {}}
  248. for miner_tx in tqdm(miner_txs.itertuples(), total=len(miner_txs)):
  249. linked_txs: Dict[str, Dict[str, Any]] = anonymity_mining_heuristic(
  250. miner_tx, unique_deposits, unique_withdraws, addr2deposits, addr2withdraws)
  251. if len(linked_txs) != 0:
  252. if 'D' in linked_txs.keys():
  253. if len(linked_txs['D']) != 0:
  254. total_linked_txs['D'].update(linked_txs['D'])
  255. if 'W' in linked_txs.keys():
  256. if len(linked_txs['W']) != 0:
  257. total_linked_txs['W'].update(linked_txs['W'])
  258. return total_linked_txs
  259. def apply_anonymity_mining_heuristic(
  260. total_linked_txs: Dict[str, Dict[str, Any]],
  261. ) -> Dict[Tuple[str], List[Tuple[str]]]:
  262. """
  263. The final version of the results is obtained applying this function
  264. to the output of the 'apply_anonimity_mining_heuristic' function.
  265. w2d -> withdraws and blocks to deposits
  266. """
  267. w2d: Dict[Tuple[str], List[Tuple[str]]] = {}
  268. for addr in total_linked_txs['W'].keys():
  269. for hsh in total_linked_txs['W'][addr]:
  270. delta_blocks: float = total_linked_txs['W'][addr][hsh][0][2]
  271. w2d[(hsh, addr, delta_blocks)] = [
  272. (t[0],t[1]) for t in total_linked_txs['W'][addr][hsh]]
  273. for addr in total_linked_txs['D'].keys():
  274. for hsh in total_linked_txs['D'][addr]:
  275. for tx_tuple in total_linked_txs['D'][addr][hsh]:
  276. if tx_tuple[0] not in w2d.keys():
  277. w2d[tuple(tx_tuple)] = [(hsh, addr)]
  278. else:
  279. if (hsh, addr) not in w2d[tx_tuple]:
  280. w2d[tuple(tx_tuple)].append((hsh, addr))
  281. return w2d
  282. if __name__ == "__main__":
  283. from argparse import ArgumentParser
  284. parser: ArgumentParser = ArgumentParser()
  285. parser.add_argument('data_dir', type=str, help='path to tornado cash data')
  286. parser.add_argument('save_dir', type=str, help='folder to save matches')
  287. args: Any = parser.parse_args()
  288. main(args)