known_address.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. """
The base set of known addresses was downloaded from https://www.kaggle.com/hamishhall/labelled-ethereum-addresses/version/1.
It is combined with the etherclust, etherscan and tornado CSVs, and the merged table is dumped as a single known-addresses CSV.
  4. """
  5. import numpy as np
  6. import pandas as pd
  7. from typing import Any, List
  8. def main(args: Any):
  9. kaggle_df: pd.DataFrame = pd.read_csv(args.kaggle_csv)
  10. # spend some time just cleaning the kaggle DF
  11. keys: List[str] = [
  12. 'Address', 'Name', 'Account Type',
  13. 'Entity', 'Label', 'Tags',
  14. ]
  15. kaggle_df: pd.DataFrame = kaggle_df[keys]
  16. kaggle_df.rename(
  17. columns={
  18. 'Address': 'address',
  19. 'Name': 'name',
  20. 'Account Type': 'account_type',
  21. 'Entity': 'entity',
  22. 'Label': 'label',
  23. 'Tags': 'tags',
  24. },
  25. inplace = True,
  26. )
  27. label: pd.Series = kaggle_df.label
  28. label: pd.Series = label.replace('Legit', 1)
  29. label: pd.Series = label.replace('Dodgy', 0)
  30. label: pd.Series = label.astype(bool)
  31. kaggle_df.label = label
  32. account_type: pd.Series = kaggle_df.account_type
  33. account_type: pd.Series = account_type.replace('Smart Contract', 'contract')
  34. account_type: pd.Series = account_type.replace('Wallet', 'eoa')
  35. kaggle_df.account_type = account_type
  36. kaggle_df.entity = kaggle_df.entity.str.lower()
  37. kaggle_df.tags = kaggle_df.tags.str.lower()
  38. kaggle_df = kaggle_df.drop_duplicates('address')
  39. kaggle_df = kaggle_df[~(kaggle_df.address == 'Address')]
  40. # process etherclust.csv
  41. etherclust_df: pd.DataFrame = pd.read_csv(args.etherclust_csv)
  42. etherclust_df: pd.DataFrame = etherclust_df[
  43. ~etherclust_df.address.isin(kaggle_df.address)]
  44. etherclust_df.rename(columns={'type': 'entity'}, inplace=True)
  45. etherclust_df.entity = etherclust_df.entity.str.lower()
  46. etherclust_df.loc[etherclust_df['entity'] == 'wallet', 'entity'] = np.nan
  47. etherclust_df['label'] = 1
  48. etherclust_df['tags'] = np.nan
  49. combined_df: pd.DataFrame = pd.concat(
  50. [kaggle_df, etherclust_df],
  51. ignore_index=True,
  52. )
  53. # process etherscan.csv
  54. etherscan_df: pd.DataFrame = pd.read_csv(args.etherscan_csv)
  55. etherscan_df.rename(columns={'labels': 'label'}, inplace=True)
  56. etherscan_df: pd.DataFrame = etherscan_df[
  57. ~etherscan_df.address.isin(combined_df.address)]
  58. etherscan_df.entity = etherscan_df.entity.str.lower()
  59. etherscan_df['tags'] = np.nan
  60. combined_df: pd.DataFrame = pd.concat(
  61. [combined_df, etherscan_df],
  62. ignore_index=True,
  63. )
  64. # process tornado.csv
  65. tornado_df: pd.DataFrame =pd.read_csv(args.tornado_csv)
  66. tornado_df.rename(columns={
  67. 'legitimacy': 'label',
  68. 'type': 'entity',
  69. }, inplace=True)
  70. tornado_df: pd.DataFrame = tornado_df[
  71. ~tornado_df.address.isin(combined_df.address)]
  72. tornado_df.entity = tornado_df.entity.str.lower()
  73. df: pd.DataFrame = pd.concat(
  74. [combined_df, tornado_df],
  75. ignore_index=True,
  76. )
  77. df.rename(columns={'label': 'legitimacy'}, inplace=True)
  78. df.to_csv(args.known_csv, index=False)
  79. if __name__ == "__main__":
  80. import argparse
  81. parser = argparse.ArgumentParser()
  82. parser.add_argument(
  83. '--kaggle-csv',
  84. type=str,
  85. default='./data/static/kaggle.csv',
  86. help='path to data root (default: ./data/static/kaggle.csv)',
  87. )
  88. parser.add_argument(
  89. '--etherclust-csv',
  90. type=str,
  91. default='./data/static/etherclust.csv',
  92. help='path to data root (default: ./data/static/etherclust.csv)',
  93. )
  94. parser.add_argument(
  95. '--etherscan-csv',
  96. type=str,
  97. default='./data/static/etherscan.csv',
  98. help='path to data root (default: ./data/static/etherscan.csv)',
  99. )
  100. parser.add_argument(
  101. '--tornado-csv',
  102. type=str,
  103. default='./data/static/tornado.csv',
  104. help='path to data root (default: ./data/static/tornado.csv)',
  105. )
  106. parser.add_argument(
  107. '--known-csv',
  108. type=str,
  109. default='./data/static/known_addresses.csv',
  110. help='path to data root (default: ./data/static/known_addresses.csv)',
  111. )
  112. args: Any = parser.parse_args()
  113. main(args)