# etherscan.py
"""
Known issue: currently getting HTTP 403 (permission denied) errors,
but only when running on EC2.
"""
import urllib
import urllib.error
import urllib.request
from typing import Optional, Any, Dict, List

from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet, NavigableString
  9. def clean_text(text):
  10. return text.strip().replace(' ', ' ')
  11. def safe_get_props(component: Tag) -> List[str]:
  12. """Pops child conent into a stack. Ignores empty."""
  13. prop_value: List[str] = []
  14. for obj in component.children:
  15. if isinstance(obj, NavigableString):
  16. if len(obj.strip()) == 0 :
  17. continue
  18. else:
  19. prop_value.append(obj.strip())
  20. else:
  21. if len(obj.text) > 0:
  22. prop_value.append(obj.text)
  23. return prop_value
  24. def get_etherscan_page(address: str) -> Optional[bytes]:
  25. # https://stackoverflow.com/questions/28396036/python-3-4-urllib-request-error-http-403
  26. # simulate mozilla browser to download webpage
  27. headers = {
  28. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
  29. }
  30. uri: str = f'https://etherscan.io/address/{address}'
  31. request = urllib.request.Request(uri, headers = headers)
  32. response = urllib.request.urlopen(request)
  33. if response.code == 200:
  34. data = response.read()
  35. else:
  36. data = None
  37. return data
  38. def get_etherscan_data(page_bytes: Optional[bytes]) -> Dict[str, Any]:
  39. """
  40. Pull out all the data from Etherscan on what the transaction is doing.
  41. This funciton is riddled with safe returns.
  42. """
  43. if page_bytes is None: # garbage in, garbage out
  44. return None
  45. def uncapitalize(s):
  46. if len(s) == 0:
  47. return s
  48. elif len(s) == 1:
  49. return s.lower()
  50. else:
  51. if s[0] == s[0].upper():
  52. s = s[0].lower() + s[1:]
  53. return s
  54. soup: BeautifulSoup = BeautifulSoup(page_bytes, 'html.parser')
  55. summary_div: Tag = soup.find('div', {'id': 'ContentPlaceHolder1_divSummary'})
  56. if summary_div is None: return {} # nothing to do
  57. summary_res: ResultSet = summary_div.findChildren('div', recursive=False)
  58. if len(summary_res) == 0: return {} # nothing to do
  59. summary: Tag = summary_res[0]
  60. body_res: ResultSet = summary.findChildren('div', {'class': 'card-body'})
  61. if len(body_res) == 0: return {} # nothing to do
  62. body: Tag = body_res[0]
  63. columns: ResultSet = body.findChildren('div', recursive=False)
  64. if len(columns) < 2: return {} # incorrect number of columns.
  65. data: Dict[str, Any] = dict() # collect all data
  66. # handle the first two divs the same way
  67. for column in columns[:2]:
  68. items: ResultSet = column.findChildren('div', recursive=False)
  69. if len(items) != 2:
  70. continue # this is not unexpected so we should ignore
  71. k, v = items[0].text.strip(), items[1].text.strip()
  72. k: str = k.replace(':', '').strip()
  73. k: str = uncapitalize(k)
  74. data[k] = v
  75. if len(columns) > 2:
  76. # deal with third column
  77. content_res: ResultSet = columns[2].findChildren(
  78. 'a', {'id': 'availableBalanceDropdown'})
  79. if len(content_res) > 0:
  80. content: Tag = content_res[0]
  81. token_value: str = content.text.strip()
  82. token_value: str = token_value.split('\n')[0]
  83. data['tokenValue'] = token_value
  84. token_res: ResultSet = content.findChildren('span', recursive=False)
  85. if len(token_res) > 0:
  86. token: Tag = token_res[0]
  87. num_tokens: int = int(token.text)
  88. data['numTokenContracts'] = num_tokens
  89. return data
  90. def query_etherscan(address: str):
  91. data = get_etherscan_page(address)
  92. return get_etherscan_data(data)
  93. if __name__ == "__main__":
  94. from pprint import pprint
  95. address: str = '0x49516e20b5692839f877a7e9aa62006a5d02a7b1'
  96. data = get_etherscan_page(address)
  97. data = get_etherscan_data(data)
  98. pprint(data)