cmc_pages.py 3.7 KB
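"""Scrape the coin table from coinmarketcap.com page by page and append each
row to cmc-pages.csv. The selectors target the legacy table markup
(cmc-table-row, cmc-table-listing__pagination-button-group) and may need
updating if the site's layout has changed since this was written."""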

import requests
from bs4 import BeautifulSoup
import csv
import re
import os

def get_normalise_str(string):  # data normalisation: split on spaces
    return string.split(' ')

def write_csv(data):  # append one row of data to the CSV file
    path_f = os.path.dirname(os.path.abspath(__file__))
    # 'a' appends to the file instead of overwriting it
    with open(os.path.join(path_f, 'cmc-pages.csv'), 'a', newline='', encoding='utf-8') as file_csv:
        writer_file = csv.writer(file_csv)
        writer_file.writerow([
            data['num'],
            data['tik'],
            data['name'],
            data['url'],
            data['cap'],
            data['price'],
            data['vol'],
            data['chg'],
        ])

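# Note: no header row is written. One easy way to add one is to check
# os.path.exists() on the CSV before the first append and write the column
# names once when the file is new.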
def get_html(url):  # fetch the page and return its HTML, or None on failure
    # Some sites answer 403 to the default requests User-Agent; sending a
    # browser-like one is a common workaround (an assumption, not verified here).
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    if response.ok:  # any status code below 400
        return response.text
    print(response.status_code)  # e.g. 403 or 404
    return None

def get_page_data(html):  # extract the coin data from the HTML
    soup = BeautifulSoup(html, 'lxml')  # takes the HTML and a parser name
    trs = soup.find_all('tr', class_='cmc-table-row')  # table rows
    for tr in trs:
        tds = tr.find_all('td')  # cells of one row
        if not tds:
            print('Error!')
            continue
        # Each cell is parsed independently; a missing element yields 'None'
        # instead of aborting the whole row.
        try:
            num = tds[0].find('div').text.strip()
        except (AttributeError, IndexError):
            num = 'None'
        try:
            name = tds[1].find('div').find('a').text.strip()
        except (AttributeError, IndexError):
            name = 'None'
        try:
            link = tds[1].find('div').find('a').get('href')
        except (AttributeError, IndexError):
            link = 'None'
        try:
            cap = tds[2].find('p').text.strip()
        except (AttributeError, IndexError):
            cap = 'None'
        try:
            price = tds[3].find('a').text.strip()
        except (AttributeError, IndexError):
            price = 'None'
        try:
            volume = tds[4].find('a').text.strip()
        except (AttributeError, IndexError):
            volume = 'None'
        try:
            tiker = tds[5].find('div').text.strip()
        except (AttributeError, IndexError):
            tiker = 'None'
        try:
            change = tds[6].find('div').text.strip()
        except (AttributeError, IndexError):
            change = 'None'
        # The ticker cell holds space-separated parts and the symbol is the
        # second one; fall back to the raw text if the split is too short.
        tik_parts = get_normalise_str(tiker)
        data = {
            'num': num,
            'tik': tik_parts[1] if len(tik_parts) > 1 else tiker,
            'name': name,
            'url': 'https://coinmarketcap.com' + link if link != 'None' else 'None',
            'cap': cap,
            'price': price,
            'vol': volume,
            'chg': change,
        }
        write_csv(data)
        # print(data)

def get_num_page(html):  # find the href of the "Next" pagination link
    soup = BeautifulSoup(html, 'lxml')
    div_class = 'cmc-table-listing__pagination-button-group'
    # Alternative selector: the <a> with data-qa-id="table-listing-button-next".
    pattern = 'Next'
    try:
        page = soup.find('div', class_=div_class).find(
            'a', text=re.compile(pattern)).get('href').strip()
        return page
    except AttributeError:  # no "Next" link, e.g. on the last page
        return None

def main():
    url = 'https://coinmarketcap.com'
    while True:
        # Fetch each page once, parse its rows, then read the next-page link
        # from the same HTML (the original fetched every page twice).
        html = get_html(url)
        if html is None:  # request failed, stop crawling
            print('all done')
            break
        get_page_data(html)
        page = get_num_page(html)
        if not page:  # no next page, we are on the last one
            print('all done')
            break
        print(page)
        url = 'https://coinmarketcap.com' + page

if __name__ == '__main__':
    main()
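
# Usage: python cmc_pages.py
# Each run appends to cmc-pages.csv next to the script; delete or rename the
# file first if you want a fresh dump without duplicate rows.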