import csv
import os
import re

import requests
from bs4 import BeautifulSoup
def get_normalise_str(string):  # normalise data: split a cell's text into tokens
    result = string.split(' ')
    return result
def write_csv(data):  # append one row of parsed data to the CSV file
    path_f = os.path.dirname(os.path.abspath(__file__))
    # mode 'a' appends rows to the file instead of overwriting it
    with open(os.path.join(path_f, 'cmc-pages.csv'), 'a', newline='', encoding='utf-8') as file_csv:
        writer_file = csv.writer(file_csv)
        writer_file.writerow([
            data['num'],
            data['tik'],
            data['name'],
            data['url'],
            data['cap'],
            data['price'],
            data['vol'],
            data['chg'],
        ])
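# Optional: the file is opened in append mode, so repeated runs keep adding
# rows with no header line. A minimal sketch of writing a header once, only
# when the file does not exist yet (the column names mirror the data dict and
# are an assumption, not part of the original script):
def write_csv_header():
    path_f = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(path_f, 'cmc-pages.csv')
    if not os.path.exists(file_path):
        with open(file_path, 'w', newline='', encoding='utf-8') as file_csv:
            csv.writer(file_csv).writerow(
                ['num', 'tik', 'name', 'url', 'cap', 'price', 'vol', 'chg'])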
def get_html(url):  # fetch the page and return its HTML
    response = requests.get(url)
    if response.ok:  # True for status codes below 400
        return response.text
    print(response.status_code)  # e.g. 403 or 404
    return None
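# Note: sites like CoinMarketCap often answer 403 to clients that send no
# browser-like User-Agent. A hedged variant that sends one plus a timeout (the
# header value and timeout are illustrative, not from the original script):
def get_html_with_headers(url):
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; cmc-scraper/0.1)'}
    response = requests.get(url, headers=headers, timeout=30)
    if response.ok:
        return response.text
    print(response.status_code)
    return None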
def get_page_data(html):  # extract coin data from the HTML
    soup = BeautifulSoup(html, 'lxml')  # takes the HTML and the parser name
    trs = soup.find_all('tr', class_='cmc-table-row')  # the table rows
    for tr in trs:  # the table cells
        tds = tr.find_all('td')
        if not tds:
            print('Error!')
            continue
        try:
            num = tds[0].find('div').text.strip()
        except (AttributeError, IndexError):
            num = 'None'
        try:
            name = tds[1].find('div').find('a').text.strip()
        except (AttributeError, IndexError):
            name = 'None'
        try:
            link = tds[1].find('div').find('a').get('href')
        except (AttributeError, IndexError):
            link = 'None'
        try:
            cap = tds[2].find('p').text.strip()
        except (AttributeError, IndexError):
            cap = 'None'
        try:
            price = tds[3].find('a').text.strip()
        except (AttributeError, IndexError):
            price = 'None'
        try:
            volume = tds[4].find('a').text.strip()
        except (AttributeError, IndexError):
            volume = 'None'
        try:
            ticker = tds[5].find('div').text.strip()
        except (AttributeError, IndexError):
            ticker = 'None'
        try:
            change = tds[6].find('div').text.strip()
        except (AttributeError, IndexError):
            change = 'None'
        # the cell appears to hold the supply followed by the symbol,
        # e.g. "18,500,000 BTC", so the symbol is the second token
        tokens = get_normalise_str(ticker)
        data = {
            'num': num,
            'tik': tokens[1] if len(tokens) > 1 else ticker,
            'name': name,
            'url': 'https://coinmarketcap.com' + link,
            'cap': cap,
            'price': price,
            'vol': volume,
            'chg': change,
        }
        write_csv(data)
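# Aside: the repeated try/except blocks above could be collapsed into a small
# helper like the one below (hypothetical, not part of the original script):
def safe_text(node):
    # return the stripped text of a tag, or 'None' if the tag was not found
    try:
        return node.text.strip()
    except AttributeError:
        return 'None'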
def get_num_page(html):  # find the link to the next page, or False if absent
    soup = BeautifulSoup(html, 'lxml')
    div_class = 'cmc-table-listing__pagination-button-group'
    try:
        # locate the pagination block and the anchor labelled "Next"
        page = soup.find('div', class_=div_class).find(
            'a', string=re.compile('Next')).get('href').strip()
        return page
    except AttributeError:
        return False
def main():
    url = 'https://coinmarketcap.com'
    while True:  # fetch each page once, parse it, then follow the "Next" link
        html = get_html(url)
        if html is None:  # the request failed (e.g. 403 or 404)
            print('request failed')
            break
        get_page_data(html)
        page = get_num_page(html)
        if not page:
            print('all done')
            break
        print(page)
        url = 'https://coinmarketcap.com' + page
if __name__ == '__main__':
    main()
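# Usage sketch: running the script crawls coinmarketcap.com page by page and
# appends one row per coin to cmc-pages.csv next to the script. The CSS class
# names above match the site's markup at the time this was written; if the
# layout changes, the selectors will need updating.
#     python scraper.py   # the file name is illustrative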