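# Scraper for the shopolog.ru company catalogue: downloads listing pages,
# extracts name, link, image, group, description and date for every entry,
# and can append the rows to catalogue.csv.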
import requests
from bs4 import BeautifulSoup
import csv

def get_normalise_str(name, string):  # normalise a scraped field
    if name == "IMG":
        # image URLs arrive protocol-relative ('//www...'); add the scheme
        return string.replace('//www.', 'https://www.')
    else:  # name == "DATE"
        # keep only the part of the date string after the '- ' separator
        result = string.split('- ')
        return result[-1]  # [-1] is the last element

def write_csv(data):  # append one record to the CSV file
    # mode 'a' appends rows instead of overwriting the file
    with open('catalogue.csv', 'a', newline='', encoding='utf-8') as file_csv:
        writer_file = csv.writer(file_csv)
        writer_file.writerow([
            data['name'],
            data['link'],
            data['img'],
            data['group'],
            data['discr'],
            data['date']
        ])
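
# Illustrative addition (not in the original script): since write_csv() opens
# the file in append mode, no header row is ever written. Below is a sketch of
# a helper that creates the file with a header once, before scraping starts;
# the name write_csv_header is a hypothetical choice.
def write_csv_header():
    with open('catalogue.csv', 'w', newline='', encoding='utf-8') as file_csv:
        csv.writer(file_csv).writerow(
            ['name', 'link', 'img', 'group', 'discr', 'date'])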

def get_html(url):  # download the page and return its HTML
    response = requests.get(url)
    if response.ok:  # True for status codes below 400
        return response.text
    print(response.status_code)  # e.g. 403 or 404
    return None
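
# Illustrative variant (an assumption, not from the source): the same fetch
# with an explicit timeout and error handling, so a hung or failed request
# cannot stall the whole run. get_html_safe is a hypothetical name.
def get_html_safe(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raises HTTPError on 4xx/5xx
        return response.text
    except requests.RequestException as exc:
        print(f'Request failed for {url}: {exc}')
        return None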

def get_page_data(html):  # extract catalogue entries from the HTML
    soup = BeautifulSoup(html, 'lxml')
    divs = soup.find_all('div', class_='b-catalog__feed__box')
    for item in divs:
        elem1 = item.find('div', class_="b-catalog__feed__image")
        try:
            link = elem1.find('a').get('href')
        except AttributeError:
            link = 'None'
        try:
            img = elem1.find('a').find('img').get('src')
        except AttributeError:
            img = 'None'
        elem2 = item.find('div', class_="b-catalog__feed__text")
        try:
            header = elem2.find('h3').find('a').text.strip()
        except AttributeError:
            header = 'None'
        try:
            date = elem2.find('h3').find('small').text.strip()
        except AttributeError:
            date = 'None'
        try:
            group = elem2.find('h6').find('a').text.strip()
        except AttributeError:
            group = 'None'
        cls3 = "b-catalog__feed__text__description"
        try:
            discr = elem2.find('div', class_=cls3).text.strip()
        except AttributeError:
            discr = 'None'
        data = {
            "name": header,
            # hrefs come site-relative, so prepend the domain
            "link": 'https://www.shopolog.ru' + link,
            "img": get_normalise_str("IMG", img),
            "group": group,
            "discr": discr,
            "date": get_normalise_str("DATE", date)
        }
        # write_csv(data)  # uncomment to save each row to catalogue.csv
        print(data)  # show the extracted row
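
# Illustrative refactor (not in the original): the repeated try/except blocks
# in get_page_data() can be collapsed into one helper. safe_extract is a
# hypothetical name; it returns the string 'None' to match the original
# fallback values.
def safe_extract(getter):
    try:
        return getter()
    except AttributeError:
        return 'None'

# Example usage inside the loop:
#   link = safe_extract(lambda: elem1.find('a').get('href'))
#   img = safe_extract(lambda: elem1.find('a').find('img').get('src'))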

def main():
    # url = 'https://yandex.ru/yaca/cat/Entertainment/'
    for count in range(1, 4):  # catalogue pages 1 to 3
        url = f'https://www.shopolog.ru/company/section/pc/?page={count}'
        html = get_html(url)
        if html:  # get_html() returns None on a failed request
            get_page_data(html)
        # print(url)


if __name__ == '__main__':
    main()
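
# Dependencies: pip install requests beautifulsoup4 lxml
# ('lxml' is the parser passed to BeautifulSoup). Python 3.6+ is required
# because of the f-string in main().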