# ya_cat.py — shopolog.ru catalogue scraper (listing-viewer header and gutter line numbers removed)
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import csv
  4. def get_normalise_str(name, string): # нормализация данных
  5. if name == "IMG":
  6. return string.replace('//www.', 'https://www.')
  7. else: # name == "DATE"
  8. result = string.split('- ')
  9. return result[-1] # -1 последний элемент
  10. def write_csv(data): # конвертация данных в csv
  11. # 'a' - it appends a data in file
  12. with open('catalogue.csv', 'a', newline='', encoding='utf-8') as file_csv:
  13. writer_file = csv.writer(file_csv)
  14. writer_file.writerow([
  15. data['name'],
  16. data['link'],
  17. data['img'],
  18. data['group'],
  19. data['discr'],
  20. data['date']
  21. ])
  22. def get_html(url): # получение dom-html
  23. response = requests.get(url)
  24. if response.ok: # ok == 200
  25. return response.text
  26. print(response.status_code) # ok == 403|404
  27. def get_page_data(html): # получение данных из html
  28. soup = BeautifulSoup(html, 'lxml')
  29. divs = soup.find_all('div', class_='b-catalog__feed__box')
  30. for item in divs:
  31. elem1 = item.find('div', class_="b-catalog__feed__image")
  32. try:
  33. link = elem1.find('a').get('href')
  34. except:
  35. link = 'None'
  36. try:
  37. img = elem1.find('a').find('img').get('src')
  38. except:
  39. img = 'None'
  40. elem2 = item.find('div', class_="b-catalog__feed__text")
  41. try:
  42. header = elem2.find('h3').find('a').text.strip()
  43. except:
  44. header = 'None'
  45. try:
  46. date = elem2.find('h3').find('small').text.strip()
  47. except:
  48. date = 'None'
  49. try:
  50. group = elem2.find('h6').find('a').text.strip()
  51. except:
  52. group = 'None'
  53. cls3 = "b-catalog__feed__text__description"
  54. try:
  55. discr = elem2.find('div', class_=cls3).text.strip()
  56. except:
  57. discr = 'None'
  58. data = {
  59. "name": header,
  60. "link": 'https://www.shopolog.ru'+link,
  61. "img": get_normalise_str("IMG", img),
  62. "group": group,
  63. "discr": discr,
  64. "date": get_normalise_str("DATE", date)
  65. }
  66. # write_csv(data)
  67. print(count_item)
  68. def main():
  69. # url = 'https://yandex.ru/yaca/cat/Entertainment/'
  70. count = 1
  71. while count <= 3:
  72. url = f'https://www.shopolog.ru/company/section/pc/?page={count}'
  73. html = get_html(url)
  74. get_page_data(html)
  75. # print(url)
  76. count = count + 1
  77. if __name__ == '__main__':
  78. main()