# main.py — scrapes testimonial pages from catertrax.com into testimonials.csv
import csv
import os
import time

import requests
from bs4 import BeautifulSoup
  6. def cut_quots(text):
  7. result = text.replace('“', '')
  8. return result.replace('”', '')
  9. def clean_since(text):
  10. result = text.split(" ")
  11. return result[2]
  12. def get_html(url):
  13. agent = 'user-agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
  14. header = {"User-Agent": agent}
  15. response = requests.get(url, headers=header)
  16. if response.ok: # ok == 200
  17. return response.text
  18. return response.status_code
  19. def write_csv(data):
  20. path_f = os.path.dirname(os.path.abspath(__file__))
  21. with open(
  22. os.path.join(path_f, "testimonials.csv"), "a", newline='', encoding='utf-8') as file_:
  23. order = [
  24. "category",
  25. "header",
  26. "content",
  27. "avtr",
  28. "autor",
  29. "acc_name",
  30. "since",
  31. "email",
  32. "tel"]
  33. writer_f = csv.DictWriter(file_, fieldnames=order)
  34. writer_f.writerow(data)
  35. def get_arts(html):
  36. soup = BeautifulSoup(html, 'lxml')
  37. div_id = 'testimonial-2364-3-0-0'
  38. arts_cls = "testimonial-post"
  39. articles = soup.find('div', id=div_id).find_all('article', class_=arts_cls)
  40. return articles
  41. def get_page_data(html): # получение данных из html
  42. arts = get_arts(html)
  43. if len(arts) == 0:
  44. print("Parsing container is empty")
  45. else:
  46. for art in arts:
  47. parent_div_cont = art.find('div', class_="testimonial-content")
  48. parent_div_autr = art.find('div', class_="author-details")
  49. try:
  50. cat_cls = "testimonial-category"
  51. category = parent_div_cont.find(
  52. 'span', class_=cat_cls).text.strip()
  53. except:
  54. category = ""
  55. try:
  56. header = parent_div_cont.find('h2').text.strip()
  57. except:
  58. header = ""
  59. try:
  60. cont_cls = "entry-content"
  61. content = parent_div_cont.find(
  62. 'div', class_=cont_cls).text.strip()
  63. except:
  64. content = ""
  65. try:
  66. hide_cls = "coll-hidden"
  67. hide_content = parent_div_cont.find(
  68. 'span', class_=hide_cls).text.strip()
  69. except:
  70. hide_content = ""
  71. try:
  72. avtr = art.find('figure').find('img').get("src")
  73. except:
  74. avtr = ""
  75. try:
  76. autor = parent_div_autr.find(
  77. "p", class_="testimonial-author").text.strip()
  78. except:
  79. autor = ""
  80. try:
  81. acc_name = parent_div_autr.find(
  82. "p", class_="testimonial-author").find('span', class_="account-name").text.strip()
  83. except:
  84. acc_name = ""
  85. try:
  86. since = parent_div_autr.find(
  87. "p", class_="traxer-since").text.strip()
  88. except:
  89. since = ""
  90. try:
  91. email = parent_div_autr.find("ul", class_="testimonial-meta").find(
  92. "li", class_="email").find('a').text.strip()
  93. except:
  94. email = ""
  95. try:
  96. tel = parent_div_autr.find("ul", class_="testimonial-meta").find(
  97. "li", class_="tel").text.strip()
  98. except:
  99. tel = ""
  100. data = {
  101. "category": category,
  102. "header": cut_quots(header),
  103. "content": f"{content} {hide_content}",
  104. "avtr": avtr,
  105. "autor": autor,
  106. "acc_name": acc_name,
  107. "since": clean_since(since),
  108. "email": email,
  109. "tel": tel,
  110. }
  111. print(data)
  112. write_csv(data)
  113. def main():
  114. page = 1
  115. while True:
  116. url = f'https://catertrax.com/why-catertrax/traxers/page/{page}/'
  117. html = get_html(url)
  118. if html == 403 or html == 404:
  119. print(f"Error request: {html}")
  120. break
  121. else:
  122. arts = get_arts(html)
  123. if len(arts) != 0: # если содержит articles
  124. get_page_data(html)
  125. page = page + 1
  126. else:
  127. print("Parsing done!")
  128. break
  129. if __name__ == '__main__':
  130. start_time = time.time()
  131. main()
  132. print("--- %s seconds ---" % (time.time() - start_time))