123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- import requests
- from bs4 import BeautifulSoup
- import os
- import csv
- import time
def cut_quots(text):
    """Strip typographic (curly) double quotes from *text* and return it."""
    for quote in ('“', '”'):
        text = text.replace(quote, '')
    return text
def clean_since(text):
    """Return the third whitespace-separated token of *text*.

    Used to pull the year out of a "Traxer since <year>" label.

    Fix: the old version did ``text.split(" ")[2]`` unconditionally and
    raised IndexError whenever *text* had fewer than three tokens —
    notably the empty string that get_page_data passes when the
    "since" element is missing.  Such inputs now yield "".
    """
    parts = text.split(" ")
    return parts[2] if len(parts) > 2 else ""
def get_html(url):
    """Fetch *url* and return the response body as text.

    On a non-OK response (status >= 400) the integer status code is
    returned instead of text; callers distinguish the two cases by
    comparing against status codes.

    Fix: the header value previously began with a literal
    "user-agent: " prefix, so the server received a malformed
    User-Agent header ("User-Agent: user-agent: Mozilla/...").
    """
    agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) '
             'Chrome/84.0.4147.125 Safari/537.36')
    response = requests.get(url, headers={"User-Agent": agent})
    if response.ok:  # True for any status code < 400
        return response.text
    return response.status_code
def write_csv(data):
    """Append one testimonial row to testimonials.csv next to this script.

    *data* is a dict whose keys match the fixed field list below.

    Fix: the old version never wrote a header row, producing a CSV with
    no column names.  The header is now written once, when the file is
    created (or found empty).
    """
    fieldnames = [
        "category",
        "header",
        "content",
        "avtr",
        "autor",
        "acc_name",
        "since",
        "email",
        "tel",
    ]
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "testimonials.csv")
    # Only emit the header for a brand-new (or empty) file, so repeated
    # appends don't interleave header rows with data rows.
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, "a", newline='', encoding='utf-8') as file_:
        writer = csv.DictWriter(file_, fieldnames=fieldnames)
        if need_header:
            writer.writeheader()
        writer.writerow(data)
def get_arts(html):
    """Return all testimonial <article> tags found in *html*.

    Fix: when the testimonial container div is absent (e.g. requesting
    a page past the last one), ``soup.find`` returns None and the old
    chained ``.find_all`` call raised AttributeError.  An empty list is
    now returned instead, letting main() terminate cleanly.
    """
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', id='testimonial-2364-3-0-0')
    if container is None:
        return []
    return container.find_all('article', class_="testimonial-post")
def get_page_data(html):
    """Extract every testimonial on the page and stream each record to
    stdout and testimonials.csv (via write_csv).

    A missing sub-element yields an empty-string field for that record
    instead of aborting the page.

    Fix: ten bare ``except:`` clauses (which also swallow
    KeyboardInterrupt/SystemExit) are replaced by explicit None checks;
    BeautifulSoup's ``find`` returns None on a miss, which is the only
    failure mode these try blocks were guarding against.
    """

    def find_in(node, *args, **kwargs):
        # None-safe .find(): a miss anywhere in a chain propagates None.
        return node.find(*args, **kwargs) if node is not None else None

    def text_of(node):
        # None-safe .text.strip(): a missing element becomes "".
        return node.text.strip() if node is not None else ""

    arts = get_arts(html)
    if not arts:
        print("Parsing container is empty")
        return

    for art in arts:
        cont = art.find('div', class_="testimonial-content")
        autr = art.find('div', class_="author-details")

        category = text_of(find_in(cont, 'span', class_="testimonial-category"))
        header = text_of(find_in(cont, 'h2'))
        content = text_of(find_in(cont, 'div', class_="entry-content"))
        hide_content = text_of(find_in(cont, 'span', class_="coll-hidden"))

        img = find_in(find_in(art, 'figure'), 'img')
        # .get("src") may itself return None; normalise to "".
        avtr = img.get("src") if img is not None else ""
        avtr = avtr or ""

        author_p = find_in(autr, "p", class_="testimonial-author")
        autor = text_of(author_p)
        acc_name = text_of(find_in(author_p, 'span', class_="account-name"))
        since = text_of(find_in(autr, "p", class_="traxer-since"))

        meta = find_in(autr, "ul", class_="testimonial-meta")
        email = text_of(find_in(find_in(meta, "li", class_="email"), 'a'))
        tel = text_of(find_in(meta, "li", class_="tel"))

        data = {
            "category": category,
            "header": cut_quots(header),
            "content": f"{content} {hide_content}",
            "avtr": avtr,
            "autor": autor,
            "acc_name": acc_name,
            # Guard: clean_since indexes the third token and would raise
            # IndexError on the empty string produced by a missing element.
            "since": clean_since(since) if since else "",
            "email": email,
            "tel": tel,
        }
        print(data)
        write_csv(data)
def main():
    """Crawl testimonial pages sequentially, starting at page 1, until
    an error response or a page without testimonials is reached.
    """
    page = 1
    while True:
        url = f'https://catertrax.com/why-catertrax/traxers/page/{page}/'
        html = get_html(url)
        # get_html returns an int status code on ANY non-OK response.
        # The old check only recognised 403/404, so e.g. a 500 fell
        # through and the int crashed BeautifulSoup in get_arts.
        if isinstance(html, int):
            print(f"Error request: {html}")
            break
        arts = get_arts(html)
        if not arts:
            print("Parsing done!")
            break
        get_page_data(html)
        page += 1
if __name__ == '__main__':
    # Run the crawl and report wall-clock duration.
    started = time.time()
    main()
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)
|