# domclick_prcr.py
import csv
import re
import time

import requests
from bs4 import BeautifulSoup
  6. # olan_link = 'https://ekaterinburg.olan.ru/sale-flat/one-room/114031530-45-0-m-etazh-6-15-8500000-rub-ul-bazhova-ekaterinburg-munitsipalnoe-obrazovanie'
  7. # main_link = input("enter mirkvartir link ex.(https://www.mirkvartir.ru/313375186/)\n--> ")
  8. #GET PHONE NUMBER
  9. main_link = 'https://ekaterinburg.domclick.ru/card/sale__flat__1578505738?appmetrica_tracking_id=748334322586582078&referrer=reattribution%3D1&utm_campaign=vitrina_frk_jan-dec2022_20211200028_fid_free_sale_online&utm_medium=card&utm_source=2gis&utm_term=1578505738'
  10. # main_link = 'https://ekaterinburg.domclick.ru/card/sale__flat__2056970632'
  11. get_number = re.findall(r'\/\w*__\w*__\d*', main_link)[0]
  12. get_number = re.findall(r'\d*$', get_number)[0]
  13. get_phone_link = f'https://offer-card.domclick.ru/api/v3/offers/phone/{get_number}'
  14. get_token_link = f'https://offer-card.domclick.ru/api/v3/public_request/{get_number}'
  15. get_token_link_header = {
  16. "Host": "offer-card.domclick.ru",
  17. "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
  18. "Accept": "application/json, text/plain, */*",
  19. "Accept-Language": "en-US,en;q=0.5",
  20. "Accept-Encoding": "gzip, deflate, br",
  21. "Referer": "https://ekaterinburg.domclick.ru/",
  22. "Origin": "https://ekaterinburg.domclick.ru",
  23. "Connection": "keep-alive",
  24. "Cookie": "ns_session=813382d8-0cca-4b57-9783-2562ecd40bc1; ftgl_cookie_id=ba202698a83b70f7b94b55a2d6e9bc3f; currentRegionGuid=962c3758-8514-4c8f-91fe-aa465d78e56f; currentLocalityGuid=0d475b79-88de-4054-818c-37d8f9d0d440; rent-experiment=false; logoSuffix=; RETENTION_COOKIES_NAME=54c39e9b9044410f9acfd73e0a30ad40:WzKcZT-ludN8iMdTKEZF1lAtT7g; sessionId=c4eaa2e7bd454539b11b118b46d689d1:JG0GtBq9AO-S_AzwPl_TbJ7DewU; UNIQ_SESSION_ID=88909ef999ce432f991898d73eccf894:5Ful5CISAeg313OnCZf3E3Nt3xQ; regionName=0d475b79-88de-4054-818c-37d8f9d0d440:%D0%95%D0%BA%D0%B0%D1%82%D0%B5%D1%80%D0%B8%D0%BD%D0%B1%D1%83%D1%80%D0%B3; _sa=SA1.b011a8d8-2ee5-4512-9649-50836b3c9dd0.1711597785; regionAlert=1; dtCookie=v_4_srv_7_sn_B503EC1C773551CA00D99475DFFC8502_perc_100000_ol_0_mul_1_app-3Aca312da39d5a5d07_1_app-3A6ea6d147da1fb68a_1_rcs-3Acss_0; region={%22data%22:{%22name%22:%22%D0%9C%D0%BE%D1%81%D0%BA%D0%B2%D0%B0%22%2C%22kladr%22:%2277%22%2C%22guid%22:%221d1463ae-c80f-4d19-9331-a1b68a85b553%22}%2C%22isAutoResolved%22:true}; qrator_ssid=1714120909.550.ngZxXW1uuOacNFhU-9enn4bma98dn7c08hvcmhg3rn823gp2e; qrator_jsid=1714120943.341.QFj0b9VUpNkDl0ea-mjgdn0arifjb54nj400lu7kq2ke80gfu",
  25. "Sec-Fetch-Dest": "empty",
  26. "Sec-Fetch-Mode": "cors",
  27. "Sec-Fetch-Site": "same-site"
  28. }
  29. r_token = requests.get(get_token_link, headers=get_token_link_header)
  30. token = r_token.json()['result']['token']
  31. get_phone_link_header = {
  32. "Host": "offer-card.domclick.ru",
  33. "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
  34. "Accept": "application/json, text/plain, */*",
  35. "Accept-Language": "en-US,en;q=0.5",
  36. "Accept-Encoding": "gzip, deflate, br",
  37. "Referer": "https://ekaterinburg.domclick.ru/",
  38. "research-api-token": f'{token}',
  39. "Origin": "https://ekaterinburg.domclick.ru",
  40. "Connection": "keep-alive",
  41. "Cookie": "ns_session=813382d8-0cca-4b57-9783-2562ecd40bc1; ftgl_cookie_id=ba202698a83b70f7b94b55a2d6e9bc3f; currentRegionGuid=962c3758-8514-4c8f-91fe-aa465d78e56f; currentLocalityGuid=0d475b79-88de-4054-818c-37d8f9d0d440; rent-experiment=false; logoSuffix=; RETENTION_COOKIES_NAME=54c39e9b9044410f9acfd73e0a30ad40:WzKcZT-ludN8iMdTKEZF1lAtT7g; sessionId=c4eaa2e7bd454539b11b118b46d689d1:JG0GtBq9AO-S_AzwPl_TbJ7DewU; UNIQ_SESSION_ID=88909ef999ce432f991898d73eccf894:5Ful5CISAeg313OnCZf3E3Nt3xQ; regionName=0d475b79-88de-4054-818c-37d8f9d0d440:%D0%95%D0%BA%D0%B0%D1%82%D0%B5%D1%80%D0%B8%D0%BD%D0%B1%D1%83%D1%80%D0%B3; _sa=SA1.b011a8d8-2ee5-4512-9649-50836b3c9dd0.1711597785; regionAlert=1; dtCookie=v_4_srv_7_sn_B503EC1C773551CA00D99475DFFC8502_perc_100000_ol_0_mul_1_app-3Aca312da39d5a5d07_1_app-3A6ea6d147da1fb68a_1_rcs-3Acss_0; region={%22data%22:{%22name%22:%22%D0%9C%D0%BE%D1%81%D0%BA%D0%B2%D0%B0%22%2C%22kladr%22:%2277%22%2C%22guid%22:%221d1463ae-c80f-4d19-9331-a1b68a85b553%22}%2C%22isAutoResolved%22:true}; qrator_ssid=1714120909.550.ngZxXW1uuOacNFhU-9enn4bma98dn7c08hvcmhg3rn823gp2e; qrator_jsid=1714120943.341.QFj0b9VUpNkDl0ea-mjgdn0arifjb54nj400lu7kq2ke80gfu",
  42. "Sec-Fetch-Dest": "empty",
  43. "Sec-Fetch-Mode": "cors",
  44. "Sec-Fetch-Site": "same-site"
  45. }
  46. r_phone = requests.get(get_phone_link, headers=get_phone_link_header)
  47. #PARSE THE SITE
  48. get_parce_header = {
  49. "Host": "ekaterinburg.domclick.ru",
  50. "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
  51. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  52. "Accept-Language": "en-US,en;q=0.5",
  53. "Accept-Encoding": "gzip, deflate, br",
  54. "Connection": "keep-alive",
  55. "Cookie": "ns_session=813382d8-0cca-4b57-9783-2562ecd40bc1; ftgl_cookie_id=ba202698a83b70f7b94b55a2d6e9bc3f; currentRegionGuid=962c3758-8514-4c8f-91fe-aa465d78e56f; currentLocalityGuid=0d475b79-88de-4054-818c-37d8f9d0d440; logoSuffix=; RETENTION_COOKIES_NAME=54c39e9b9044410f9acfd73e0a30ad40:WzKcZT-ludN8iMdTKEZF1lAtT7g; sessionId=c4eaa2e7bd454539b11b118b46d689d1:JG0GtBq9AO-S_AzwPl_TbJ7DewU; UNIQ_SESSION_ID=88909ef999ce432f991898d73eccf894:5Ful5CISAeg313OnCZf3E3Nt3xQ; regionName=0d475b79-88de-4054-818c-37d8f9d0d440:%D0%95%D0%BA%D0%B0%D1%82%D0%B5%D1%80%D0%B8%D0%BD%D0%B1%D1%83%D1%80%D0%B3; _sa=SA1.b011a8d8-2ee5-4512-9649-50836b3c9dd0.1711597785; regionAlert=1; dtCookie=v_4_srv_7_sn_B503EC1C773551CA00D99475DFFC8502_perc_100000_ol_0_mul_1_app-3Aca312da39d5a5d07_1_app-3A6ea6d147da1fb68a_1_rcs-3Acss_0; region={%22data%22:{%22name%22:%22%D0%9C%D0%BE%D1%81%D0%BA%D0%B2%D0%B0%22%2C%22kladr%22:%2277%22%2C%22guid%22:%221d1463ae-c80f-4d19-9331-a1b68a85b553%22}%2C%22isAutoResolved%22:true}; qrator_ssid=1714187591.260.yl2HdazNqknLfm0g-cufi92o3qh3g4depdh3q2u7a9oh0mrkc; qrator_jsid=1714187590.857.14fc282zI3ao25tV-tifc2eul1allf3vc9057800949aq0i79; is-lotto-banner-hidden=true; rent-experiment=false; _visitId=240fc726-37b0-4e76-826e-1e52d0d07501-887c3e444c005759",
  56. "Sec-Fetch-Dest": "document",
  57. "Sec-Fetch-Mode": "navigate",
  58. "Sec-Fetch-Site": "cross-site"
  59. }
  60. r_parce = requests.get(main_link, headers=get_parce_header)
  61. soup = BeautifulSoup(r_parce.text, 'html5lib')
  62. find_main_info = soup.find_all("div", class_="adkhV")
  63. find_price = soup.find("div", class_="ZS0ck")
  64. find_address = soup.find_all("span", class_="ItUnT")
  65. find_remont = soup.find_all("li", class_="_cGv6")
  66. find_name = soup.find("a", class_="YmqEJ")
  67. if(find_name == None):
  68. find_name = False
  69. address = re.findall(r'улица\s\w*\,\s\d*.\d*|\w*\sулица\,\s\d*.\d*|улица\s\w*\s\(\w*\.\s\w*\)\,\s\d*.\d*|улица\s\w*\s\w*\s\w*\,\s\d*.\d*', find_address[0].get_text())[0];
  70. address_street = re.findall(r'\s[a-zA-Zа-яА-Я]{3,}', address)[0];
  71. address_number = re.findall(r'\d*$', address.strip())[0];
  72. full_info_json = {
  73. "rooms": '',
  74. "full_space": find_main_info[0].get_text(),
  75. "kitchen_space": find_main_info[2].get_text(),
  76. "room_space": find_main_info[1].get_text(),
  77. "flat_height": re.findall(r'^\d*', find_main_info[3].get_text())[0],
  78. "flats": re.findall(r'\d*$', find_main_info[3].get_text())[0],
  79. "price": re.findall(r'\d*\s\d*\s\d*', find_price.get_text())[0].replace(' ', ''),
  80. "address_street": address_street.strip(),
  81. "address_flat_number": address_number,
  82. "phone": r_phone.json()['result']['phone'],
  83. "name": find_name.get_text() if find_name else "0",
  84. "remont": '',
  85. "link": main_link,
  86. }
  87. for i in find_remont:
  88. if(len(re.findall(r'Ремонт', i.get_text())) != 0):
  89. remont = re.findall(r'\w*$', i.get_text())[0]
  90. full_info_json['remont'] = remont
  91. elif(len(re.findall(r'Комнат', i.get_text())) != 0):
  92. komnat = re.findall(r'\d*$', i.get_text())[0]
  93. full_info_json['rooms'] = komnat
  94. with open('../flat_club.csv', 'a', newline='') as csvfile:
  95. spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  96. # spamwriter.writerow(
  97. # ['Улица'] +
  98. # ['№ Дома'] +
  99. # ['Год'] +
  100. # ['Ссылка'] +
  101. # ['Этаж'] +
  102. # ['Этажей'] +
  103. # ['Общая'] +
  104. # ['Кухня'] +
  105. # ['Комната'] +
  106. # ['Потолок'] +
  107. # ['Ремонт'] +
  108. # ['Стоимость'] +
  109. # ['Телефон'] +
  110. # ['Имя']
  111. # )
  112. spamwriter.writerow(
  113. [full_info_json['address_street']] +
  114. [full_info_json['address_flat_number']] +
  115. ["0"] +
  116. [full_info_json['link']] +
  117. [full_info_json['flat_height']] +
  118. [full_info_json['flats']] +
  119. [full_info_json['full_space']] +
  120. [full_info_json['kitchen_space']] +
  121. [full_info_json['room_space']] +
  122. ["0"] +
  123. [full_info_json['remont']] +
  124. [full_info_json['price']] +
  125. [full_info_json['phone']] +
  126. [full_info_json['name']]
  127. )
  128. time.sleep(1)
  129. print("OK!")