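"""Proxy checker: fetch an IP-echo page through each proxy in a list and
append every proxy whose reported IP matches its own address to ip.txt.

Expects useragents.txt and proxy_<protocol>.txt next to this script.
"""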
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Proxy source: http://proxylist.hidemyass.com/search-1299627  # listable
def get_html(url, headers=None, proxy=None):
    """Fetch *url* through the given proxy and return the Response object."""
    print('---------get_html------------')
    print(f"proxy: {proxy}\nheaders: {headers}")
    # A timeout keeps a dead proxy from hanging the whole run.
    response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    return response
def get_my_ip(html, proxy, protocol):
    """Parse the IP the page reports; if it matches the proxy's own IP,
    record the proxy as working in ip.txt."""
    print('---------get_my_ip------------')
    # proxy looks like 'http://1.2.3.4:3128' -> pull out the bare IP.
    _, address = proxy.split('//')
    proxy_ip, _ = address.split(':')
    ip = ''
    agent = ''
    soup = BeautifulSoup(html, 'lxml')
    if protocol == 'http':
        # Layout of http://sitespy.ru/my-ip: IP and user agent sit in
        # adjacent <span> elements inside div.ip-block.
        try:
            ip = soup.find('div', class_="ip-block").find(
                'span', class_='ip').text.strip()
        except AttributeError:
            ip = 'Not found'
        try:
            agent = soup.find('div', class_="ip-block").find(
                'span', class_='ip').find_next_sibling('span').text.strip()
        except AttributeError:
            agent = 'Not found'
    else:
        # Layout of https://yandex.ru/internet/: the first list item under
        # #techinfocontent holds the visible IP address.
        divs = soup.find('div', id='techinfocontent').find_all(
            'dl', class_='list-info__content')
        lists = divs[0].find_all('div', class_="list-info__item")
        ip = lists[0].find('div', class_="list-info__renderer").text.strip()
        agent = 'agent'
    print(f"{ip}\n{proxy_ip}")
    if ip == proxy_ip:
        # The page sees the proxy's address, not ours -- keep this proxy.
        path_f = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path_f, "ip.txt"), "a") as _file:
            _file.write(proxy + '\n')
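# A sketched alternative: the commented-out https://httpbin.org/ip endpoint
# in main() returns JSON like {"origin": "1.2.3.4"}, which would make the
# same check possible without HTML scraping. Illustrative only; main() does
# not call this helper.
def proxy_is_anonymous(response_json, proxy):
    """Return True if the IP reported by httpbin matches the proxy's IP."""
    proxy_ip = proxy.split('//')[1].split(':')[0]
    # httpbin may report several comma-separated addresses behind some proxies.
    return proxy_ip in response_json.get('origin', '')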
def time_watcher(start_time):
    """Print elapsed wall-clock time since *start_time*."""
    print('--- %s seconds ---' % (time.time() - start_time))


def get_data_from_file(name_file):
    """Read a text file next to this script and return its lines, stripped."""
    path_f = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(path_f, name_file)) as txt_file:
        return [line.rstrip() for line in txt_file]
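# Assumed input format for proxy_<protocol>.txt -- one proxy URL per line,
# e.g. http://201.131.164.150:3128 (the split in get_my_ip relies on the
# scheme://host:port shape). useragents.txt holds one User-Agent per line.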
def main():
    # Alternative IP-echo pages:
    # url = 'http://sitespy.ru/my-ip'
    # url = 'https://httpbin.org/ip'
    url = 'https://yandex.ru/internet/'
    protocol = 'https'
    useragents = get_data_from_file('useragents.txt')
    proxies = get_data_from_file(f'proxy_{protocol}.txt')
    ua = UserAgent()  # build once; re-instantiating per request is wasteful
    # Start at 17, presumably to skip proxies already checked on an earlier run.
    for i in range(17, len(proxies)):
        # Random delay between requests to avoid hammering the target page.
        time_await = random.uniform(2, 3)
        time.sleep(time_await)
        proxy = proxies[i]
        proxies_ = {
            "http": proxy,
            "https": proxy,
        }
        # headers = {'User-Agent': random.choice(useragents)}  # file-based alternative
        headers = {'User-Agent': ua.random}
        print(f'time_await: {time_await} request: {i} '
              + time.asctime(time.localtime(time.time())))
        try:
            response = get_html(url, headers, proxies_)
            get_my_ip(response.text, proxy, protocol)
        except (requests.RequestException, AttributeError, IndexError):
            # Dead proxy or unexpected page layout -- move on to the next one.
            continue
    # Direct (no-proxy) sanity check:
    # response = get_html(url)
    # get_my_ip(response.text, 'proxy://proxy:proxy', 'https')
if __name__ == '__main__':
    start_time = time.time()
    main()
    time_watcher(start_time)
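# To run (the file name is whatever you saved this script as):
#   python3 proxy_checker.py
# Working proxies accumulate in ip.txt next to the script.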