#!/usr/bin/python3
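"""Scrape a handful of public SOCKS5 proxy lists and print them as JSON.

Usage:
    scraper.py -f    fetch the lists from the network
    scraper.py       parse a previously saved file named `list` in the
                     current directory instead
"""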
import urllib.request
import sys
import re
import os
import json
import pprint
import getopt
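
# The proxyscrape API used below returns a plain-text body with one
# `ip:port` pair per line; parse that format directly.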
def convert_simple_list(html):
    result = []
    for line in html.split(b"\r\n"):
        st = re.search(rb"^([\d.]*):([\d]*)$", line)
        if st:
            try:
                addr = st.group(1).decode('utf-8')
                port = st.group(2).decode('utf-8')
                result.append({"addrport": (addr, port)})
            except IndexError:
                pass
            except UnicodeError as e:
                print(e, file=sys.stderr)
    # pprint.pp(result)
    return result
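
# Generic fallback: scan arbitrary HTML for anything shaped like
# `a.b.c.d:port`; used for the premproxy pages and the local `list` file.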
def find_addrports(html):
    result = []
    for line in html.split(b"\r\n"):
        for match in re.finditer(rb"(\d+\.\d+\.\d+\.\d+):([\d]*)", line):
            try:
                addr = match.group(1).decode('utf-8')
                port = match.group(2).decode('utf-8')
                result.append({"addrport": (addr, port)})
            except IndexError:
                pass
            except UnicodeError as e:
                print(e, file=sys.stderr)
    return result
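
# proxyscan.io serves an HTML fragment; pull the address out of one tag
# and the port out of the next tag with a non-greedy regex.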
def convert_proxyscan_io(html):
    result = []
    regex = re.compile(rb">(\d+\.\d+\.\d+\.\d+)<.*?>([\d]*)<",
                       re.MULTILINE | re.DOTALL)
    for match in re.finditer(regex, html):
        try:
            addr = match.group(1).decode('utf-8')
            port = match.group(2).decode('utf-8')
            result.append({"addrport": (addr, port)})
        except IndexError:
            pass
        except UnicodeError as e:
            print(e, file=sys.stderr)
    return result
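
# Fetch every upstream source, merge the results, and print them as JSON.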
def established_process1():
    results = []
    # premproxy.com paginates its United States SOCKS list across
    # pages 03-06.
    for page in ('03', '04', '05', '06'):
        url = 'https://premproxy.com/socks-by-country/United-States-%s.htm' % page
        with urllib.request.urlopen(url) as response:
            html = response.read()
        results += find_addrports(html)
    url = 'https://api.proxyscrape.com/?request=getproxies&proxytype=socks5&timeout=10000&country=all'
    with urllib.request.urlopen(url) as response:
        html = response.read()
    results += convert_simple_list(html)
    url = 'https://www.proxyscan.io/Home/FilterResult?status=1&ping=&selectedType=SOCKS5'
    with urllib.request.urlopen(url) as response:
        html = response.read()
    results += convert_proxyscan_io(html)
    print(json.dumps(results))

def usage():
    # Brief usage summary printed when argument parsing fails.
    print("usage: %s [-f]" % sys.argv[0], file=sys.stderr)


if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f")
    except getopt.GetoptError as err:
        print(err, file=sys.stderr)
        usage()
        sys.exit(2)
    doFetch = False
    for o, a in opts:
        if o == "-f":
            doFetch = True
    if doFetch:
        established_process1()
    else:
        # No -f: parse a saved copy named `list` in the current directory
        # and emit the parsed proxies as JSON, matching the fetch path.
        url = 'file://%s/list' % os.getcwd()
        with urllib.request.urlopen(url) as response:
            html = response.read()
        print(json.dumps(find_addrports(html)))