laingraph.py 4.4 KB

#!/usr/bin/env python3
import networkx as nx
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

"""
TODO:
> crawl for pages that contain the root TLD instead of the string 'webring' when looking for webring pages
> fix some of the ugliness
"""

root = 'https://strlst.myogaya.jp/misc/webring.html'

def extract_tld(url, dots=1):
    # assumes valid uri
    try:
        token = url.split('/')[2]
        res = token.split('.')[-1 - dots]
        for i in range(-dots, 0, 1):
            res = res + '.' + token.split('.')[i]
        return res
    except IndexError:
        return ''

def extract_any_domain(url):
    # keep every label of the host, e.g. 'strlst.myogaya.jp'
    token = url.split('/')[2]
    dots = token.count('.')
    return extract_tld(url, dots)
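
# Worked example for the two helpers above (illustrative, using the root URL
# defined at the top of this file):
#   extract_tld('https://strlst.myogaya.jp/misc/webring.html')        -> 'myogaya.jp'
#   extract_any_domain('https://strlst.myogaya.jp/misc/webring.html') -> 'strlst.myogaya.jp'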

def extract_hrefs(url, relative=False, same_tld=False):
    try:
        reqs = requests.get(url, timeout=10)
    except requests.exceptions.TooManyRedirects:
        print('too many redirects encountered for', url)
        return
    except requests.exceptions.RequestException:
        print('connection error for', url)
        return
    soup = BeautifulSoup(reqs.text, 'lxml')
    url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    url_tld = extract_tld(url)
    # href tag generator
    for href_tag in soup.find_all('a', href=True):
        # if the uri is not valid, it is relative
        res = href_tag['href']
        # skip empty anchors
        if not res:
            continue
        # either enable relative and construct a valid uri, or filter for valid uris
        if not url_regex.search(res):
            if relative:
                # construct a valid uri against the current page
                res = urljoin(url, res)
            else:
                continue
        # either ignore the tld or require the same root tld; always ignore onions
        if (not same_tld or url_tld in extract_tld(res)) and 'onion' not in res:
            yield res
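
# Typical use of the generator above (this is how bfs_search_crawl below
# consumes it):
#   same_site_links = list(extract_hrefs(root, relative=True, same_tld=True))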

def bfs_search_crawl(url, search='webring', case_sensitive=False):
    print('started BFS for', url)
    structure = []
    urls_found = set()
    structure.append(url)
    # crawl at most max_count pages per site; pages may be queued more than
    # once since no visited set is kept
    max_count = 10
    count = 0
    while count < max_count and structure:
        current_url = structure.pop(0)
        urls = list(extract_hrefs(current_url, relative=True, same_tld=True))
        print('{0} links found in {1}'.format(len(urls), current_url))
        for i in urls:
            structure.append(i)
            urls_found.add(i)
        count += 1
    for u in urls_found:
        haystack = u if case_sensitive else u.lower()
        if search in haystack:
            #print('found', u)
            return u
    return None
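
# A possible reading of the first TODO above, kept here as a sketch: instead of
# matching the string 'webring' in a URL, accept any crawled page that links
# back to the root's domain. This helper is hypothetical and not called from
# main() yet; it reuses extract_tld and extract_hrefs as defined above.
def links_back_to_root(url, root_url=root):
    root_tld = extract_tld(root_url)
    for href in extract_hrefs(url, relative=False, same_tld=False):
        if root_tld in extract_tld(href):
            return True
    return False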

def main():
    to_crawl = list()
    for i in extract_hrefs(root):
        print('processing href tag:', i)
        if not (extract_tld(root) in extract_tld(i)):
            to_crawl.append(i)
    print(to_crawl)
    print('that was all for sites to crawl lol')
    #print(bfs_search_crawl('https://strlst.myogaya.jp/'))
    #print(bfs_search_crawl('https://crawler-test.com/', search='foo1'))
    # for every external site, look for its webring page
    graph_sources = list()
    for page in to_crawl:
        print('processing page root:', page)
        webring_page = bfs_search_crawl(page)
        if webring_page:
            print(page, '| webring page found', webring_page)
            graph_sources.append((page, webring_page))
    print(graph_sources)
    # map every site to the links found on its webring page
    graph = dict()
    for page, webring_page in graph_sources:
        pages = list(extract_hrefs(webring_page, relative=True, same_tld=False))
        graph[page] = pages
    # too stoned to make proper use of algorithms
    G = nx.DiGraph()
    E = list()
    for key in graph.keys():
        for link in graph[key]:
            source = extract_any_domain(key)
            dest = extract_any_domain(link)
            #print(extract_any_domain(key), 'yields', extract_any_domain(link))
            E.append((source, dest))
    # remove duplicates: keep only one direction of reciprocal links
    F = list()
    for e in E:
        if e not in F and (e[1], e[0]) not in F:
            F.append(e)
    # add edges in networkx
    for f in F:
        print(f[0], '->', f[1])
        G.add_edge(f[0], f[1])
    options = {
        'with_labels': True,
        'node_color': 'black',
        'node_size': 50,
        'width': 3,
    }
    # draw graph
    nx.draw_shell(G, **options)
    plt.show()

if __name__ == '__main__':
    main()