#!/usr/bin/env python3
import re
import requests
import networkx as nx
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
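# third-party dependencies (all imported above): requests, beautifulsoup4
# (using the lxml parser), networkx and matplotlib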
- """
- TODO:
- > crawl for pages that contain root tld instead of string 'webring' for the finding of webring pages
- > fix some of the ugliness
- """

root = 'https://strlst.myogaya.jp/misc/webring.html'

def extract_tld(url, dots=1):
    """Return the last (dots + 1) labels of the host part of a URL."""
    # assumes a valid URI of the form scheme://host/...
    try:
        token = url.split('/')[2]
        labels = token.split('.')
        res = labels[-1 - dots]
        for i in range(-dots, 0):
            res = res + '.' + labels[i]
        return res
    except IndexError:
        return ''

def extract_any_domain(url):
    """Return the full host (all dot-separated labels) of a URL."""
    token = url.split('/')[2]
    dots = token.count('.')
    return extract_tld(url, dots)
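
# for example, with the root URL above:
#   extract_tld(root)        -> 'myogaya.jp'
#   extract_any_domain(root) -> 'strlst.myogaya.jp'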

def extract_hrefs(url, relative=False, same_tld=False):
    """Yield the target of every anchor tag found on a page."""
    try:
        reqs = requests.get(url, timeout=10)
    except requests.exceptions.TooManyRedirects:
        print('too many redirects encountered for', url)
        return
    except requests.exceptions.RequestException:
        print('connection error for', url)
        return
    soup = BeautifulSoup(reqs.text, 'lxml')
    url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    url_tld = extract_tld(url)
    # href tag generator
    for href_tag in soup.find_all('a', href=True):
        res = href_tag['href']
        # skip empty anchors
        if not res:
            continue
        # if the href is not a valid absolute URI, it is relative: either
        # construct a valid URI from it or filter it out
        if not url_regex.search(res):
            if relative:
                res = res[1:] if res[0] == '/' else res
                res = url + ('' if url[-1] == '/' else '/') + res
            else:
                continue
        # either ignore the TLD or require the same root TLD; always skip onions
        if (not same_tld or url_tld in extract_tld(res)) and 'onion' not in res:
            yield res
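
# a minimal usage sketch (assumes the page is reachable; on a request error
# the generator simply yields nothing):
#
#   for href in extract_hrefs(root, relative=True, same_tld=True):
#       print(href)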

def bfs_search_crawl(url, search='webring', case_sensitive=False):
    """Crawl breadth-first from url; return the first URL containing search."""
    print('started BFS for', url)
    queue = [url]
    urls_found = set()
    # visit at most max_count pages per site
    max_count = 10
    count = 0
    while count < max_count and queue:
        current_url = queue.pop(0)
        urls = list(extract_hrefs(current_url, relative=True, same_tld=True))
        print('{0} links found in {1}'.format(len(urls), current_url))
        for u in urls:
            queue.append(u)
            urls_found.add(u)
        count += 1
    needle = search if case_sensitive else search.lower()
    for u in urls_found:
        if needle in (u if case_sensitive else u.lower()):
            return u
    return None

def main():
    # collect all external sites linked from the root webring page
    to_crawl = []
    for i in extract_hrefs(root):
        print('processing href tag:', i)
        if extract_tld(root) not in extract_tld(i):
            to_crawl.append(i)
    print(to_crawl)
    print('that was all for sites to crawl lol')
    #print(bfs_search_crawl('https://strlst.myogaya.jp/'))
    #print(bfs_search_crawl('https://crawler-test.com/', search='foo1'))

    # for each external site, try to locate its webring page
    graph_sources = []
    for page in to_crawl:
        print('processing page root:', page)
        webring_page = bfs_search_crawl(page)
        if webring_page:
            print(page, '| webring page found', webring_page)
            graph_sources.append((page, webring_page))
    print(graph_sources)

    # map each site to the links found on its webring page
    graph = {}
    for page, webring_page in graph_sources:
        pages = list(extract_hrefs(webring_page, relative=True, same_tld=False))
        graph[page] = pages
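
    # at this point graph maps each crawled site to the outgoing links of its
    # webring page, e.g. (hypothetical values):
    #   {'https://site-a.example/': ['https://site-b.example/', ...], ...}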

    # too stoned to make proper use of algorithms
    G = nx.DiGraph()
    E = []
    for key in graph:
        for link in graph[key]:
            source = extract_any_domain(key)
            dest = extract_any_domain(link)
            E.append((source, dest))
    # drop exact duplicates and reciprocal edges, keeping the first of each pair
    F = []
    seen = set()
    for e in E:
        if e in seen or (e[1], e[0]) in seen:
            continue
        seen.add(e)
        F.append(e)
    # add edges in networkx
    for f in F:
        print(f[0], '->', f[1])
        G.add_edge(f[0], f[1])
    options = {
        'with_labels': True,
        'node_color': 'black',
        'node_size': 50,
        'width': 3,
    }
    # draw graph
    nx.draw_shell(G, **options)
    plt.show()

if __name__ == '__main__':
    main()