laingraph.py 4.4 KB

#!/usr/bin/env python3
import networkx as nx
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

"""
TODO:
> crawl for pages that contain the root TLD instead of the string 'webring' when looking for webring pages
> fix some of the ugliness
"""

root = 'https://strlst.myogaya.jp/misc/webring.html'

def extract_tld(url, dots=1):
    # assumes valid uri
    try:
        token = url.split('/')[2]
        res = token.split('.')[-1 - dots]
        for i in range(-dots, 0, 1):
            res = res + '.' + token.split('.')[i]
        return res
    except IndexError:
        return ''

def extract_any_domain(url):
    # keep every label of the host, e.g. 'strlst.myogaya.jp'
    token = url.split('/')[2]
    dots = token.count('.')
    return extract_tld(url, dots)
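
# Worked example for the two helpers above (illustrative, using the root URL
# defined at the top of this file):
#   extract_tld('https://strlst.myogaya.jp/misc/webring.html')        -> 'myogaya.jp'
#   extract_any_domain('https://strlst.myogaya.jp/misc/webring.html') -> 'strlst.myogaya.jp'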

def extract_hrefs(url, relative=False, same_tld=False):
    try:
        reqs = requests.get(url, timeout=10)
    except requests.exceptions.TooManyRedirects:
        print('too many redirects encountered for', url)
        return
    except requests.exceptions.RequestException:
        print('connection error for', url)
        return
    soup = BeautifulSoup(reqs.text, 'lxml')
    url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    url_tld = extract_tld(url)
    # href tag generator
    for href_tag in soup.find_all('a', href=True):
        # if the uri is not valid, it is relative
        res = href_tag['href']
        # skip empty anchors
        if not res:
            continue
        # either enable relative and construct a valid uri, or filter for valid uris
        if not url_regex.search(res):
            if relative:
                # construct a valid uri against the current page
                res = urljoin(url, res)
            else:
                continue
        # either ignore the tld or require the same root tld; always ignore onions
        if (not same_tld or url_tld in extract_tld(res)) and 'onion' not in res:
            yield res
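
# Typical use of the generator above (this is how bfs_search_crawl below
# consumes it):
#   same_site_links = list(extract_hrefs(root, relative=True, same_tld=True))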

def bfs_search_crawl(url, search='webring', case_sensitive=False):
    print('started BFS for', url)
    structure = []
    urls_found = set()
    structure.append(url)
    # crawl at most max_count pages per site; pages may be queued more than
    # once since no visited set is kept
    max_count = 10
    count = 0
    while count < max_count and structure:
        current_url = structure.pop(0)
        urls = list(extract_hrefs(current_url, relative=True, same_tld=True))
        print('{0} links found in {1}'.format(len(urls), current_url))
        for i in urls:
            structure.append(i)
            urls_found.add(i)
        count += 1
    for u in urls_found:
        haystack = u if case_sensitive else u.lower()
        if search in haystack:
            #print('found', u)
            return u
    return None
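
# A possible reading of the first TODO above, kept here as a sketch: instead of
# matching the string 'webring' in a URL, accept any crawled page that links
# back to the root's domain. This helper is hypothetical and not called from
# main() yet; it reuses extract_tld and extract_hrefs as defined above.
def links_back_to_root(url, root_url=root):
    root_tld = extract_tld(root_url)
    for href in extract_hrefs(url, relative=False, same_tld=False):
        if root_tld in extract_tld(href):
            return True
    return False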

def main():
    to_crawl = list()
    for i in extract_hrefs(root):
        print('processing href tag:', i)
        if not (extract_tld(root) in extract_tld(i)):
            to_crawl.append(i)
    print(to_crawl)
    print('that was all for sites to crawl lol')
    #print(bfs_search_crawl('https://strlst.myogaya.jp/'))
    #print(bfs_search_crawl('https://crawler-test.com/', search='foo1'))
    # for every external site, look for its webring page
    graph_sources = list()
    for page in to_crawl:
        print('processing page root:', page)
        webring_page = bfs_search_crawl(page)
        if webring_page:
            print(page, '| webring page found', webring_page)
            graph_sources.append((page, webring_page))
    print(graph_sources)
    # map every site to the links found on its webring page
    graph = dict()
    for page, webring_page in graph_sources:
        pages = list(extract_hrefs(webring_page, relative=True, same_tld=False))
        graph[page] = pages
    # too stoned to make proper use of algorithms
    G = nx.DiGraph()
    E = list()
    for key in graph.keys():
        for link in graph[key]:
            source = extract_any_domain(key)
            dest = extract_any_domain(link)
            #print(extract_any_domain(key), 'yields', extract_any_domain(link))
            E.append((source, dest))
    # remove duplicates: keep only one direction of reciprocal links
    F = list()
    for e in E:
        if e not in F and (e[1], e[0]) not in F:
            F.append(e)
    # add edges in networkx
    for f in F:
        print(f[0], '->', f[1])
        G.add_edge(f[0], f[1])
    options = {
        'with_labels': True,
        'node_color': 'black',
        'node_size': 50,
        'width': 3,
    }
    # draw graph
    nx.draw_shell(G, **options)
    plt.show()

if __name__ == '__main__':
    main()