# Naive Bayes author attribution (Victorian author data set).
- import random
- import numpy as np
def get_codes(data):
    """Return the set of distinct author codes in the data.

    Arguments:
        data -- list of (text, author_code) tuples
    Returns:
        set of distinct author codes
    """
    return {pair[1] for pair in data}
def traintestsplit(data, testnum, seed=None):
    """Randomly split (text, code) pairs into train and test sets.

    Arguments:
        data    -- list of (text, author_code) tuples
        testnum -- int, number of pairs to hold out for testing
        seed    -- optional int, seeds the RNG for a reproducible split
    Returns:
        (trainpairs, testpairs) -- train pairs first, then the testnum
        held-out test pairs.
    """
    if seed is not None:
        random.seed(seed)
    # BUG FIX: shuffle a copy instead of the caller's list -- the original
    # reordered the input in place as a side effect.
    shuffled = list(data)
    random.shuffle(shuffled)
    return shuffled[testnum:], shuffled[:testnum]
def getcatlogprobs(train_data):
    """Compute the log prior probability of each author code.

    log P(author) = log(texts by author / total texts)

    Arguments:
        train_data -- list of (text, author_code) tuples
    Returns:
        dict mapping author code -> log prior probability
    """
    author_counts = {}
    for _text, author in train_data:
        author_counts[author] = author_counts.get(author, 0) + 1
    total = len(train_data)
    # Every author in author_counts appeared at least once, so the
    # probability is strictly positive and np.log is always defined
    # (the original's `if probability != 0` guard was dead code).
    return {author: np.log(count / total)
            for author, count in author_counts.items()}
def getwords(listofpairs):
    """Count how often each author uses each word.

    Arguments:
        listofpairs -- list of (text, author_code) tuples
    Returns:
        dict mapping author code -> {word: count}, e.g. {'1': {'the': 27}}
    """
    counts = {}
    for text, code in listofpairs:
        # One word-count dict per author, created on first sight.
        per_author = counts.setdefault(code, {})
        for word in text.split():
            per_author[word] = per_author.get(word, 0) + 1
    return counts
def get_vocab(listofpairs):
    """Return the set of all whitespace-separated words across all texts."""
    return {word for text, _code in listofpairs for word in text.split()}
def addone(cnts, vocab):
    """Apply add-one (Laplace) smoothing to per-author word counts.

    Arguments:
        cnts  -- dict mapping author code -> {word: count}
        vocab -- iterable of every word in the corpus
    Returns:
        the same dict, mutated in place: every vocabulary word gets its
        count incremented by one, so unseen words end at 1 and no word
        has zero probability.
    """
    for word_counts in cnts.values():
        for word in vocab:
            word_counts[word] = word_counts.get(word, 0) + 1
    return cnts
def logprobwords(cnts):
    """Convert per-author word counts to log probabilities, in place.

    log P(word | author) = log(count of word by author / total words by author)

    Arguments:
        cnts -- dict mapping author code -> {word: count}
    Returns:
        the same dict, mutated so each count becomes its log probability.
    """
    for word_counts in cnts.values():
        total = sum(word_counts.values())
        for word, count in word_counts.items():
            word_counts[word] = np.log(count / total)
    return cnts
- def testone(catlogs,catwordlogs,pair):
- #arguments
- # catlogs dict the log probability of each author (how often they show up in stated data set)
- # catwordlogs dict thge log probability of each term for each author
- # words mapped to log probabilities
- # pair tuple a single text,code tuple
- #returns
- # y int the correct category (the true author)
- # yhat int the predicted category (the predicted author)
- text, y = pair # sets the correct category as y and the text as text
-
- words = text.split()
- scores = {}
- # Initialize dict of scores: highest score will be predicted author/category
- for code in catlogs:
- scores[code] = catlogs[code] # default log probability to the prominence of a certain category in data set
-
- for word in words:
- scores[code] += catwordlogs[code][word]
-
- yhat = max(scores, key=scores.get) # review syntax for .get()
-
- return y, yhat
def bayes(data, trainsplit=100):
    """Train a naive-Bayes author classifier and evaluate it.

    Arguments:
        data       -- list of (text, author_code) tuples
        trainsplit -- int, number of pairs to hold out for testing
    Returns:
        (correct, total) -- correct predictions on the held-out test
        pairs and the size of the test set.
    """
    # BUG FIX: traintestsplit returns (train, test); the original
    # unpacked them swapped, training on the 100 held-out pairs and then
    # evaluating on those same pairs -- i.e. it reported training
    # accuracy, not test accuracy.
    train_data, test_data = traintestsplit(data, trainsplit)
    # Vocabulary comes from the FULL data set so that after smoothing
    # every test word exists in the model (no KeyError in testone).
    vocab = get_vocab(data)
    catlogs = getcatlogprobs(train_data)
    catwordlogs = getwords(train_data)
    # Sanity check: addone must increase an existing count by exactly one.
    rand_author = random.choice(list(catwordlogs.keys()))
    rand_word = random.choice(list(catwordlogs[rand_author].keys()))
    before_val = catwordlogs[rand_author][rand_word]
    catwordlogs = addone(catwordlogs, vocab)
    if catwordlogs[rand_author][rand_word] != before_val + 1:
        print("error")
    catwordlogs = logprobwords(catwordlogs)
    correct = 0
    total = len(test_data)
    for pair in test_data:
        actual, predicted = testone(catlogs, catwordlogs, pair)
        if actual == predicted:
            correct += 1
    return correct, total
def get_data(file_number):
    """Load a CSV data file and return it as a list of (text, code) tuples.

    Arguments:
        file_number -- int (or int-like string) selecting a file from the
                       data_files mapping
    Returns:
        list of (text, author_code) tuples for known file numbers;
        implicitly None for unmapped numbers (preserved behavior).
    Raises:
        KeyError for file numbers not present in data_files.
    """
    file_number = int(file_number)
    data_files = {1: "Gungor_2018_VictorianAuthorAttribution_data-train.csv"}
    data_dir = 'data/'  # renamed from `dir`, which shadowed the builtin
    path = data_dir + data_files[file_number]
    print(f"opening: {path}")
    # Context manager guarantees the file is closed even if reading fails
    # (the original used open/read/close with no error handling).
    with open(path, 'r', encoding='ISO-8859-1') as f:
        contents = f.read()
    # Drop the header row and the empty string after the final newline.
    lines = contents.split('\n')[1:-1]
    if file_number == 1:
        # NOTE(review): splits each row on every comma; assumes the text
        # column itself contains no commas -- TODO confirm for this CSV.
        return [tuple(line.split(',')) for line in lines]
def main():
    """Interactive entry point: pick a data file, then run the classifier."""
    print("0: Test , 1: Gungor_2018_VictorianAuthorAttribution_data-train.csv")
    file_number = input("Select File Number:")
    print("Enter number for trainsplit (recommended value is 100):")
    training_num = int(input("Enter Number:"))
    if int(file_number) != 0:
        print(bayes(get_data(file_number), training_num))
    else:
        # NOTE(review): main() is invoked below, before test_bayes is
        # defined further down the file, so selecting 0 raises NameError
        # at runtime -- move this call to the end of the file to fix.
        test_bayes(get_data(1))


# Guarded so importing this module no longer triggers the interactive
# prompts as a side effect.
if __name__ == "__main__":
    main()
- # Some tests provided for the data set Gungor_2018_VictorianAuthorAttribution_data-train.csv
def test_bayes(data, test_dopairs=True):
    # Regression checks with hard-coded expected values for the
    # Gungor_2018_VictorianAuthorAttribution_data-train.csv data set;
    # each check prints True on success, False on failure.
    # arguments
    #   data          list of (text, author_code) tuples
    #   test_dopairs  bool; when True, also run the end-to-end accuracy check
    res = data
    print(len(res) == 53678)   # expected number of rows in this corpus
    print(res[3][1] == '1')
    codes = get_codes(data)
    print(len(codes) == 45)    # expected number of distinct authors
    print('2' in codes)
    #random.seed(1234)
    # Fixed seed so the split (and all values below) is reproducible.
    trainps,testps = traintestsplit(data,100,1234)
    # NOTE(review): vocabulary is built from the TRAINING pairs only;
    # dopairs below assumes every test word appears in it -- confirm.
    vocab = get_vocab(trainps)
    print(len(trainps) + len(testps) == len(res))
    print(len(testps) == 100)
    lps = getcatlogprobs(trainps)
    print(len(lps) == 45)
    # Log priors for two specific authors, within tolerance.
    print(np.abs(lps['3'] + 5.5276) < .1)
    print(np.abs(lps['40'] + 4.82743) < .1)
    counts = getwords(trainps)
    # Spot-check vocabulary size and individual word counts.
    print(np.abs(len(counts['8']) - 9994) < 2)
    print(np.abs(counts['9']['there'] - 3259) < 2)
    print(np.abs(counts['19']['apple'] - 42) < 2)
    initialcount = counts['3']['the']
    counts = addone(counts, vocab)
    # Add-one smoothing must raise an existing count by exactly one.
    print(counts['3']['the'] == initialcount + 1)
    print(np.abs(counts['19']['apple'] - 43) < 2)
    counts = logprobwords(counts)
    # Spot-check two smoothed log probabilities.
    print(np.abs(counts['19']['apple'] + 10.494) < .1)
    print(np.abs(counts['41']['cats'] + 13.733) < .1)
    def dopairs(catlogs,catwordlogs,manypairs):
        # Classify every pair in manypairs; return (number correct, total).
        correct = 0
        total = len(manypairs)

        for pair in manypairs:
            actual, predicted = testone(catlogs, catwordlogs, pair)
            if actual == predicted:
                correct += 1
        return correct, total
    # Expected held-out accuracy with seed 1234: 81 of 100 correct.
    if test_dopairs==True: print(dopairs(lps,counts,testps) == (81,100))
    print("End")
|