# Naive Bayes author attribution (Victorian author data set).
- import random
- import numpy as np
def get_codes(data):
    """Return the set of distinct author codes in the data.

    Arguments:
        data -- list of (text, author_code) tuples
    Returns:
        set of distinct author codes
    """
    return {pair[1] for pair in data}
def traintestsplit(data, testnum, seed=None):
    """Randomly split (text, code) pairs into train and test sets.

    Arguments:
        data    -- list of (text, author_code) tuples
        testnum -- int, number of pairs to hold out for testing
        seed    -- optional int, seeds the RNG for a reproducible split
    Returns:
        (trainpairs, testpairs) -- train pairs first, then the testnum
        held-out test pairs.
    """
    if seed is not None:
        random.seed(seed)
    # BUG FIX: shuffle a copy instead of the caller's list -- the original
    # reordered the input in place as a side effect.
    shuffled = list(data)
    random.shuffle(shuffled)
    return shuffled[testnum:], shuffled[:testnum]
def getcatlogprobs(train_data):
    """Compute the log prior probability of each author code.

    log P(author) = log(texts by author / total texts)

    Arguments:
        train_data -- list of (text, author_code) tuples
    Returns:
        dict mapping author code -> log prior probability
    """
    author_counts = {}
    for _text, author in train_data:
        author_counts[author] = author_counts.get(author, 0) + 1
    total = len(train_data)
    # Every author in author_counts appeared at least once, so the
    # probability is strictly positive and np.log is always defined
    # (the original's `if probability != 0` guard was dead code).
    return {author: np.log(count / total)
            for author, count in author_counts.items()}
def getwords(listofpairs):
    """Count how often each author uses each word.

    Arguments:
        listofpairs -- list of (text, author_code) tuples
    Returns:
        dict mapping author code -> {word: count}, e.g. {'1': {'the': 27}}
    """
    counts = {}
    for text, code in listofpairs:
        # One word-count dict per author, created on first sight.
        per_author = counts.setdefault(code, {})
        for word in text.split():
            per_author[word] = per_author.get(word, 0) + 1
    return counts
def get_vocab(listofpairs):
    """Return the set of all whitespace-separated words across all texts."""
    return {word for text, _code in listofpairs for word in text.split()}
def addone(cnts, vocab):
    """Apply add-one (Laplace) smoothing to per-author word counts.

    Arguments:
        cnts  -- dict mapping author code -> {word: count}
        vocab -- iterable of every word in the corpus
    Returns:
        the same dict, mutated in place: every vocabulary word gets its
        count incremented by one, so unseen words end at 1 and no word
        has zero probability.
    """
    for word_counts in cnts.values():
        for word in vocab:
            word_counts[word] = word_counts.get(word, 0) + 1
    return cnts
def logprobwords(cnts):
    """Convert per-author word counts to log probabilities, in place.

    log P(word | author) = log(count of word by author / total words by author)

    Arguments:
        cnts -- dict mapping author code -> {word: count}
    Returns:
        the same dict, mutated so each count becomes its log probability.
    """
    for word_counts in cnts.values():
        total = sum(word_counts.values())
        for word, count in word_counts.items():
            word_counts[word] = np.log(count / total)
    return cnts
- def testone(catlogs,catwordlogs,pair):
- #arguments
- # catlogs dict the log probability of each author (how often they show up in stated data set)
- # catwordlogs dict thge log probability of each term for each author
- # words mapped to log probabilities
- # pair tuple a single text,code tuple
- #returns
- # y int the correct category (the true author)
- # yhat int the predicted category (the predicted author)
- text, y = pair # sets the correct category as y and the text as text
-
- words = text.split()
- scores = {}
- # Initialize dict of scores: highest score will be predicted author/category
- for code in catlogs:
- scores[code] = catlogs[code] # default log probability to the prominence of a certain category in data set
-
- for word in words:
- scores[code] += catwordlogs[code][word]
-
- yhat = max(scores, key=scores.get) # review syntax for .get()
-
- return y, yhat
def bayes(data, trainsplit=100):
    """Train a naive-Bayes author classifier and evaluate it.

    Arguments:
        data       -- list of (text, author_code) tuples
        trainsplit -- int, number of pairs to hold out for testing
    Returns:
        (correct, total) -- correct predictions on the held-out test
        pairs and the size of the test set.
    """
    # BUG FIX: traintestsplit returns (train, test); the original
    # unpacked them swapped, training on the 100 held-out pairs and then
    # evaluating on those same pairs -- i.e. it reported training
    # accuracy, not test accuracy.
    train_data, test_data = traintestsplit(data, trainsplit)
    # Vocabulary comes from the FULL data set so that after smoothing
    # every test word exists in the model (no KeyError in testone).
    vocab = get_vocab(data)
    catlogs = getcatlogprobs(train_data)
    catwordlogs = getwords(train_data)
    # Sanity check: addone must increase an existing count by exactly one.
    rand_author = random.choice(list(catwordlogs.keys()))
    rand_word = random.choice(list(catwordlogs[rand_author].keys()))
    before_val = catwordlogs[rand_author][rand_word]
    catwordlogs = addone(catwordlogs, vocab)
    if catwordlogs[rand_author][rand_word] != before_val + 1:
        print("error")
    catwordlogs = logprobwords(catwordlogs)
    correct = 0
    total = len(test_data)
    for pair in test_data:
        actual, predicted = testone(catlogs, catwordlogs, pair)
        if actual == predicted:
            correct += 1
    return correct, total
def get_data(file_number):
    """Load a CSV data file and return it as a list of (text, code) tuples.

    Arguments:
        file_number -- int (or int-like string) selecting a file from the
                       data_files mapping
    Returns:
        list of (text, author_code) tuples for known file numbers;
        implicitly None for unmapped numbers (preserved behavior).
    Raises:
        KeyError for file numbers not present in data_files.
    """
    file_number = int(file_number)
    data_files = {1: "Gungor_2018_VictorianAuthorAttribution_data-train.csv"}
    data_dir = 'data/'  # renamed from `dir`, which shadowed the builtin
    path = data_dir + data_files[file_number]
    print(f"opening: {path}")
    # Context manager guarantees the file is closed even if reading fails
    # (the original used open/read/close with no error handling).
    with open(path, 'r', encoding='ISO-8859-1') as f:
        contents = f.read()
    # Drop the header row and the empty string after the final newline.
    lines = contents.split('\n')[1:-1]
    if file_number == 1:
        # NOTE(review): splits each row on every comma; assumes the text
        # column itself contains no commas -- TODO confirm for this CSV.
        return [tuple(line.split(',')) for line in lines]
def main():
    """Interactive entry point: pick a data file, then run the classifier."""
    print("0: Test , 1: Gungor_2018_VictorianAuthorAttribution_data-train.csv")
    file_number = input("Select File Number:")
    print("Enter number for trainsplit (recommended value is 100):")
    training_num = int(input("Enter Number:"))
    if int(file_number) != 0:
        print(bayes(get_data(file_number), training_num))
    else:
        # NOTE(review): main() is invoked below, before test_bayes is
        # defined further down the file, so selecting 0 raises NameError
        # at runtime -- move this call to the end of the file to fix.
        test_bayes(get_data(1))


# Guarded so importing this module no longer triggers the interactive
# prompts as a side effect.
if __name__ == "__main__":
    main()
- # Some tests provided for the data set Gungor_2018_VictorianAuthorAttribution_data-train.csv
def test_bayes(data, test_dopairs=True):
    # Regression checks with hard-coded expected values for the
    # Gungor_2018_VictorianAuthorAttribution_data-train.csv data set;
    # each check prints True on success, False on failure.
    # arguments
    #   data          list of (text, author_code) tuples
    #   test_dopairs  bool; when True, also run the end-to-end accuracy check
    res = data
    print(len(res) == 53678)   # expected number of rows in this corpus
    print(res[3][1] == '1')
    codes = get_codes(data)
    print(len(codes) == 45)    # expected number of distinct authors
    print('2' in codes)
    #random.seed(1234)
    # Fixed seed so the split (and all values below) is reproducible.
    trainps,testps = traintestsplit(data,100,1234)
    # NOTE(review): vocabulary is built from the TRAINING pairs only;
    # dopairs below assumes every test word appears in it -- confirm.
    vocab = get_vocab(trainps)
    print(len(trainps) + len(testps) == len(res))
    print(len(testps) == 100)
    lps = getcatlogprobs(trainps)
    print(len(lps) == 45)
    # Log priors for two specific authors, within tolerance.
    print(np.abs(lps['3'] + 5.5276) < .1)
    print(np.abs(lps['40'] + 4.82743) < .1)
    counts = getwords(trainps)
    # Spot-check vocabulary size and individual word counts.
    print(np.abs(len(counts['8']) - 9994) < 2)
    print(np.abs(counts['9']['there'] - 3259) < 2)
    print(np.abs(counts['19']['apple'] - 42) < 2)
    initialcount = counts['3']['the']
    counts = addone(counts, vocab)
    # Add-one smoothing must raise an existing count by exactly one.
    print(counts['3']['the'] == initialcount + 1)
    print(np.abs(counts['19']['apple'] - 43) < 2)
    counts = logprobwords(counts)
    # Spot-check two smoothed log probabilities.
    print(np.abs(counts['19']['apple'] + 10.494) < .1)
    print(np.abs(counts['41']['cats'] + 13.733) < .1)
    def dopairs(catlogs,catwordlogs,manypairs):
        # Classify every pair in manypairs; return (number correct, total).
        correct = 0
        total = len(manypairs)

        for pair in manypairs:
            actual, predicted = testone(catlogs, catwordlogs, pair)
            if actual == predicted:
                correct += 1
        return correct, total
    # Expected held-out accuracy with seed 1234: 81 of 100 correct.
    if test_dopairs==True: print(dopairs(lps,counts,testps) == (81,100))
    print("End")
|