123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- #!/usr/bin/python3
- """
- This is the code to accompany the Lesson 2 (SVM) mini-project.
- Use a SVM to identify emails from the Enron corpus by their authors:
- Sara has label 0
- Chris has label 1
- """
-
- import sys
- from time import time
- sys.path.append("../tools/")
- from email_preprocess import preprocess
- import numpy as np
- from sklearn.svm import SVC
- ### features_train and features_test are the features for the training
- ### and testing datasets, respectively
- ### labels_train and labels_test are the corresponding item labels
- # features_train, features_test, labels_train, labels_test = preprocess()
- #########################################################
- ### your code goes here ###
- # do tradeoff by default because it takes less time
def run_svc(features_train, features_test,
            labels_train, labels_test,
            do_tradeoff=True,
            k="linear"):
    """Fit an SVC with kernel ``k`` and return its score on the test data.

    When ``do_tradeoff`` is true, only the first hundredth of the training
    samples (and of their labels) is used for fitting, which shortens the
    training time at the cost of accuracy.

    Returns the value of ``SVC.score`` for ``features_test`` and
    ``labels_test``.
    """
    print("using kernel ", k)
    model = SVC(kernel=k)

    train_x, train_y = features_train, labels_train
    if do_tradeoff:
        # Keep only the leading 1% of the training data.
        train_x = train_x[:len(train_x) // 100]
        train_y = train_y[:len(train_y) // 100]

    model.fit(train_x, train_y)
    return model.score(features_test, labels_test)
- # linear- no tradeoff:
- # accuracy = 0.9840728100113766
- # linear - with tradeoff:
- # accuracy = 0.8845278725824801
- # rbf - with tradeoff
- # accuracy = 0.8953356086461889
- # rbf - without tradeoff
- # accuracy = 0.9926052332195677
- # -------------------------------+
def test_rbf(features_train, features_test,
             labels_train, labels_test):
    """Print the test score of an rbf-kernel SVC for several values of C.

    Each classifier is fitted on the first hundredth of the training data
    only. For every candidate C the value is printed first, then the
    classifier is fitted, and finally its score on the test data is printed.
    """
    # Work on a 1% subset of the training data.
    train_x = features_train[:len(features_train) // 100]
    train_y = labels_train[:len(labels_train) // 100]

    for penalty in (1.0, 10.0, 100.0, 10000.0):
        model = SVC(kernel="rbf", C=penalty)
        print("testval = C = ", penalty)
        model.fit(train_x, train_y)
        print("accuracy = ", model.score(features_test, labels_test))
- # testval = C = 1.0
- # accuracy = 0.8953356086461889
- # testval = C = 10.0
- # accuracy = 0.8998862343572241
- # testval = C = 100.0
- # accuracy = 0.8998862343572241
- # testval = C = 10000.0
- # accuracy = 0.8998862343572241
- # -------------------------------+
def test_optimized_svm(
        features_train, features_test,
        labels_train, labels_test):
    """Fit a linear-kernel SVC with C=10.0 on all training data and score it.

    Returns the value of ``SVC.score`` for ``features_test`` and
    ``labels_test``.

    NOTE(review): the C sweep in ``test_rbf`` uses the rbf kernel, while this
    function uses a linear kernel -- confirm that C=10.0 is the intended
    setting here.
    """
    # ``fit`` returns the fitted estimator, so the calls can be chained.
    fitted = SVC(kernel="linear", C=10.0).fit(features_train, labels_train)
    return fitted.score(features_test, labels_test)
def extract_predictions(
        features_train, features_test,
        labels_train, labels_test):
    """Print the predicted author for test e-mails 10, 26 and 50.

    An rbf-kernel SVC with C=10000.0 is fitted on the first hundredth of the
    training data. A truthy predicted label is reported as "Chris", a falsy
    one as "Sara".
    """
    train_x = features_train[:len(features_train) // 100]
    train_y = labels_train[:len(labels_train) // 100]

    model = SVC(kernel="rbf", C=10000.0)
    model.fit(train_x, train_y)
    predicted = model.predict(features_test)

    for index in (10, 26, 50):
        label = predicted[index]
        if label:
            author = "Chris"
        else:
            author = "Sara"
        print("answer(%i) = %i (%s)" % (index, label, author))
def how_many(features_train, features_test,
             labels_train, labels_test):
    """Print how often each label is predicted for the test data.

    An rbf-kernel SVC with C=10000.0 is fitted on the complete training set;
    the distinct predicted labels and their counts are then printed.
    ``labels_test`` is accepted for signature parity but not used.
    """
    model = SVC(kernel="rbf", C=10000.0)
    model.fit(features_train, labels_train)
    predicted = model.predict(features_test)

    # np.unique with return_counts=True yields (distinct values, counts).
    tally = np.unique(predicted, return_counts=True)
    labels_seen, counts = tally
    print("(unique, frequency) = ", labels_seen, counts)
-
- # with full training set
- # (unique, frequency) = [0 1] [892 866]
- # with partial training set
- # (unique, frequency) = [0 1] [892 866]
if __name__ == "__main__":
    # Earlier exercises, currently disabled; re-enable one at a time.
    #
    # Plain SVC run:
    #   accuracy = run_svc(*preprocess(), do_tradeoff=False, k="rbf")
    #   print("accuracy = ", accuracy)
    # Test for the optimal C parameter:
    #   test_rbf(*preprocess())
    # Use the optimized C parameter on the linear model:
    #   accuracy = test_optimized_svm(*preprocess())
    #   print("accuracy = ", accuracy)
    # Predict the author of selected e-mails:
    #   extract_predictions(*preprocess())

    # Active exercise: print the count of each predicted label.
    email_data = preprocess()
    how_many(*email_data)
- #########################################################
- #########################################################
- '''
- You'll be Provided similar code in the Quiz
- But the Code provided in Quiz has an Indexing issue
- The Code Below solves that issue, So use this one
- '''
- # features_train = features_train[:int(len(features_train)/100)]
- # labels_train = labels_train[:int(len(labels_train)/100)]
- # ^ there isn't actually an issue or error
- # this is for the second quiz,
- # which asks to demonstrate an accuracy-speed tradeoff
- #########################################################
|