#!/usr/bin/python3

"""
This is the code to accompany the Lesson 2 (SVM) mini-project.
Use an SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

import numpy as np
from sklearn.svm import SVC

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
# features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# Apply the 1% training-set tradeoff by default because it takes far
# less time to train.
def run_svc(features_train, features_test,
            labels_train, labels_test,
            do_tradeoff=True,
            k="linear"):
    print("using kernel ", k)
    classifier = SVC(kernel=k)
    if do_tradeoff:
        # trade some accuracy for speed: train on 1% of the data
        features_train = features_train[:int(len(features_train)/100)]
        labels_train = labels_train[:int(len(labels_train)/100)]
    classifier.fit(features_train, labels_train)
    # pred = classifier.predict(features_test)
    return classifier.score(features_test, labels_test)

# linear - no tradeoff:
# accuracy = 0.9840728100113766
# linear - with tradeoff:
# accuracy = 0.8845278725824801
# rbf - with tradeoff:
# accuracy = 0.8953356086461889
# rbf - without tradeoff:
# accuracy = 0.9926052332195677
# -------------------------------+
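
# A minimal sketch (not part of the original assignment) showing how to
# measure the speed side of the tradeoff with the `time` import above;
# the slicing mirrors run_svc, and the printed timing is illustrative.
def time_fit(features_train, labels_train, k="linear", do_tradeoff=True):
    classifier = SVC(kernel=k)
    if do_tradeoff:
        # same 1% subset as run_svc
        features_train = features_train[:int(len(features_train)/100)]
        labels_train = labels_train[:int(len(labels_train)/100)]
    t0 = time()
    classifier.fit(features_train, labels_train)
    print("training time: %0.3fs" % (time() - t0))
    return classifier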

def test_rbf(features_train, features_test,
             labels_train, labels_test):

    def fit_and_score(features_train, features_test,
                      labels_train, labels_test,
                      testval=1.0):
        classifier = SVC(kernel="rbf", C=testval)
        print("testval = C = ", testval)
        classifier.fit(features_train, labels_train)
        return classifier.score(features_test, labels_test)

    # use a 1% subset of the full training data
    features_train = features_train[:int(len(features_train)/100)]
    labels_train = labels_train[:int(len(labels_train)/100)]
    test_values = [1.0, 10.0, 100.0, 10000.0]
    for val in test_values:
        acc = fit_and_score(
            features_train, features_test,
            labels_train, labels_test,
            val)
        print("accuracy = ", acc)

# testval = C = 1.0
# accuracy = 0.8953356086461889
# testval = C = 10.0
# accuracy = 0.8998862343572241
# testval = C = 100.0
# accuracy = 0.8998862343572241
# testval = C = 10000.0
# accuracy = 0.8998862343572241
# -------------------------------+
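
# An alternative sketch of the same C search using scikit-learn's
# GridSearchCV instead of the manual loop above; the parameter grid
# matches test_values, while cv=3 is an assumed, arbitrary choice.
def tune_c_with_gridsearch(features_train, labels_train):
    from sklearn.model_selection import GridSearchCV
    param_grid = {"C": [1.0, 10.0, 100.0, 10000.0]}
    search = GridSearchCV(SVC(kernel="rbf"), param_grid, cv=3)
    search.fit(features_train, labels_train)
    print("best C = ", search.best_params_["C"])
    return search.best_estimator_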

def test_optimized_svm(
        features_train, features_test,
        labels_train, labels_test):
    classifier = SVC(kernel="linear", C=10.0)
    classifier.fit(features_train, labels_train)
    # pred = classifier.predict(features_test)
    return classifier.score(features_test, labels_test)

def extract_predictions(
        features_train, features_test,
        labels_train, labels_test):
    classifier = SVC(kernel="rbf", C=10000.0)
    # train on 1% of the data for speed
    features_train = features_train[:int(len(features_train)/100)]
    labels_train = labels_train[:int(len(labels_train)/100)]
    classifier.fit(features_train, labels_train)
    pred = classifier.predict(features_test)
    quiz = [10, 26, 50]
    for elem in quiz:
        answer = "Chris" if pred[elem] else "Sara"
        print("answer(%i) = %i (%s)" % (elem, pred[elem], answer))

def how_many(features_train, features_test,
             labels_train, labels_test):
    classifier = SVC(kernel="rbf", C=10000.0)
    classifier.fit(features_train, labels_train)
    pred = classifier.predict(features_test)
    # count how many test events fall into each predicted class
    unique, frequency = np.unique(pred, return_counts=True)
    print("(unique, frequency) = ", unique, frequency)

# with full training set:
# (unique, frequency) = [0 1] [892 866]
# with partial training set:
# (unique, frequency) = [0 1] [892 866]
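
# A one-line equivalent sketch for the quiz question "how many test
# events are predicted to be Chris (label 1)?", assuming `pred` holds
# the predictions from how_many above; int() unboxes the numpy count.
def count_chris(pred):
    return int((pred == 1).sum())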

if __name__ == "__main__":
    # accuracy = run_svc(*preprocess(), do_tradeoff=False, k="rbf")
    # print("accuracy = ", accuracy)

    # test for the optimal C parameter
    # test_rbf(*preprocess())

    # use the optimized C parameter on the linear model
    # accuracy = test_optimized_svm(*preprocess())
    # print("accuracy = ", accuracy)

    # predict the author of an e-mail
    # extract_predictions(*preprocess())

    how_many(*preprocess())

#########################################################
#########################################################
'''
You'll be provided with similar code in the quiz, but the code in the
quiz reportedly has an indexing issue, which the lines below are said
to solve.
'''
# features_train = features_train[:int(len(features_train)/100)]
# labels_train = labels_train[:int(len(labels_train)/100)]
# ^ There isn't actually an indexing issue or error here: this slicing
# is for the second quiz, which asks you to demonstrate the
# accuracy-speed tradeoff.
#########################################################