tester.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
#!/usr/bin/env python3
"""A basic script for importing a student's POI identifier and
checking the results that they get from it.

Requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively.

That process should happen at the end of poi_id.py.
"""
  9. import pickle
  10. import sys
  11. from sklearn.model_selection import StratifiedShuffleSplit
  12. import os
  13. sys.path.append(os.path.abspath(("../tools/")))
  14. from feature_format import featureFormat, targetFeatureSplit
  15. PERF_FORMAT_STRING = "\
  16. \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
  17. Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
  18. RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
  19. \tFalse negatives: {:4d}\tTrue negatives: {:4d}"
  20. def test_classifier(clf, dataset, feature_list, folds = 1000):
  21. data = featureFormat(dataset, feature_list, sort_keys = True)
  22. labels, features = targetFeatureSplit(data)
  23. cv = StratifiedShuffleSplit(folds, random_state = 42)
  24. true_negatives = 0
  25. false_negatives = 0
  26. true_positives = 0
  27. false_positives = 0
  28. for train_idx, test_idx in cv.split(features, labels):
  29. features_train = []
  30. features_test = []
  31. labels_train = []
  32. labels_test = []
  33. for ii in train_idx:
  34. features_train.append( features[ii] )
  35. labels_train.append( labels[ii] )
  36. for jj in test_idx:
  37. features_test.append( features[jj] )
  38. labels_test.append( labels[jj] )
  39. ### fit the classifier using training set, and test on test set
  40. clf.fit(features_train, labels_train)
  41. predictions = clf.predict(features_test)
  42. for prediction, truth in zip(predictions, labels_test):
  43. if prediction == 0 and truth == 0:
  44. true_negatives += 1
  45. elif prediction == 0 and truth == 1:
  46. false_negatives += 1
  47. elif prediction == 1 and truth == 0:
  48. false_positives += 1
  49. elif prediction == 1 and truth == 1:
  50. true_positives += 1
  51. else:
  52. print("Warning: Found a predicted label not == 0 or 1.")
  53. print("All predictions should take value 0 or 1.")
  54. print("Evaluating performance for processed predictions:")
  55. break
  56. try:
  57. total_predictions = true_negatives + false_negatives + false_positives + true_positives
  58. accuracy = 1.0*(true_positives + true_negatives)/total_predictions
  59. precision = 1.0*true_positives/(true_positives+false_positives)
  60. recall = 1.0*true_positives/(true_positives+false_negatives)
  61. f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
  62. f2 = float()
  63. if precision == 0 and recall == 0:
  64. f2 = 0
  65. else:
  66. f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
  67. print(clf)
  68. print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
  69. print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
  70. print("")
  71. except ZeroDivisionError:
  72. print("Got a divide by zero when trying out:", clf)
  73. print("Precision or recall may be undefined due to a lack of true positive predicitons.")
  74. CLF_PICKLE_FILENAME = "my_classifier.pkl"
  75. DATASET_PICKLE_FILENAME = "my_dataset.pkl"
  76. FEATURE_LIST_FILENAME = "my_feature_list.pkl"
  77. def dump_classifier_and_data(clf, dataset, feature_list):
  78. with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
  79. pickle.dump(clf, clf_outfile)
  80. with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
  81. pickle.dump(dataset, dataset_outfile)
  82. with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
  83. pickle.dump(feature_list, featurelist_outfile)
  84. def load_classifier_and_data():
  85. with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
  86. clf = pickle.load(clf_infile)
  87. with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
  88. dataset = pickle.load(dataset_infile)
  89. with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
  90. feature_list = pickle.load(featurelist_infile)
  91. return clf, dataset, feature_list
  92. def main():
  93. ### load up student's classifier, dataset, and feature_list
  94. clf, dataset, feature_list = load_classifier_and_data()
  95. ### Run testing script
  96. test_classifier(clf, dataset, feature_list)
  97. if __name__ == '__main__':
  98. main()