poi_id.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  #!/usr/bin/python
# Selects the features to use, loads the pickled project dataset, builds a
# GaussianNB classifier and exports the classifier, the dataset and the
# feature list through dump_classifier_and_data.

import sys
import pickle
import os

# Register ../tools/ (resolved against the current working directory) on the
# module search path before the project-local imports below run.
# Presumably feature_format lives there -- TODO confirm.
sys.path.append(os.path.abspath("../tools/"))

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary']  # You will need to use more features

### Load the dictionary containing the dataset
# NOTE(review): this path is relative to the working directory, while the
# tools path above is relative to the working directory's parent -- confirm
# which directory the script is meant to be launched from.
# NOTE: pickle can execute arbitrary code while loading; only load the dataset
# file that ships with the project.
with open("./final_project/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# The split below is not consumed anywhere else in this script; it is kept as
# a starting point for local experiments.
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)