explore_enron_data.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. #!/usr/bin/python3
  2. """
  3. Starter code for exploring the Enron dataset (emails + finances);
  4. loads up the dataset (pickled dict of dicts).
  5. The dataset has the form:
  6. enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
  7. {features_dict} is a dictionary of features associated with that person.
  8. You should explore features_dict as part of the mini-project,
  9. but here's an example to get you started:
  10. enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
  11. """
  12. import joblib
  13. enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb"))
  14. import argparse
  15. import random
  16. # import sys
  17. # sys.path.append("../tools/")
  18. # import feature_format
  19. def highest_paid(dataset):
  20. highest_paid = 0
  21. the_rich_bitch = str()
  22. for person in dataset:
  23. if dataset[person]["poi"] == False:
  24. continue
  25. total_payments = dataset[person]["total_payments"]
  26. if highest_paid < total_payments:
  27. the_rich_bitch = person
  28. highest_paid = total_payments
  29. return {the_rich_bitch, highest_paid}
  30. def get_person(dataset, interest=True):
  31. ppl = [p for p in dataset \
  32. if dataset[p]["poi"] == interest
  33. ]
  34. x = len(ppl) - 1
  35. index = random.randint(0 , x) # inclusive
  36. print(ppl[index])
  37. print(dataset[ppl[index]])
  38. if __name__ == "__main__":
  39. parser = argparse.ArgumentParser(
  40. description='Project to familiarize with the Enron data set.'
  41. )
  42. parser.add_argument(
  43. 'action',
  44. nargs='?',
  45. type=str,
  46. help='Action to perform (e.g., list_features)'
  47. )
  48. args = parser.parse_args()
  49. if args.action == 'list_features':
  50. print("Showing features available...")
  51. print( enron_data["PRENTICE JAMES"].keys())
  52. if args.action == "summary":
  53. print(len(enron_data),
  54. "people are in the data set."
  55. )
  56. print(len(enron_data["SKILLING JEFFREY K"]),
  57. "available features for each person."
  58. )
  59. no_email = [peep for peep in enron_data \
  60. if enron_data[peep]["email_address"] == 'NaN']
  61. print(len(no_email),
  62. "people don't have email addresses."
  63. )
  64. print(len(enron_data) - len(no_email),
  65. "people have an email address."
  66. )
  67. have_salary = [peep for peep in enron_data \
  68. if enron_data[peep]["salary"] != 'NaN']
  69. print(len(have_salary),
  70. "people have a quantified salary."
  71. )
  72. print(highest_paid(enron_data),
  73. "= the highest paid person"
  74. )
  75. print(enron_data["SKILLING JEFFREY K"]["exercised_stock_options"],
  76. " <-- his value of exercised stock options ($)"
  77. )
  78. pois = [p for p in enron_data if enron_data[p]["poi"] == True]
  79. print("there are %i people of interest in the dataset." % len(pois))
  80. # print( enron_data["PRENTICE JAMES"]["total_stock_value"] )
  81. # print( enron_data["COLWELL WESLEY"]["from_this_person_to_poi"])
  82. if args.action == "show_poi":
  83. print("This is someone who is a person of interest.")
  84. nobody = get_person(enron_data, interest=True)
  85. if args.action == "show_nonpoi":
  86. print("This is someone who is not a person of interest.")
  87. nobody = get_person(enron_data, interest=False)
  88. if args.action == "missing_data":
  89. missing_fin_data = [p for p in enron_data \
  90. if enron_data[p]["total_payments"] == 'NaN']
  91. how_many = len(missing_fin_data)
  92. how_many_pct = how_many/len(enron_data) * 100
  93. print("%i (%0.2f%%) don't have financial data." \
  94. % (how_many, how_many_pct))
  95. # pois = [p for p in enron_data \
  96. # if enron_data[p]["poi"] == True \
  97. # and enron_data[p]["total_payments"] == 'NaN']