123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- #!/usr/bin/python3
- """
- Starter code for exploring the Enron dataset (emails + finances);
- loads up the dataset (pickled dict of dicts).
- The dataset has the form:
- enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
- {features_dict} is a dictionary of features associated with that person.
- You should explore features_dict as part of the mini-project,
- but here's an example to get you started:
- enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
-
- """
- import joblib
# Load the pickled dict-of-dicts described in the module docstring.
# The file is opened in a ``with`` block so the handle is closed as soon as
# loading finishes instead of being left open for the life of the process.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only point this at a dataset file you trust.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = joblib.load(dataset_file)
- import argparse
- import random
- # import sys
- # sys.path.append("../tools/")
- # import feature_format
def highest_paid(dataset):
    """Return the person of interest with the largest ``total_payments``.

    Only entries whose ``"poi"`` flag is truthy are considered. Entries whose
    ``"total_payments"`` is the string ``'NaN'`` (the marker this file uses
    for missing values) are skipped, because comparing a number with a
    string raises ``TypeError`` on Python 3.

    Args:
        dataset: Mapping of person name -> feature dict. Every feature dict
            must contain the keys ``"poi"`` and ``"total_payments"``.

    Returns:
        A two-element set ``{name, total_payments}``. When no person of
        interest has a payment greater than zero the result is ``{"", 0}``.
    """
    top_name = ""
    top_payments = 0
    for name, features in dataset.items():
        if not features["poi"]:
            continue
        payments = features["total_payments"]
        if payments == "NaN":
            # Missing value: it cannot be ordered against a number.
            continue
        if top_payments < payments:
            top_name = name
            top_payments = payments
    # NOTE(review): a set is unordered, so callers cannot rely on which
    # element comes first when iterating or printing. It is kept as a set so
    # the return type stays the same for existing callers.
    return {top_name, top_payments}
def get_person(dataset, interest=True):
    """Print one randomly chosen person whose ``"poi"`` flag equals *interest*.

    The person's name is printed first, followed by their feature dict.

    Args:
        dataset: Mapping of person name -> feature dict. Every feature dict
            must contain the key ``"poi"``.
        interest: Value the ``"poi"`` flag has to equal for a person to be
            eligible (``True`` picks a person of interest, ``False`` picks
            someone who is not).

    Returns:
        None. The result is only printed.

    Raises:
        ValueError: If nobody in *dataset* has ``"poi"`` equal to *interest*.
    """
    candidates = [
        name for name, features in dataset.items()
        if features["poi"] == interest
    ]
    if not candidates:
        # Fail with an explicit message instead of the "empty range" error
        # that drawing a random index from an empty list would produce.
        raise ValueError(
            "no people with poi == %r in the dataset" % (interest,)
        )
    chosen = random.choice(candidates)
    print(chosen)
    print(dataset[chosen])
if __name__ == "__main__":
    # Command-line entry point: run one exploration "action" against the
    # module-level ``enron_data`` dict and print the result.
    parser = argparse.ArgumentParser(
        description='Project to familiarize with the Enron data set.'
    )
    parser.add_argument(
        'action',
        nargs='?',
        type=str,
        help='Action to perform (e.g., list_features)'
    )
    args = parser.parse_args()

    # The actions are mutually exclusive, so dispatch with a single
    # if/elif chain and fall through to the usage text at the end.
    if args.action == 'list_features':
        print("Showing features available...")
        print(enron_data["PRENTICE JAMES"].keys())

    elif args.action == "summary":
        print(len(enron_data),
              "people are in the data set."
              )
        print(len(enron_data["SKILLING JEFFREY K"]),
              "available features for each person."
              )

        # Missing values in the dataset are stored as the string 'NaN'.
        no_email = [peep for peep in enron_data
                    if enron_data[peep]["email_address"] == 'NaN']
        print(len(no_email),
              "people don't have email addresses."
              )
        print(len(enron_data) - len(no_email),
              "people have an email address."
              )

        have_salary = [peep for peep in enron_data
                       if enron_data[peep]["salary"] != 'NaN']
        print(len(have_salary),
              "people have a quantified salary."
              )

        print(highest_paid(enron_data),
              "= the highest paid person"
              )
        print(enron_data["SKILLING JEFFREY K"]["exercised_stock_options"],
              " <-- his value of exercised stock options ($)"
              )

        pois = [p for p in enron_data if enron_data[p]["poi"]]
        print("there are %i people of interest in the dataset." % len(pois))

    elif args.action == "show_poi":
        print("This is someone who is a person of interest.")
        get_person(enron_data, interest=True)

    elif args.action == "show_nonpoi":
        print("This is someone who is not a person of interest.")
        get_person(enron_data, interest=False)

    elif args.action == "missing_data":
        missing_fin_data = [p for p in enron_data
                            if enron_data[p]["total_payments"] == 'NaN']
        how_many = len(missing_fin_data)
        # Guard the division so an empty dataset reports 0% instead of
        # raising ZeroDivisionError.
        how_many_pct = how_many / len(enron_data) * 100 if enron_data else 0.0
        print("%i (%0.2f%%) don't have financial data."
              % (how_many, how_many_pct))
        # Possible follow-up: count the people of interest whose
        # total_payments is 'NaN'.

    else:
        # A missing or unrecognised action would otherwise exit without any
        # output; show the usage text and the supported actions instead.
        parser.print_help()
        print("\nKnown actions: list_features, summary, show_poi, "
              "show_nonpoi, missing_data")
|