finance_regression.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. #!/usr/bin/python3
  2. """
  3. Starter code for the regression mini-project.
  4. Loads up/formats a modified version of the dataset
  5. (why modified? we've removed some trouble points
  6. that you'll find yourself in the outliers mini-project).
  7. Draws a little scatterplot of the training/testing data
  8. You fill in the regression code where indicated:
  9. """
  10. import os
  11. import sys
  12. import joblib
  13. sys.path.append(os.path.abspath("../tools/"))
  14. from feature_format import featureFormat, targetFeatureSplit
  15. dictionary = joblib.load( open("../final_project/final_project_dataset_modified.pkl", "rb") )
  16. ### list the features you want to look at--first item in the
  17. ### list will be the "target" feature
  18. features_list = ["bonus", "salary"]
  19. data = featureFormat( dictionary, features_list, remove_any_zeroes=True, sort_keys = '../tools/python2_lesson06_keys.pkl')
  20. target, features = targetFeatureSplit( data )
  21. ### training-testing split needed in regression, just like classification
  22. from sklearn.model_selection import train_test_split
  23. feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
  24. train_color = "b"
  25. test_color = "r"
  26. ### Your regression goes here!
  27. ### Please name it reg, so that the plotting code below picks it up and
  28. ### plots it correctly. Don't forget to change the test_color above from "b" to
  29. ### "r" to differentiate training points from test points.
  30. from sklearn import linear_model
  31. reg = linear_model.LinearRegression()
  32. # reg.fit(features, target)
  33. reg.fit(feature_train, target_train)
  34. print(reg.coef_)
  35. print(reg.intercept_)
  36. print(reg.score(feature_test, target_test))
  37. ### draw the scatterplot, with color-coded training and testing points
  38. import matplotlib.pyplot as plt
  39. for feature, target in zip(feature_test, target_test):
  40. plt.scatter( feature, target, color=test_color )
  41. for feature, target in zip(feature_train, target_train):
  42. plt.scatter( feature, target, color=train_color )
  43. ### labels for the legend
  44. plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
  45. plt.scatter(feature_test[0], target_test[0], color=train_color, label="train")
  46. ### draw the regression line, once it's coded
  47. try:
  48. plt.plot( feature_test, reg.predict(feature_test) )
  49. except NameError:
  50. pass
  51. reg.fit(feature_test, target_test)
  52. plt.plot(feature_train, reg.predict(feature_train), color="b")
  53. print(reg.coef_)
  54. plt.xlabel(features_list[1])
  55. plt.ylabel(features_list[0])
  56. plt.legend()
  57. plt.show()