# taxis-kmeansklearn.py
#
# https://pythonru.com/uroki/sklearn-kmeans-i-knn
# https://gitlab.com/PythonRu/notebooks/-/blob/master/classified_data.csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
  12. raw_data = pd.read_csv('taxis.csv')
  13. # raw_data = pd.read_csv('classified_data.csv', index_col = 0)
  14. print(raw_data.head())
  15. raw_data = raw_data.drop(['pickup', 'dropoff','color', 'pickup_zone', 'dropoff_zone', 'pickup_borough',
  16. 'dropoff_borough'], axis=1)
  17. raw_data = raw_data.dropna()
  18. print(raw_data.head())
  19. scaler = StandardScaler()
  20. scaler.fit(raw_data.drop('payment', axis=1))
  21. scaled_features = scaler.transform(raw_data.drop('payment', axis=1))
  22. scaled_data = pd.DataFrame(scaled_features, columns = raw_data.drop('payment', axis=1).columns)
  23. x = scaled_data
  24. y = raw_data['payment']
  25. x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size = 0.3)
  26. model = KNeighborsClassifier(n_neighbors = 1)
  27. model.fit(x_training_data, y_training_data)
  28. predictions = model.predict(x_test_data)
  29. print(predictions)
  30. print(classification_report(y_test_data, predictions))
  31. print(confusion_matrix(y_test_data, predictions))
  32. '''
  33. precision recall f1-score support
  34. cash 0.85 0.88 0.87 544
  35. credit card 0.95 0.94 0.95 1373
  36. accuracy 0.92 1917
  37. macro avg 0.90 0.91 0.91 1917
  38. weighted avg 0.92 0.92 0.92 1917
  39. [[ 479 65]
  40. [ 83 1290]]
  41. '''