# linear_discriminant_analysis.py
# conda activate PANDA
# https://wellsr.com/python/linear-discriminant-analysis-for-dimensionality-reduction-in-python/
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
  12. header_list = ["Preg", "Glucose", "BP", "skinThick", "Insulin", "BMI", "DPF", "Age", "Class"]
  13. diabetes_ds = pd.read_csv(r"pima-indians-diabetes.csv", names = header_list)
  14. print(diabetes_ds.head())
  15. features = diabetes_ds.drop(['Class'], axis=1)
  16. labels = diabetes_ds["Class"]
  17. X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=0)
  18. # print(X_train.head(6))
  19. X_Test = X_test
  20. stand_scal = StandardScaler()
  21. X_train = stand_scal.fit_transform(X_train)
  22. X_test = stand_scal.transform(X_test)
  23. model = RandomForestClassifier(n_estimators=50, random_state=0)
  24. model.fit(X_train, y_train)
  25. y_pred = model.predict(X_test)
  26. # print('accuracy Default Features = ', accuracy_score(y_test, y_pred))
  27. # print("X_test[13] = ", X_test[13])
  28. # print('y_test = ', y_test.head(13))
  29. # print(y_pred)
  30. # Dimensionality Reduction with Linear Discriminant Analysis
  31. lda_model = LDA(n_components = 1)
  32. X_train_lda = lda_model.fit_transform(X_train, y_train)
  33. X_test_lda = lda_model.transform(X_test)
  34. # print(X_train_lda.shape)
  35. # print(X_test_lda.shape)
  36. model = RandomForestClassifier(n_estimators=50, random_state=0)
  37. model.fit(X_train_lda, y_train)
  38. y_pred = model.predict(X_test_lda)
  39. # print('accuracy LDA = ', accuracy_score(y_test, y_pred))
  40. # print("X_test_lda[13] = ", X_test_lda[13])
  41. # # print(y_test.head(13))
  42. # print('y_pred = ', y_pred)
  43. # print('y_pred[13] = ', y_pred[13])
  44. # Comparing LDA to PCA
  45. pca_model = PCA(n_components= 1)
  46. X_train_pca = pca_model.fit_transform(X_train)
  47. X_test_pca = pca_model.transform(X_test)
  48. model = RandomForestClassifier(n_estimators=50, random_state=0)
  49. model.fit(X_train_pca, y_train)
  50. y_pred = model.predict(X_test_pca)
  51. # print('accuracy PCA = ', accuracy_score(y_test, y_pred))
  52. # print("X_train_pca[13] = ", X_train_pca[13])
  53. # print('y_pred = ', y_pred)
  54. # print('y_pred[13] = ', y_pred[13])
  55. '''
  56. Приведенный выше результат показывает, что при сокращении набора признаков до 1 компонента
  57. с помощью метода анализа главных компонент (PCA) мы получаем точность 66,23 %, что примерно
  58. на 11 % меньше, чем 77,92 %, достигнутых с помощью метода LDA. Это ясно показывает, что для
  59. нашего набора данных LDA является более подходящей методикой сокращения размерности, и
  60. должно убедить вас в том, что вам нужно играть с различными методиками при сокращении
  61. размерности в ваших собственных наборах данных.
  62. '''
  63. print('y_pred = ', y_pred)
  64. print('X_Test = \n', X_Test.head(6))
  65. X_Test['pred'] = y_pred
  66. # X_Test.to_csv('data1.csv')
  67. print('X_Test = \n', X_Test.head(6))