# conda activate PANDA
# Source tutorial: https://wellsr.com/python/linear-discriminant-analysis-for-dimensionality-reduction-in-python/
import pandas as pd
import numpy as np
import seaborn as sns  # imported in the source tutorial, unused below
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

# Load the Pima Indians Diabetes dataset and name its columns.
header_list = ["Preg", "Glucose", "BP", "skinThick", "Insulin", "BMI", "DPF", "Age", "Class"]
diabetes_ds = pd.read_csv(r"pima-indians-diabetes.csv", names=header_list)
print(diabetes_ds.head())

# Separate the feature matrix from the binary "Class" target.
features = diabetes_ds.drop(['Class'], axis=1)
labels = diabetes_ds["Class"]
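# Optional sanity check (a minimal sketch, not part of the original tutorial):
# confirm the dataset dimensions and the class balance before splitting.
print('shape = ', diabetes_ds.shape)
print(labels.value_counts())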
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=0)
# print(X_train.head(6))

# Keep an unscaled copy of the test features so predictions can be attached
# to the original values later; .copy() avoids mutating a shared DataFrame.
X_Test = X_test.copy()

# Standardize the features: fit the scaler on the training split only, then
# apply the same transform to the test split.
stand_scal = StandardScaler()
X_train = stand_scal.fit_transform(X_train)
X_test = stand_scal.transform(X_test)
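# Quick check (a sketch, not in the original script): after StandardScaler,
# every training column should have mean ~0 and standard deviation ~1.
print('train means ~ 0:', np.round(X_train.mean(axis=0), 2))
print('train stds  ~ 1:', np.round(X_train.std(axis=0), 2))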
# Baseline: random forest trained on all eight standardized features.
model = RandomForestClassifier(n_estimators=50, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# print('accuracy Default Features = ', accuracy_score(y_test, y_pred))
# print("X_test[13] = ", X_test[13])
# print('y_test = ', y_test.head(13))
# print(y_pred)
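# A hedged addition: accuracy alone can mislead on an imbalanced target, so a
# classification report gives per-class precision and recall as well.
# (classification_report is a standard sklearn.metrics function.)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))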
# Dimensionality reduction with Linear Discriminant Analysis (LDA).
# For a two-class problem, LDA can produce at most n_classes - 1 = 1 component.
lda_model = LDA(n_components=1)
X_train_lda = lda_model.fit_transform(X_train, y_train)
X_test_lda = lda_model.transform(X_test)
# print(X_train_lda.shape)
# print(X_test_lda.shape)
model = RandomForestClassifier(n_estimators=50, random_state=0)
model.fit(X_train_lda, y_train)
y_pred = model.predict(X_test_lda)
# print('accuracy LDA = ', accuracy_score(y_test, y_pred))
# print("X_test_lda[13] = ", X_test_lda[13])
# print(y_test.head(13))
# print('y_pred = ', y_pred)
# print('y_pred[13] = ', y_pred[13])
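# Illustrative sketch (not in the original): with a single LDA component, class
# separation can be eyeballed by comparing the per-class means of the projected
# training data; well-separated means suggest the component kept the signal.
print('class 0 mean on LDA axis:', X_train_lda[(y_train == 0).values].mean())
print('class 1 mean on LDA axis:', X_train_lda[(y_train == 1).values].mean())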
# Comparing LDA to PCA: repeat the experiment with a single principal component.
pca_model = PCA(n_components=1)
X_train_pca = pca_model.fit_transform(X_train)
X_test_pca = pca_model.transform(X_test)
model = RandomForestClassifier(n_estimators=50, random_state=0)
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)
# print('accuracy PCA = ', accuracy_score(y_test, y_pred))
# print("X_train_pca[13] = ", X_train_pca[13])
# print('y_pred = ', y_pred)
# print('y_pred[13] = ', y_pred[13])
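# Sketch: PCA picks its component by variance, not by class labels, so it is
# worth seeing how much of the total variance that single component captures.
# explained_variance_ratio_ is a standard attribute of a fitted PCA model.
print('PCA explained variance ratio:', pca_model.explained_variance_ratio_)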
'''
The output above shows that reducing the feature set to a single component
with principal component analysis (PCA) yields an accuracy of 66.23%, roughly
11 percentage points below the 77.92% achieved with LDA. This makes it clear
that, for this dataset, LDA is the better-suited dimensionality reduction
technique, and it should convince you to experiment with several techniques
when reducing the dimensionality of your own datasets.
'''
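# A minimal sketch of the "experiment with settings" advice above: rerun the
# PCA pipeline for every possible component count and report the accuracy.
# (The 8 is the number of input features; everything else reuses objects
# already defined in this script, so the y_pred printed below is unchanged.)
for n in range(1, 9):
    pca_n = PCA(n_components=n)
    Xtr = pca_n.fit_transform(X_train)
    Xte = pca_n.transform(X_test)
    rf = RandomForestClassifier(n_estimators=50, random_state=0)
    rf.fit(Xtr, y_train)
    print(n, 'components -> accuracy =', accuracy_score(y_test, rf.predict(Xte)))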
# Attach the PCA-pipeline predictions to the unscaled test features for review.
print('y_pred = ', y_pred)
print('X_Test = \n', X_Test.head(6))
X_Test['pred'] = y_pred
# X_Test.to_csv('data1.csv')
print('X_Test = \n', X_Test.head(6))