- # conda activate PANDA
- # https://machinelearningknowledge.ai/tutorial-for-k-means-clustering-in-python-sklearn/
- # https://gist.githubusercontent.com/ryanorsinger/cb1222e506c1266b9cc808143ddbab82/raw/b2fe8213426159be7f9c8de108726d3d814153eb/mall_customers.csv
- import os
- nthreads = 1
- os.environ["OMP_NUM_THREADS"] = str(nthreads)
- '''
- UserWarning: KMeans is known to have a memory leak on Windows with MKL,
- when there are less chunks than available threads. You can avoid it by
- setting the environment variable OMP_NUM_THREADS=1
- '''
- # https://stackoverflow.com/questions/73391779/setting-number-of-threads-in-python
- '''
- KMeans benefits from OpenMP-based parallelism via Cython. Small chunks of data
- (256 samples) are processed in parallel, which in addition keeps the memory
- footprint low. See the scikit-learn parallelism notes for more details on how
- to control the number of threads.
- '''
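- # As an alternative to the environment variable, thread usage can also be limited
- # per code block with the threadpoolctl package (a scikit-learn dependency).
- # A minimal commented sketch, assuming threadpoolctl is installed:
- # from threadpoolctl import threadpool_limits
- # with threadpool_limits(limits=1, user_api='openmp'):
- #     KMeans(n_clusters=2).fit(X)  # OpenMP restricted to 1 thread inside this block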
- # https://scikit-learn.ru/clustering/
- import pandas as pd
- import seaborn as sns
- from matplotlib import pyplot as plt
- import sklearn.cluster as cluster
- import sklearn.metrics as metrics
- from sklearn.cluster import KMeans
- from sklearn.preprocessing import MinMaxScaler
- df = pd.read_csv(r"Mall_Customers.csv")
- print(df.head())
- print('\n', df.shape)
- scaler = MinMaxScaler()
- scale = scaler.fit_transform(df[['annual_income','spending_score']])
- df_scale = pd.DataFrame(scale, columns=['annual_income', 'spending_score'])
- print(df_scale.head(10))
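- # K-Means is distance-based, so features on different ranges can dominate the
- # distance computation; MinMaxScaler puts both features on a common 0-1 range.
- # A quick sketch (not in the source tutorial) clustering the scaled copy:
- km_scaled = KMeans(n_clusters=2)
- print(km_scaled.fit_predict(df_scale))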
- km = KMeans(n_clusters=2)
- # y_predicted = km.fit_predict(df[['annual_income','spending_score']])
- y_predicted = km.fit_predict(df[['age','annual_income','spending_score']])
- # y_predicted = km.fit_predict(df_scale[['annual_income','spending_score']])
- print(y_predicted)
- # the centroids of the two clusters:
- print(km.cluster_centers_)
- #print(km.predict(df[['annual_income','spending_score']]))
- df['Clusters'] = km.labels_
- print(df.head(21))
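- # To interpret the two clusters, per-cluster feature averages help (a small
- # addition, not in the source tutorial):
- print(df.groupby('Clusters')[['age', 'annual_income', 'spending_score']].mean())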
- # sns.scatterplot(x="annual_income", y="spending_score", hue='Clusters', data=df, palette='viridis')
- # # km was fit on ['age', 'annual_income', 'spending_score'], so columns 1 and 2
- # # of cluster_centers_ hold annual_income and spending_score:
- # plt.scatter(km.cluster_centers_[0][1], km.cluster_centers_[0][2], label='centroid of cluster 0')
- # plt.scatter(km.cluster_centers_[1][1], km.cluster_centers_[1][2], label='centroid of cluster 1', color='red')
- # # plt.scatter(km.cluster_centers_[2][1], km.cluster_centers_[2][2], label='centroid of cluster 2', color='violet')
- # plt.grid()
- # plt.legend()
- # plt.show()
- # # sns.scatterplot(x="age", y="spending_score", hue='Clusters', data=df, palette='viridis')
- # sns.scatterplot(x="age", y="annual_income", hue='Clusters', data=df, palette='viridis')
- # plt.scatter(km.cluster_centers_[0][0], km.cluster_centers_[0][1], label='centroid of cluster 0')
- # plt.scatter(km.cluster_centers_[1][0], km.cluster_centers_[1][1], label='centroid of cluster 1', color='red')
- # # plt.scatter(km.cluster_centers_[2][0], km.cluster_centers_[2][1], label='centroid of cluster 2', color='violet')
- # plt.grid()
- # plt.legend()
- # plt.show()
- '''
- Applying PCA
- Now let us reduce the dimensionality of the dataset to two principal components.
- '''
- from sklearn.decomposition import PCA
- pca = PCA(n_components=2)
- # principalComponents = pca.fit_transform(df_scale)
- principalComponents = pca.fit_transform(df[['age','annual_income','spending_score']])
- pca_df = pd.DataFrame(data=principalComponents,
-                       columns=['principal component 1', 'principal component 2'])
- print(pca_df.head())
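- # A useful sanity check (not in the source tutorial): how much of the original
- # variance the two principal components retain.
- print(pca.explained_variance_ratio_)        # per-component share of variance
- print(pca.explained_variance_ratio_.sum())  # total variance kept by 2 components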
- km = KMeans(n_clusters=2, init='k-means++')
- y_predicted = km.fit_predict(pca_df)
- print(y_predicted)
- # the centroids of the two clusters:
- print(km.cluster_centers_)
- pca_df['Clusters'] = km.labels_
- print(pca_df.head(21))
- # sns.scatterplot(x="principal component 1", y="principal component 2", hue = 'Clusters', data=pca_df, palette='viridis')
- # plt.scatter(km.cluster_centers_[0][0], km.cluster_centers_[0][1], label='центр тяжести кластерa 0')
- # plt.scatter(km.cluster_centers_[1][0], km.cluster_centers_[1][1], label='центр тяжести кластерa 1', color = 'red')
- # # plt.scatter(km.cluster_centers_[2][0], km.cluster_centers_[2][1], label='центр тяжести кластерa 2', color = 'violet')
- # plt.grid()
- # plt.legend()
- # plt.show()
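- # A new customer can be assigned to a cluster by projecting the raw features
- # through the fitted PCA and then calling predict. The feature values below are
- # hypothetical, purely for illustration:
- new_customer = pd.DataFrame([[35, 60, 55]], columns=['age', 'annual_income', 'spending_score'])
- new_pca = pd.DataFrame(pca.transform(new_customer), columns=['principal component 1', 'principal component 2'])
- print(km.predict(new_pca))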
- '''
- Finding the Optimum Value of K
- The Silhouette Method
- Using the silhouette method, the silhouette score is at its maximum for K=6.
- Hence the dataset can be segmented properly with 6 clusters.
- '''
- # Exclude the 'Clusters' column added above so that K-Means and the silhouette
- # score see only the two principal components:
- pca_features = pca_df[['principal component 1', 'principal component 2']]
- for i in range(2, 12):
-     labels = cluster.KMeans(n_clusters=i, random_state=200, init='k-means++').fit(pca_features).labels_
-     print("Silhouette score for k(clusters) = " + str(i) + " is "
-           + str(metrics.silhouette_score(pca_features, labels, metric="euclidean", sample_size=1000, random_state=200)))
- '''
- Silhouette score for k(clusters) = 2 is 0.3929352994462861
- Silhouette score for k(clusters) = 3 is 0.45094150175950004
- Silhouette score for k(clusters) = 4 is 0.49657104725411494
- Silhouette score for k(clusters) = 5 is 0.4639067957511873
- Silhouette score for k(clusters) = 6 is 0.5342952101268506
- Silhouette score for k(clusters) = 7 is 0.45106742826242596
- Silhouette score for k(clusters) = 8 is 0.4937480874760602
- Silhouette score for k(clusters) = 9 is 0.4753608295862216
- Silhouette score for k(clusters) = 10 is 0.4806759377217517
- Silhouette score for k(clusters) = 11 is 0.46289709248542193
- '''
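- # The elbow method is a common companion check (not in the source tutorial):
- # plot the within-cluster sum of squares (inertia_) against k and look for the
- # bend in the curve.
- inertias = [cluster.KMeans(n_clusters=k, random_state=200, init='k-means++').fit(pca_features).inertia_
-             for k in range(2, 12)]
- plt.plot(range(2, 12), inertias, marker='o')
- plt.xlabel('k')
- plt.ylabel('inertia (within-cluster SSE)')
- plt.grid()
- plt.show()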
- # The final segmentation uses k=5, although k=6 scored slightly higher above:
- kmeans = cluster.KMeans(n_clusters=5)
- kmeans = kmeans.fit(pca_features)
- pca_df['Clusters'] = kmeans.labels_
- sns.scatterplot(x="principal component 1", y="principal component 2", hue='Clusters', data=pca_df, palette='viridis')
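- # The fitted centroids live in the same PCA space as the scatter, so they can be
- # overlaid directly (mirroring the commented centroid plots above):
- plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='red', marker='X', s=120, label='centroids')
- plt.legend()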
- plt.grid()
- plt.show()