# kMeansClustering.py
  1. # conda activate PANDA
  2. # https://machinelearningknowledge.ai/tutorial-for-k-means-clustering-in-python-sklearn/
  3. # https://gist.githubusercontent.com/ryanorsinger/cb1222e506c1266b9cc808143ddbab82/raw/b2fe8213426159be7f9c8de108726d3d814153eb/mall_customers.csv
  4. import os
  5. nthreads = 1
  6. os.environ["OMP_NUM_THREADS"] = str(nthreads)
  7. '''
  8. UserWarning: KMeans is known to have a memory leak on Windows with MKL,
  9. when there are less chunks than available threads. You can avoid it by
  10. setting the environment variable OMP_NUM_THREADS=1
  11. '''
  12. # https://stackoverflow.com/questions/73391779/setting-number-of-threads-in-python
  13. '''
  14. KMeans преимущества параллелизма на основе OpenMP через Cython. Небольшие порции данных (256 выборок)
  15. обрабатываются параллельно, что, кроме того, снижает объем памяти. Дополнительные сведения о том,
  16. как контролировать количество потоков
  17. '''
  18. # https://scikit-learn.ru/clustering/
  19. from sklearn.cluster import KMeans
  20. from sklearn import preprocessing
  21. import sklearn.cluster as cluster
  22. import sklearn.metrics as metrics
  23. import pandas as pd
  24. from sklearn.preprocessing import MinMaxScaler
  25. import seaborn as sns
  26. from matplotlib import pyplot as plt
  27. df = pd.read_csv(r"Mall_Customers.csv")
  28. print(df.head())
  29. print('\n', df.shape)
  30. scaler = MinMaxScaler()
  31. scale = scaler.fit_transform(df[['annual_income','spending_score']])
  32. df_scale = pd.DataFrame(scale, columns = ['annual_income','spending_score']);
  33. print(df_scale.head(10))
  34. km=KMeans(n_clusters=2)
  35. # y_predicted = km.fit_predict(df[['annual_income','spending_score']])
  36. y_predicted = km.fit_predict(df[['age','annual_income','spending_score']])
  37. # y_predicted = km.fit_predict(df_scale[['annual_income','spending_score']])
  38. print(y_predicted)
  39. # центр тяжести двух кластеров:
  40. print(km.cluster_centers_)
  41. #print(km.predict(df[['annual_income','spending_score']]))
  42. df['Clusters'] = km.labels_
  43. print(df.head(21))
# sns.scatterplot(x="annual_income", y="spending_score", hue='Clusters', data=df, palette='viridis')
# plt.scatter(km.cluster_centers_[0][0], km.cluster_centers_[0][1], label='centroid of cluster 0')
# plt.scatter(km.cluster_centers_[1][0], km.cluster_centers_[1][1], label='centroid of cluster 1', color='red')
# # plt.scatter(km.cluster_centers_[2][0], km.cluster_centers_[2][1], label='centroid of cluster 2', color='violet')
# plt.grid()
# plt.legend()
# plt.show()
# # sns.scatterplot(x="age", y="spending_score", hue='Clusters', data=df, palette='viridis')
# sns.scatterplot(x="age", y="annual_income", hue='Clusters', data=df, palette='viridis')
# plt.scatter(km.cluster_centers_[0][0], km.cluster_centers_[0][1], label='centroid of cluster 0')
# plt.scatter(km.cluster_centers_[1][0], km.cluster_centers_[1][1], label='centroid of cluster 1', color='red')
# # plt.scatter(km.cluster_centers_[2][0], km.cluster_centers_[2][1], label='centroid of cluster 2', color='violet')
# plt.grid()
# plt.legend()
# plt.show()
  59. '''
  60. Applying PCA
  61. Now let us reduce the dimensionality of the dataset into two components.
  62. '''
  63. from sklearn.decomposition import PCA
  64. pca = PCA(n_components=2)
  65. # principalComponents = pca.fit_transform(df_scale)
  66. principalComponents = pca.fit_transform(df[['age','annual_income','spending_score']])
  67. pca_df = pd.DataFrame(data = principalComponents,
  68. columns = ['principal component 1', 'principal component 2'])
  69. print(pca_df.head())
  70. km=KMeans(n_clusters=2, init='k-means++')
  71. y_predicted = km.fit_predict(pca_df)
  72. print(y_predicted)
  73. # центр тяжести двух кластеров:
  74. print(km.cluster_centers_)
  75. #print(km.predict(df[['annual_income','spending_score']]))
  76. pca_df['Clusters'] = km.labels_
  77. print(pca_df.head(21))
# sns.scatterplot(x="principal component 1", y="principal component 2", hue='Clusters', data=pca_df, palette='viridis')
# plt.scatter(km.cluster_centers_[0][0], km.cluster_centers_[0][1], label='centroid of cluster 0')
# plt.scatter(km.cluster_centers_[1][0], km.cluster_centers_[1][1], label='centroid of cluster 1', color='red')
# # plt.scatter(km.cluster_centers_[2][0], km.cluster_centers_[2][1], label='centroid of cluster 2', color='violet')
# plt.grid()
# plt.legend()
# plt.show()
  85. '''
  86. Finding Optimum Value of K
  87. The Silhouette Method
  88. Using the Silhouette method, it can be seen that the Silhouette value is maximum for K=6.
  89. Hence it can be concluded that the dataset can be segmented properly with 7 clusters.
  90. '''
  91. for i in range(2,12):
  92. labels=cluster.KMeans(n_clusters=i,random_state=200,init='k-means++').fit(pca_df).labels_
  93. print ("Silhouette score for k(clusters) = "+str(i)+" is "
  94. +str(metrics.silhouette_score(pca_df,labels,metric="euclidean",sample_size=1000,random_state=200)))
  95. '''
  96. Silhouette score for k(clusters) = 2 is 0.3929352994462861
  97. Silhouette score for k(clusters) = 3 is 0.45094150175950004
  98. Silhouette score for k(clusters) = 4 is 0.49657104725411494
  99. Silhouette score for k(clusters) = 5 is 0.4639067957511873
  100. Silhouette score for k(clusters) = 6 is 0.5342952101268506
  101. Silhouette score for k(clusters) = 7 is 0.45106742826242596
  102. Silhouette score for k(clusters) = 8 is 0.4937480874760602
  103. Silhouette score for k(clusters) = 9 is 0.4753608295862216
  104. Silhouette score for k(clusters) = 10 is 0.4806759377217517
  105. Silhouette score for k(clusters) = 11 is 0.46289709248542193
  106. '''
  107. kmeans = cluster.KMeans(n_clusters=5)
  108. kmeans = kmeans.fit(pca_df)
  109. pca_df['Clusters'] = kmeans.labels_
  110. sns.scatterplot(x="principal component 1", y="principal component 2",hue = 'Clusters', data=pca_df,palette='viridis')
  111. plt.grid()
  112. plt.show()