# pca.py
  1. #!/usr/bin/env python
  2. """ a small class for Principal Component Analysis
  3. Usage:
  4. p = PCA( A, fraction=0.90 )
  5. In:
  6. A: an array of e.g. 1000 observations x 20 variables, 1000 rows x 20 columns
  7. fraction: use principal components that account for e.g.
  8. 90 % of the total variance
  9. Out:
  10. p.U, p.d, p.Vt: from numpy.linalg.svd, A = U . d . Vt
  11. p.dinv: 1/d or 0, see NR
  12. p.eigen: the eigenvalues of A.T A, in decreasing order (p.d**2).
  13. eigen[j] / eigen.sum() is variable j's fraction of the total variance;
  14. look at the first few eigen[] to see how many PCs get to 90 %, 95 % ...
  15. p.npc: number of principal components,
  16. e.g. 2 if the top 2 eigenvalues are >= `fraction` of the total.
  17. It's ok to change this; methods use the current value.
  18. Methods:
  19. The methods of class PCA transform vectors or arrays of e.g.
  20. 20 variables, 2 principal components and 1000 observations,
  21. using partial matrices U' d' Vt', parts of the full U d Vt:
  22. A ~ U' . d' . Vt' where e.g.
  23. U' is 1000 x 2
  24. d' is diag([ d0, d1 ]), the 2 largest singular values
  25. Vt' is 2 x 20. Dropping the primes,
  26. d . Vt 2 principal vars = p.vars_pc( 20 vars )
  27. U 1000 obs = p.pc_obs( 2 principal vars )
  28. U . d . Vt 1000 obs, p.obs( 20 vars ) = pc_obs( vars_pc( vars ))
  29. fast approximate A . vars, using the `npc` principal components
  30. Ut 2 pcs = p.obs_pc( 1000 obs )
  31. V . dinv 20 vars = p.pc_vars( 2 principal vars )
  32. V . dinv . Ut 20 vars, p.vars( 1000 obs ) = pc_vars( obs_pc( obs )),
  33. fast approximate Ainverse . obs: vars that give ~ those obs.
  34. Notes:
  35. PCA does not center or scale A; you usually want to first
  36. A -= A.mean(axis=0)
  37. A /= A.std(axis=0)
  38. with the little class Center or the like, below.
  39. See also:
  40. http://en.wikipedia.org/wiki/Principal_component_analysis
  41. http://en.wikipedia.org/wiki/Singular_value_decomposition
  42. Press et al., Numerical Recipes (2 or 3 ed), SVD
  43. PCA micro-tutorial
  44. iris-pca .py .png
  45. """
  46. from __future__ import division
  47. import numpy
  48. dot = numpy.dot
  49. # import bz.numpyutil as nu
  50. # dot = nu.pdot
  51. __version__ = "2010-04-14 apr"
  52. __author_email__ = "denis-bz-py at t-online dot de"
  53. # From http://stackoverflow.com/questions/1730600/principal-component-analysis-in-python
  54. # Accessed 19-June-2011
  55. #...............................................................................
  56. class PCA:
  57. def __init__( self, A, fraction=0.90 ):
  58. assert 0 <= fraction <= 1
  59. # A = U . diag(d) . Vt, O( m n^2 ), lapack_lite --
  60. self.U, self.d, self.Vt = numpy.linalg.svd( A, full_matrices=False )
  61. assert numpy.all( self.d[:-1] >= self.d[1:] ) # sorted
  62. self.eigen = self.d**2
  63. self.sumvariance = numpy.cumsum(self.eigen)
  64. self.sumvariance /= self.sumvariance[-1]
  65. self.npc = numpy.searchsorted( self.sumvariance, fraction ) + 1
  66. self.dinv = numpy.array([ 1/d if d > self.d[0] * 1e-6 else 0
  67. for d in self.d ])
  68. def pc( self ):
  69. """ e.g. 1000 x 2 U[:, :npc] * d[:npc], to plot etc. """
  70. n = self.npc
  71. return self.U[:, :n] * self.d[:n]
  72. # These 1-line methods may not be worth the bother;
  73. # then use U d Vt directly --
  74. def vars_pc( self, x ):
  75. n = self.npc
  76. return self.d[:n] * dot( self.Vt[:n], x.T ).T # 20 vars -> 2 principal
  77. def pc_vars( self, p ):
  78. n = self.npc
  79. return dot( self.Vt[:n].T, (self.dinv[:n] * p).T ) .T # 2 PC -> 20 vars
  80. def pc_obs( self, p ):
  81. n = self.npc
  82. return dot( self.U[:, :n], p.T ) # 2 principal -> 1000 obs
  83. def obs_pc( self, obs ):
  84. n = self.npc
  85. return dot( self.U[:, :n].T, obs ) .T # 1000 obs -> 2 principal
  86. def obs( self, x ):
  87. return self.pc_obs( self.vars_pc(x) ) # 20 vars -> 2 principal -> 1000 obs
  88. def vars( self, obs ):
  89. return self.pc_vars( self.obs_pc(obs) ) # 1000 obs -> 2 principal -> 20 vars
  90. class Center:
  91. """ A -= A.mean() /= A.std(), inplace -- use A.copy() if need be
  92. uncenter(x) == original A . x
  93. """
  94. # mttiw
  95. def __init__( self, A, axis=0, scale=True, verbose=1 ):
  96. self.mean = A.mean(axis=axis)
  97. if verbose:
  98. print "Center -= A.mean:", self.mean
  99. A -= self.mean
  100. if scale:
  101. std = A.std(axis=axis)
  102. self.std = numpy.where( std, std, 1. )
  103. if verbose:
  104. print "Center /= A.std:", self.std
  105. A /= self.std
  106. else:
  107. self.std = numpy.ones( A.shape[-1] )
  108. self.A = A
  109. def uncenter( self, x ):
  110. return numpy.dot( self.A, x * self.std ) + numpy.dot( x, self.mean )
  111. def pca(A, fraction = .9, center = True):
  112. if type(A) is not numpy.ndarray:
  113. A = numpy.array(A, numpy.float)
  114. if center:
  115. Center(A)
  116. p = PCA(A, fraction=fraction)
  117. return p
  118. #...............................................................................
  119. if __name__ == "__main__":
  120. import sys
  121. if len(sys.argv) < 2:
  122. print 'Usage: %s filename' % sys.argv[0]
  123. sys.exit(-1)
  124. csv = sys.argv[1]
  125. fraction = .90
  126. numpy.set_printoptions( 1, threshold=100, suppress=True ) # .1f
  127. A = numpy.genfromtxt( csv, delimiter="," )
  128. # Remove headers, both the names and the titles
  129. A = A[1:,1:]
  130. N, K = A.shape
  131. print "csv: %s N: %d K: %d fraction: %.2g" % (csv, N, K, fraction)
  132. print "PCA ..." ,
  133. p = pca(A, fraction)
  134. print "npc:", p.npc
  135. print "% variance:", p.sumvariance * 100
  136. # Convery to a numpy array to allow for more elaborate array slicing
  137. pc = numpy.ma.array(p.pc())
  138. print "Vt[0], weights that give PC 0:", p.Vt[0]
  139. print "A . Vt[0]:", dot( A, p.Vt[0] )
  140. print "pc:", p.pc()
  141. import numpy
  142. from matplotlib.figure import Figure
  143. from matplotlib.backends.backend_agg import FigureCanvasAgg
  144. # Plot the first two PCA components
  145. fig = Figure(figsize=(8,6))
  146. ax = fig.add_subplot(111)
  147. ax.scatter(pc[:, 0], pc[:, 1])
  148. canvas = FigureCanvasAgg(fig)
  149. canvas.print_figure('pca.png')
  150. print "\nobs <-> pc <-> x: with fraction=1, diffs should be ~ 0"
  151. x = numpy.ones(K)
  152. # x = numpy.ones(( 3, K ))
  153. print "x:", x
  154. pc = p.vars_pc(x) # d' Vt' x
  155. print "vars_pc(x):", pc
  156. print "back to ~ x:", p.pc_vars(pc)
  157. Ax = dot( A, x.T )
  158. pcx = p.obs(x) # U' d' Vt' x
  159. print "Ax:", Ax
  160. print "A'x:", pcx
  161. print "max |Ax - A'x|: %.2g" % numpy.linalg.norm( Ax - pcx, numpy.inf )
  162. b = Ax # ~ back to original x, Ainv A x
  163. back = p.vars(b)
  164. print "~ back again:", back
  165. print "max |back - x|: %.2g" % numpy.linalg.norm( back - x, numpy.inf )
  166. # end pca.py