
Source Code for Module csb.statistics.mixtures

  1  """ 
  2  Mixture models for multi-dimensional data. 
  3   
  4  Reference: Hirsch M, Habeck M. - Bioinformatics. 2008 Oct 1;24(19):2184-92 
  5  """ 
  6  import numpy 
  7   
  8  from abc import ABCMeta, abstractmethod 
class GaussianMixture(object):
    """
    Gaussian mixture model for multi-dimensional data.
    """
    _axis = None

    # prior for variance (inverse Gamma distribution)
    ALPHA_SIGMA = 0.0001
    BETA_SIGMA = 0.01
    MIN_SIGMA = 0.0

    use_cache = True

    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train the model
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        # integer division keeps the dimension an int under Python 3
        self._dimension = numpy.prod(X.shape) // N

        # initial hard assignment: position n along the component axis starts
        # in component floor(n * K / N), i.e. K contiguous blocks
        c = numpy.linspace(0, K, N, False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()

    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of the component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

    def del_cache(self):
        """Clear the model parameter cache (force recalculation)"""
        self._w = None
        self._sigma = None
        self._delta = None

    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, means):
        if means.shape != self._means.shape:
            raise ValueError('shape mismatch')
        self._means = means
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, scales):
        if scales.shape != self._scales.shape:
            raise ValueError('shape mismatch')
        self._scales = scales
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            alpha = self.dimension * self.scales.sum(1) + self.ALPHA_SIGMA
            beta = (self.delta * self.scales.T).sum(0) + self.BETA_SIGMA
            self._sigma = numpy.sqrt(beta / alpha).clip(self.MIN_SIGMA)
        return self._sigma

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            # for each component k, square the residuals of every sample,
            # bring the component axis to the front and sum out all remaining
            # axes; the final transpose yields shape (N, K)
            self._delta = numpy.transpose([[d.sum()
                    for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                                             for m in range(self.M)], 0, self._axis)]
                    for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood_reduced(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum
        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        return sum(n * log(self.w)) - 0.5 * \
               (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

    def datapoint(self, m, k):
        """
        Training point number C{m} as if it belonged to component C{k}
        @rtype: numpy array
        """
        return self._X[m]

    def estimate_means(self):
        """
        Update means from the current model and samples
        """
        n = self.scales.sum(1)
        self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
                                             for m in range(self.M)], 0) / n[k]
                                  for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update scales from the current model and samples
        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp, exp
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        Z = (log(self.w) - 0.5 * (self.delta / s_sq + self.dimension * log(s_sq))) * beta
        self.scales = exp(Z.T - log_sum_exp(Z.T))

    def randomize_means(self):
        """
        Pick C{K} samples from C{X} as means
        """
        import random
        # sample row indices: random.sample cannot take a numpy array as its
        # population under Python 3
        indices = random.sample(range(self.M), self.K)
        self.means = numpy.asarray(self._X[indices])
        self.estimate_scales()

    def randomize_scales(self, ordered=True):
        """
        Random C{scales} initialization
        """
        from numpy.random import random, multinomial
        if ordered:
            K, N = self.scales.shape
            Ks = numpy.arange(K)
            w = random(K) + (5. * K / N)        # with pseudocounts
            c = numpy.repeat(Ks, multinomial(N, w / w.sum()))
            self.scales = numpy.equal.outer(Ks, c).astype(float)
        else:
            s = random(self.scales.shape)
            self.scales = s / s.sum(0)

        # retry if any component ended up empty
        if 0.0 in self.w:
            self.randomize_scales(ordered)
            return

        self.estimate_means()

    def e_step(self, beta=1.0):
        """
        Expectation step for EM
        @param beta: inverse temperature
        @type beta: float
        """
        self.estimate_scales(beta)

    def m_step(self):
        """
        Maximization step for EM
        """
        self.estimate_means()

    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
        """
        LL_prev = -numpy.inf
        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

    def anneal(self, betas):
        """
        Deterministic annealing

        @param betas: sequence of inverse temperatures
        @type betas: iterable of floats
        """
        for beta in betas:
            self.m_step()
            self.e_step(beta)

    def increment_K(self, train=True):
        """
        Split the component with the largest sigma

        @return: new mixture instance with incremented C{K}
        @rtype: L{GaussianMixture} subclass
        """
        i = self.sigma.argmax()

        # duplicate row i of the scales matrix
        Z = numpy.vstack([self.scales, self.scales[i]])

        # mask two disjoint, equally sized parts along the component axis
        mask = Z[i].cumsum() / Z[i].sum() > 0.5
        Z[i, mask] *= 0.0
        Z[-1, ~mask] *= 0.0

        new = type(self)(self._X, self.K + 1, False, self._axis)
        new.scales = Z
        new.m_step()
        if train:
            new.em()

        return new

    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator over mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
        """
        mixture = cls(X, start)
        yield mixture

        for K in range(start + 1, stop):        #@UnusedVariable
            mixture = mixture.increment_K()
            yield mixture

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess the best C{K}
        according to L{BIC<GaussianMixture.BIC>}.

        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @return: mixture instance
        @rtype: L{GaussianMixture} subclass
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # increase K as long as the next candidate looks better
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture

    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Membership array
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)
379
380 - def overlap(self, other):
381 """ 382 Similarity of two mixtures measured in membership overlap 383 384 @param other: Mixture or membership array 385 @type other: L{GaussianMixture} or sequence 386 387 @return: segmentation overlap 388 @rtype: float in interval [0.0, 1.0] 389 """ 390 if isinstance(other, GaussianMixture): 391 other_w = other.membership 392 K = min(self.K, other.K) 393 elif isinstance(other, (list, tuple, numpy.ndarray)): 394 other_w = other 395 K = min(self.K, len(set(other))) 396 else: 397 raise TypeError('other') 398 399 self_w = self.membership 400 if len(self_w) != len(other_w): 401 raise ValueError('self.N != other.N') 402 403 # position numbers might be permutated, so count equal pairs 404 ww = tuple(zip(self_w, other_w)) 405 same = sum(sorted(ww.count(i) for i in set(ww))[-K:]) 406 407 return float(same) / len(ww)
408
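
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; the helper name and the synthetic
# data are assumptions, not part of the original module): fit a mixture to
# two well-separated 1-D clusters and let BIC pick K.
def _demo_gaussian_mixture():
    numpy.random.seed(0)
    X = numpy.concatenate([numpy.random.normal(0.0, 1.0, (100, 1)),
                           numpy.random.normal(10.0, 1.0, (100, 1))])

    mixture = GaussianMixture.new(X)        # K guessed via BIC, typically 2
    return mixture.K, mixture.membership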


class AbstractStructureMixture(GaussianMixture, metaclass=ABCMeta):
    """
    Abstract mixture model for protein structure ensembles.
    """

    def __init__(self, X, K, *args, **kwargs):
        if len(X.shape) != 3 or X.shape[-1] != 3:
            raise ValueError('X must be an array of shape (M,N,3)')

        self._R = numpy.zeros((len(X), K, 3, 3))
        self._t = numpy.zeros((len(X), K, 3))

        super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)

    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # move structure m into the reference frame of component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate superpositions
        """
        raise NotImplementedError


class SegmentMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of segments.

    If C{X} is the coordinate array of a protein structure ensemble which
    can be decomposed into 2 rigid segments, the segmentation will be found by:

    >>> mixture = SegmentMixture(X, 2)

    The segment membership of each atom is given by:

    >>> mixture.membership
    array([0, 0, 0, ..., 1, 1, 1])
    """
    _axis = 1

    def estimate_T(self):
        from csb.bio.utils import wfit
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])

    def estimate_means(self):
        # superpositions are weighted, so do an unweighted mean here
        self.means = numpy.mean([[self.datapoint(m, k)
                                  for m in range(self.M)]
                                 for k in range(self.K)], 1)


class ConformerMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of conformers.

    If C{mixture} is a trained model, the ensemble coordinate array of
    structures from C{X} which belong to conformation C{k} is given by:

    >>> indices = numpy.where(mixture.membership == k)[0]
    >>> conformer = [mixture.datapoint(m, k) for m in indices]
    """
    _axis = 0

    def estimate_T(self):
        from csb.bio.utils import fit
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = fit(self._X[m], self.means[k])
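
# Usage sketch (added for illustration; the helper name and the random
# "ensemble" are assumptions, and csb.bio.utils must be importable):
# segment an ensemble of M structures with N atoms into two rigid parts.
def _demo_segment_mixture():
    numpy.random.seed(1)
    M, N = 5, 40
    X = numpy.random.normal(size=(M, N, 3))

    mixture = SegmentMixture(X, 2)          # trains via EM on construction
    return mixture.membership               # (N,) segment label per atom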

# vi:expandtab:smarttab:sw=4