"""
Mixture models for multi-dimensional data.

Reference: Hirsch M, Habeck M. Mixture models for protein structure ensembles.
Bioinformatics. 2008 Oct 1;24(19):2184-92.
"""
import numpy

from abc import ABCMeta, abstractmethod


class GaussianMixture(object):
    """
    Gaussian mixture model for multi-dimensional data.
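
    A minimal usage sketch (here C{X} stands for any (M,...) numpy array
    supplied by the caller; the parameter choice is illustrative only):

    >>> mixture = GaussianMixture(X, 3)      # fit 3 components via EM
    >>> weights = mixture.w                  # (3,) component weights
    >>> labels = mixture.membership          # component index per point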
    """
    _axis = None

    ALPHA_SIGMA = 0.0001
    BETA_SIGMA = 0.01
    MIN_SIGMA = 0.0

    use_cache = True

    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train the model
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        self._dimension = numpy.prod(X.shape) // N

        # initial hard assignment: split the component axis into K contiguous blocks
        c = numpy.linspace(0, K, N, False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()

    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

    def del_cache(self):
        """Clear model parameter cache (force recalculation)"""
        self._w = None
        self._sigma = None
        self._delta = None

    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, value):
        # replacing the means invalidates the cached derived parameters
        self._means = numpy.asarray(value)
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, value):
        # replacing the scales invalidates the cached derived parameters
        self._scales = numpy.asarray(value)
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            # regularized variance estimate under the inverse-gamma prior
            # parameterized by ALPHA_SIGMA and BETA_SIGMA
            alpha = self.ALPHA_SIGMA + 0.5 * self.dimension * self.scales.sum(1)
            beta = self.BETA_SIGMA + 0.5 * (self.scales * self.delta.T).sum(1)
            self._sigma = numpy.sqrt(beta / alpha).clip(self.MIN_SIGMA, 1e300)
        return self._sigma

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            self._delta = numpy.transpose([[d.sum()
                                            for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                                                                     for m in range(self.M)], 0, self._axis)]
                                           for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood_reduced(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp

        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum

        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)

        return sum(n * log(self.w)) - 0.5 * \
               (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

    def datapoint(self, m, k):
        """
        Training point number C{m} as if it belonged to component C{k}
        @rtype: numpy array
        """
        return self._X[m]

    def estimate_means(self):
        """
        Update means from the current model and samples
        """
        n = self.scales.sum(1)
        self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
                                             for m in range(self.M)], 0) / n[k]
                                  for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update scales from the current model and samples

        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp

        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))

        # tempered responsibilities: scale the log-posterior by beta and
        # normalize over the components for each data point
        log_p = beta * log_p.T
        self.scales = numpy.exp(log_p - log_sum_exp(log_p))

    def e_step(self, beta=1.0):
        """
        Expectation step for EM
        @param beta: inverse temperature
        @type beta: float
        """
        self.estimate_scales(beta)

    def m_step(self):
        """
        Maximization step for EM
        """
        self.estimate_means()

    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
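
        A sketch of a typical manual training run (C{X} as in L{__init__};
        the parameter values below are illustrative, not recommendations):

        >>> mixture = GaussianMixture(X, 2, train=False)
        >>> mixture.em(n_iter=50, eps=1e-6)
        >>> LL = mixture.log_likelihood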
        """
        LL_prev = -numpy.inf
        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

    def anneal(self, betas):
        """
        Deterministic annealing

        @param betas: sequence of inverse temperatures
        @type betas: iterable of floats
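
        For example, a schedule that heats up to standard EM (the particular
        schedule below is an illustrative choice, not a prescribed one):

        >>> mixture = GaussianMixture(X, 2, train=False)
        >>> mixture.anneal(numpy.linspace(0.1, 1.0, 50))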
        """
        for beta in betas:
            self.m_step()
            self.e_step(beta)

    def increment_K(self, train=True):
        """
        Split the component with the largest sigma

        @param train: train the new mixture
        @type train: bool

        @return: new mixture instance with incremented C{K}
        @rtype: L{GaussianMixture} subclass
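
        For instance, growing an existing trained mixture C{mixture} by
        one component:

        >>> bigger = mixture.increment_K()
        >>> bigger.K == mixture.K + 1
        True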
        """
        i = self.sigma.argmax()

        # duplicate the scales row of the widest component
        Z = numpy.vstack([self.scales, self.scales[i]])

        # split its mass at the median position along the component axis
        mask = Z[i].cumsum() / Z[i].sum() > 0.5
        Z[i, mask] *= 0.0
        Z[-1, ~mask] *= 0.0

        new = type(self)(self._X, self.K + 1, False, self._axis)
        new.scales = Z
        new.m_step()
        if train:
            new.em()

        return new

    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator over mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
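
        A sketch of a model-selection scan (C{X} as above; collecting the
        BIC of each fit is one way to compare the candidates):

        >>> bics = [(m.K, m.BIC) for m in GaussianMixture.series(X, 1, 6)]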
        """
        mixture = cls(X, start)
        yield mixture

        for K in range(start + 1, stop):
            mixture = mixture.increment_K()
            yield mixture

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess the best C{K}
        according to L{BIC<GaussianMixture.BIC>}.

        @param X: multi-dimensional input vector with samples along the first axis
        @type X: (M,...) numpy array

        @return: mixture instance
        @rtype: L{GaussianMixture} subclass
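
        For example, letting the factory pick C{K} (C{X} as above):

        >>> mixture = GaussianMixture.new(X)
        >>> K = mixture.K     # number of components selected via BIC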
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # keep adding components while the BIC improves
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture

    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        Smaller values indicate a better trade-off between goodness of fit
        and model complexity; L{new} uses this criterion to select C{K}.

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Membership array
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)

    def overlap(self, other):
        """
        Similarity of two mixtures measured in membership overlap

        @param other: mixture or membership array
        @type other: L{GaussianMixture} or sequence

        @return: segmentation overlap
        @rtype: float in interval [0.0, 1.0]
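
        A hedged example, comparing a fitted mixture against a known
        segmentation (the label array below is illustrative and must have
        length C{N}):

        >>> score = mixture.overlap([0, 0, 0, 1, 1, 1])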
        """
        if isinstance(other, GaussianMixture):
            other_w = other.membership
            K = min(self.K, other.K)
        elif isinstance(other, (list, tuple, numpy.ndarray)):
            other_w = other
            K = min(self.K, len(set(other)))
        else:
            raise TypeError('expected a mixture instance or a membership sequence')

        self_w = self.membership
        if len(self_w) != len(other_w):
            raise ValueError('self.N != other.N')

        # count how many points fall into the K most populated label pairs
        ww = tuple(zip(self_w, other_w))
        same = sum(sorted(ww.count(i) for i in set(ww))[-K:])

        return float(same) / len(ww)


class AbstractStructureMixture(GaussianMixture):
    """
    Abstract mixture model for protein structure ensembles.
    """
    __metaclass__ = ABCMeta

    def __init__(self, X, K, *args, **kwargs):
        if len(X.shape) != 3 or X.shape[-1] != 3:
            raise ValueError('X must be an array of shape (M,N,3)')

        self._R = numpy.zeros((len(X), K, 3, 3))
        self._t = numpy.zeros((len(X), K, 3))

        super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)

    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # transform structure m into the reference frame of component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        # the M-step must also re-estimate the superpositions
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate superpositions
        """
        raise NotImplementedError


class SegmentMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of segments.

    If C{X} is the coordinate array of a protein structure ensemble which
    can be decomposed into 2 rigid segments, the segmentation is found by:

    >>> mixture = SegmentMixture(X, 2)

    The segment membership of each atom is given by:

    >>> mixture.membership
    array([0, 0, 0, ..., 1, 1, 1])
    """
    _axis = 1

    def estimate_T(self):
        from csb.bio.utils import wfit

        # optimal weighted superposition of each structure onto each component
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])

    def estimate_means(self):
        # with the current superpositions fixed, the means are plain averages
        # of the transformed data points
        self.means = numpy.mean([[self.datapoint(m, k)
                                  for m in range(self.M)]
                                 for k in range(self.K)], 1)
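

if __name__ == '__main__':
    # Minimal smoke test, assuming this module and csb.numeric are importable:
    # draw samples from two well-separated 1D Gaussians and let the BIC-based
    # factory guess the number of components. The printed values depend on the
    # random draw; this block is illustrative only.
    numpy.random.seed(0)

    X = numpy.concatenate([numpy.random.normal(-5.0, 1.0, 100),
                           numpy.random.normal(+5.0, 1.0, 100)])

    mixture = GaussianMixture.new(X)
    print('K = {0}, BIC = {1:.2f}'.format(mixture.K, mixture.BIC))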