1  """ 
  2  Mixture models for multi-dimensional data. 
  3   
  4  Reference: Hirsch M, Habeck M. - Bioinformatics. 2008 Oct 1;24(19):2184-92 
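
Example (a sketch; C{X} stands for an (M,N,3) coordinate array of an
ensemble of M structures, and the import path is an assumption):

>>> from csb.statistics.mixtures import SegmentMixture
>>> mixture = SegmentMixture(X, 2)
>>> mixture.membership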
  5  """ 
  6  import numpy 
  7   
  8  from abc import ABCMeta, abstractmethod 


class GaussianMixture(object):
    """
    Gaussian mixture model for multi-dimensional data.
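
    Basic usage (a sketch; assumes C{X} is an (M,...) numpy array with
    samples along the first axis):

    >>> mixture = GaussianMixture(X, 3)    # fit a 3-component model
    >>> mixture.w                          # component weights
    >>> mixture.membership                 # most likely component per point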
    """
    _axis = None

    # hyperparameters of the (assumed) inverse-gamma prior on the
    # component variances; MIN_SIGMA bounds the estimated standard
    # deviations from below
    ALPHA_SIGMA = 0.0001
    BETA_SIGMA = 0.01
    MIN_SIGMA = 0.0

    use_cache = True

    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi-dimensional input data with samples along first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train model
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        self._dimension = numpy.prod(X.shape) // N

        # initialize with a hard segmentation into K contiguous blocks
        c = numpy.linspace(0, K, N, False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()

    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

    def del_cache(self):
        """Clear model parameter cache (force recalculation)"""
        self._w = None
        self._sigma = None
        self._delta = None

    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, value):
        value = numpy.asarray(value)
        if value.shape != self._means.shape:
            raise ValueError('expected shape {0}'.format(self._means.shape))
        self._means = value
        # cached sigma/delta/w are no longer valid
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, value):
        value = numpy.asarray(value)
        if value.shape != self._scales.shape:
            raise ValueError('expected shape {0}'.format(self._scales.shape))
        self._scales = value
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            # posterior estimate of the component variances under the
            # inverse-gamma prior (ALPHA_SIGMA, BETA_SIGMA); this body is
            # a reconstruction consistent with C{delta} and C{scales}
            alpha = self.ALPHA_SIGMA + 0.5 * self.dimension * self.scales.sum(1)
            beta = self.BETA_SIGMA + 0.5 * (self.scales * self.delta.T).sum(1)
            self._sigma = numpy.sqrt(beta / alpha).clip(self.MIN_SIGMA)
        return self._sigma

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            self._delta = numpy.transpose([[d.sum()
                for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                    for m in range(self.M)], 0, self._axis)]
                for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()

    @property
    def log_likelihood_ext(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum
        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        return sum(n * log(self.w)) - 0.5 * \
                (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

    def datapoint(self, m, k):
        """
        Training point number C{m} as if it belonged to component C{k}
        @rtype: numpy array
        """
        return self._X[m]

    def estimate_means(self):
        """
        Update means from current model and samples
        """
        n = self.scales.sum(1)
        self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
            for m in range(self.M)], 0) / n[k]
            for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update scales (posterior component memberships) from current model
        and samples

        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp

        # tempered posterior responsibilities; this body is a reconstruction
        # consistent with C{log_likelihood}
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = beta * (log(self.w) - 0.5 *
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq)))
        log_p = log_p.T - log_sum_exp(log_p.T)

        self.scales = numpy.exp(log_p)

    def e_step(self, beta=1.0):
        """
        Expectation step for EM
        @param beta: inverse temperature
        @type beta: float
        """
        self.estimate_scales(beta)

    def m_step(self):
        """
        Maximization step for EM
        """
        self.estimate_means()

    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
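
        Example (hypothetical tolerances):

        >>> mixture.em(n_iter=200, eps=1e-6)
        >>> mixture.log_likelihood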
        """
        LL_prev = -numpy.inf
        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

    def anneal(self, betas):
        """
        Deterministic annealing

        @param betas: sequence of inverse temperatures
        @type betas: iterable of floats
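
        Example (the heating schedule shown is an assumption, not part of
        the original interface):

        >>> mixture.anneal(numpy.linspace(0.1, 1.0, 50))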
        """
        for beta in betas:
            self.m_step()
            self.e_step(beta)

    def increment_K(self, train=True):
        """
        Split the component with the largest sigma

        @returns: new instance of mixture with incremented C{K}
        @rtype: L{GaussianMixture} subclass
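
        Example (sketch):

        >>> richer = mixture.increment_K()    # K+1 components, trained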
        """
        i = self.sigma.argmax()

        # duplicate the scales of the broadest component
        Z = numpy.vstack([self.scales, self.scales[i]])

        # and divide them between the old and the new component
        mask = Z[i].cumsum() / Z[i].sum() > 0.5
        Z[i, mask] *= 0.0
        Z[-1, ~mask] *= 0.0

        new = type(self)(self._X, self.K + 1, False, self._axis)
        new.scales = Z
        new.m_step()
        if train:
            new.em()

        return new

    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator with mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
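
        Example (sketch):

        >>> for mixture in GaussianMixture.series(X, 1, 5):
        ...     print(mixture.K, mixture.BIC)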
        """
        mixture = cls(X, start)
        yield mixture

        for K in range(start + 1, stop):
            mixture = mixture.increment_K()
            yield mixture

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess the best C{K}
        according to L{BIC<GaussianMixture.BIC>}.

        @param X: multi-dimensional input data with samples along first axis
        @type X: (M,...) numpy array

        @param K: number of components (C{0} triggers the BIC-based guess)
        @type K: int

        @return: Mixture instance
        @rtype: L{GaussianMixture} subclass
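
        Example (sketch):

        >>> mixture = GaussianMixture.new(X)    # K selected via BIC
        >>> mixture.K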
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # grow K until the BIC stops improving
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture

    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Membership array
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)

    def overlap(self, other):
        """
        Similarity of two mixtures measured in membership overlap

        @param other: Mixture or membership array
        @type other: L{GaussianMixture} or sequence

        @return: segmentation overlap
        @rtype: float in interval [0.0, 1.0]
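
        Example with a hand-made reference segmentation (hypothetical
        labels):

        >>> mixture.overlap([0, 0, 0, 1, 1, 1])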
        """
        if isinstance(other, GaussianMixture):
            other_w = other.membership
            K = min(self.K, other.K)
        elif isinstance(other, (list, tuple, numpy.ndarray)):
            other_w = other
            K = min(self.K, len(set(other)))
        else:
            raise TypeError('other must be a mixture or a membership sequence')

        self_w = self.membership
        if len(self_w) != len(other_w):
            raise ValueError('self.N != other.N')

        # count matches between the K most frequent label pairings
        ww = tuple(zip(self_w, other_w))
        same = sum(sorted(ww.count(i) for i in set(ww))[-K:])

        return float(same) / len(ww)


class AbstractStructureMixture(GaussianMixture):
    """
    Abstract mixture model for protein structure ensembles.
    """
    __metaclass__ = ABCMeta

    def __init__(self, X, K, *args, **kwargs):
        if len(X.shape) != 3 or X.shape[-1] != 3:
            raise ValueError('X must be an array of shape (M,N,3)')

        self._R = numpy.zeros((len(X), K, 3, 3))
        self._t = numpy.zeros((len(X), K, 3))

        super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)

    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # structure m mapped into the frame of component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        # assumption: the maximization step updates the means and then
        # re-estimates the superpositions
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate superpositions (rotations and translations)
        """
        raise NotImplementedError


class SegmentMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of segments

    If C{X} is the coordinate array of a protein structure ensemble which
    can be decomposed into 2 rigid segments, the segmentation will be found by:

    >>> mixture = SegmentMixture(X, 2)

    The segment membership of each atom is given by:

    >>> mixture.membership
    array([0, 0, 0, ..., 1, 1, 1])
    """
    _axis = 1

    def estimate_T(self):
        from csb.bio.utils import wfit
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])

    def estimate_means(self):
        # unweighted average over superposed structures: the membership
        # weights already enter through the weighted fits in estimate_T
        self.means = numpy.mean([[self.datapoint(m, k)
            for m in range(self.M)]
            for k in range(self.K)], 1)