1  """ 
  2  Mixture models for multi-dimensional data. 
  3   
  4  Reference: Hirsch M, Habeck M. - Bioinformatics. 2008 Oct 1;24(19):2184-92 
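
Example (a sketch; C{X} stands for an (M,N,3) coordinate array of an
ensemble of M structures, and the import path is an assumption):

>>> from csb.statistics.mixtures import SegmentMixture
>>> mixture = SegmentMixture(X, 2)
>>> mixture.membership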
  5  """ 
  6  import numpy 
  7   
  8  from abc import ABCMeta, abstractmethod 


class GaussianMixture(object):
    """
    Gaussian mixture model for multi-dimensional data.
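
    Basic usage (a sketch; assumes C{X} is an (M,...) numpy array with
    samples along the first axis):

    >>> mixture = GaussianMixture(X, 3)    # fit a 3-component model
    >>> mixture.w                          # component weights
    >>> mixture.membership                 # most likely component per point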
    """
    _axis = None

    # hyperparameters of the (assumed) inverse-gamma prior on the
    # component variances; MIN_SIGMA bounds the estimated standard
    # deviations from below
    ALPHA_SIGMA = 0.0001
    BETA_SIGMA = 0.01
    MIN_SIGMA = 0.0

    use_cache = True

    def __init__(self, X, K, train=True, axis=None):
        """
        @param X: multi-dimensional input data with samples along first axis
        @type X: (M,...) numpy array

        @param K: number of components
        @type K: int

        @param train: train model
        @type train: bool

        @param axis: component axis in C{X}
        @type axis: int
        """
        if self._axis is not None:
            if axis is not None and axis != self._axis:
                raise ValueError('axis is fixed for {0}'.format(type(self).__name__))
            axis = self._axis
        elif axis is None:
            axis = 0
        self._axis = axis

        N = X.shape[axis]
        self._X = X
        self._dimension = numpy.prod(X.shape) // N

        # initialize with a hard segmentation into K contiguous blocks
        c = numpy.linspace(0, K, N, False).astype(int)
        self._scales = numpy.equal.outer(range(K), c).astype(float)
        self._means = numpy.zeros((K,) + X.shape[1:])
        self.del_cache()

        if train:
            self.em()

    @property
    def K(self):
        """
        Number of components
        @rtype: int
        """
        return len(self.means)

    @property
    def N(self):
        """
        Length of component axis
        @rtype: int
        """
        return self._scales.shape[1]

    @property
    def M(self):
        """
        Number of data points
        @rtype: int
        """
        return len(self._X)

    def del_cache(self):
        """Clear model parameter cache (force recalculation)"""
        self._w = None
        self._sigma = None
        self._delta = None

    @property
    def dimension(self):
        """
        Dimensionality of the mixture domain
        @rtype: int
        """
        return self._dimension

    @property
    def means(self):
        """
        @rtype: (K, ...) numpy array
        """
        return self._means

    @means.setter
    def means(self, value):
        value = numpy.asarray(value)
        if value.shape != self._means.shape:
            raise ValueError('expected shape {0}'.format(self._means.shape))
        self._means = value
        # cached sigma/delta/w are no longer valid
        self.del_cache()

    @property
    def scales(self):
        """
        @rtype: (K, N) numpy array
        """
        return self._scales

    @scales.setter
    def scales(self, value):
        value = numpy.asarray(value)
        if value.shape != self._scales.shape:
            raise ValueError('expected shape {0}'.format(self._scales.shape))
        self._scales = value
        self.del_cache()

    @property
    def w(self):
        """
        Component weights
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._w is None:
            self._w = self.scales.mean(1)
        return self._w

    @property
    def sigma(self):
        """
        Component standard deviations
        @rtype: (K,) numpy array
        """
        if not self.use_cache or self._sigma is None:
            # posterior estimate of the component variances under the
            # inverse-gamma prior (ALPHA_SIGMA, BETA_SIGMA); this body is
            # a reconstruction consistent with C{delta} and C{scales}
            alpha = self.ALPHA_SIGMA + 0.5 * self.dimension * self.scales.sum(1)
            beta = self.BETA_SIGMA + 0.5 * (self.scales * self.delta.T).sum(1)
            self._sigma = numpy.sqrt(beta / alpha).clip(self.MIN_SIGMA)
        return self._sigma

    @property
    def delta(self):
        """
        Squared "distances" between data and components
        @rtype: (N, K) numpy array
        """
        if not self.use_cache or self._delta is None:
            self._delta = numpy.transpose([[d.sum()
                for d in numpy.swapaxes([(self.means[k] - self.datapoint(m, k)) ** 2
                    for m in range(self.M)], 0, self._axis)]
                for k in range(self.K)])
        return self._delta

    @property
    def log_likelihood(self):
        """
        Log-likelihood of the marginalized model (no auxiliary indicator variables)
        @rtype: float
        """
        from csb.numeric import log, log_sum_exp
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = log(self.w) - 0.5 * \
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq))
        return log_sum_exp(log_p.T).sum()

    @property
    def log_likelihood_ext(self):
        """
        Log-likelihood of the extended model (with indicators)
        @rtype: float
        """
        from csb.numeric import log
        from numpy import pi, sum
        n = self.scales.sum(1)
        N = self.dimension
        Z = self.scales.T
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        return sum(n * log(self.w)) - 0.5 * \
                (sum(Z * self.delta / s_sq) + N * sum(n * log(2 * pi * s_sq)) + sum(log(s_sq)))

    def datapoint(self, m, k):
        """
        Training point number C{m} as if it belonged to component C{k}
        @rtype: numpy array
        """
        return self._X[m]

    def estimate_means(self):
        """
        Update means from current model and samples
        """
        n = self.scales.sum(1)
        self.means = numpy.array([numpy.sum([self.scales[k, m] * self.datapoint(m, k)
            for m in range(self.M)], 0) / n[k]
            for k in range(self.K)])

    def estimate_scales(self, beta=1.0):
        """
        Update scales (posterior component memberships) from current model
        and samples

        @param beta: inverse temperature
        @type beta: float
        """
        from csb.numeric import log, log_sum_exp

        # tempered posterior responsibilities; this body is a reconstruction
        # consistent with C{log_likelihood}
        s_sq = (self.sigma ** 2).clip(1e-300, 1e300)
        log_p = beta * (log(self.w) - 0.5 *
                (self.delta / s_sq + self.dimension * log(2 * numpy.pi * s_sq)))
        log_p = log_p.T - log_sum_exp(log_p.T)

        self.scales = numpy.exp(log_p)

    def e_step(self, beta=1.0):
        """
        Expectation step for EM
        @param beta: inverse temperature
        @type beta: float
        """
        self.estimate_scales(beta)

    def m_step(self):
        """
        Maximization step for EM
        """
        self.estimate_means()

    def em(self, n_iter=100, eps=1e-30):
        """
        Expectation maximization

        @param n_iter: maximum number of iteration steps
        @type n_iter: int

        @param eps: log-likelihood convergence criterion
        @type eps: float
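
        Example (hypothetical tolerances):

        >>> mixture.em(n_iter=200, eps=1e-6)
        >>> mixture.log_likelihood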
        """
        LL_prev = -numpy.inf
        for i in range(n_iter):
            self.m_step()
            self.e_step()

            if eps is not None:
                LL = self.log_likelihood
                if abs(LL - LL_prev) < eps:
                    break
                LL_prev = LL

    def anneal(self, betas):
        """
        Deterministic annealing

        @param betas: sequence of inverse temperatures
        @type betas: iterable of floats
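
        Example (the heating schedule shown is an assumption, not part of
        the original interface):

        >>> mixture.anneal(numpy.linspace(0.1, 1.0, 50))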
        """
        for beta in betas:
            self.m_step()
            self.e_step(beta)

    def increment_K(self, train=True):
        """
        Split the component with the largest sigma

        @returns: new instance of mixture with incremented C{K}
        @rtype: L{GaussianMixture} subclass
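
        Example (sketch):

        >>> richer = mixture.increment_K()    # K+1 components, trained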
        """
        i = self.sigma.argmax()

        # duplicate the scales of the broadest component
        Z = numpy.vstack([self.scales, self.scales[i]])

        # and divide them between the old and the new component
        mask = Z[i].cumsum() / Z[i].sum() > 0.5
        Z[i, mask] *= 0.0
        Z[-1, ~mask] *= 0.0

        new = type(self)(self._X, self.K + 1, False, self._axis)
        new.scales = Z
        new.m_step()
        if train:
            new.em()

        return new

    @classmethod
    def series(cls, X, start=1, stop=9):
        """
        Iterator with mixture instances for C{K in range(start, stop)}

        @type X: (M,...) numpy array
        @type start: int
        @type stop: int
        @rtype: generator
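
        Example (sketch):

        >>> for mixture in GaussianMixture.series(X, 1, 5):
        ...     print(mixture.K, mixture.BIC)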
        """
        mixture = cls(X, start)
        yield mixture

        for K in range(start + 1, stop):
            mixture = mixture.increment_K()
            yield mixture

    @classmethod
    def new(cls, X, K=0):
        """
        Factory method with optional C{K}. If C{K=0}, guess the best C{K}
        according to L{BIC<GaussianMixture.BIC>}.

        @param X: multi-dimensional input data with samples along first axis
        @type X: (M,...) numpy array

        @param K: number of components (C{0} triggers the BIC-based guess)
        @type K: int

        @return: Mixture instance
        @rtype: L{GaussianMixture} subclass
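
        Example (sketch):

        >>> mixture = GaussianMixture.new(X)    # K selected via BIC
        >>> mixture.K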
        """
        if K > 0:
            return cls(X, K)

        mixture_it = cls.series(X)
        mixture = next(mixture_it)

        # grow K until the BIC stops improving
        for candidate in mixture_it:
            if candidate.BIC >= mixture.BIC:
                break
            mixture = candidate

        return mixture

    @property
    def BIC(self):
        """
        Bayesian information criterion, calculated as
        BIC = M * ln(sigma_e^2) + K * ln(M)

        @rtype: float
        """
        from numpy import log

        n = self.M
        k = self.K
        error_variance = sum(self.sigma ** 2 * self.w)

        return n * log(error_variance) + k * log(n)

    @property
    def membership(self):
        """
        Membership array
        @rtype: (N,) numpy array
        """
        return self.scales.argmax(0)

    def overlap(self, other):
        """
        Similarity of two mixtures measured in membership overlap

        @param other: Mixture or membership array
        @type other: L{GaussianMixture} or sequence

        @return: segmentation overlap
        @rtype: float in interval [0.0, 1.0]
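
        Example with a hand-made reference segmentation (hypothetical
        labels):

        >>> mixture.overlap([0, 0, 0, 1, 1, 1])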
        """
        if isinstance(other, GaussianMixture):
            other_w = other.membership
            K = min(self.K, other.K)
        elif isinstance(other, (list, tuple, numpy.ndarray)):
            other_w = other
            K = min(self.K, len(set(other)))
        else:
            raise TypeError('other must be a mixture or a membership sequence')

        self_w = self.membership
        if len(self_w) != len(other_w):
            raise ValueError('self.N != other.N')

        # count matches between the K most frequent label pairings
        ww = tuple(zip(self_w, other_w))
        same = sum(sorted(ww.count(i) for i in set(ww))[-K:])

        return float(same) / len(ww)


class AbstractStructureMixture(GaussianMixture):
    """
    Abstract mixture model for protein structure ensembles.
    """
    __metaclass__ = ABCMeta

    def __init__(self, X, K, *args, **kwargs):
        if len(X.shape) != 3 or X.shape[-1] != 3:
            raise ValueError('X must be an array of shape (M,N,3)')

        self._R = numpy.zeros((len(X), K, 3, 3))
        self._t = numpy.zeros((len(X), K, 3))

        super(AbstractStructureMixture, self).__init__(X, K, *args, **kwargs)

    @property
    def R(self):
        """
        Rotation matrices
        @rtype: (M,K,3,3) numpy array
        """
        return self._R

    @property
    def t(self):
        """
        Translation vectors
        @rtype: (M,K,3) numpy array
        """
        return self._t

    def datapoint(self, m, k):
        # structure m mapped into the frame of component k
        return numpy.dot(self._X[m] - self._t[m, k], self._R[m, k])

    def m_step(self):
        # assumption: the maximization step updates the means and then
        # re-estimates the superpositions
        self.estimate_means()
        self.estimate_T()

    @abstractmethod
    def estimate_T(self):
        """
        Estimate superpositions (rotations and translations)
        """
        raise NotImplementedError


class SegmentMixture(AbstractStructureMixture):
    """
    Gaussian mixture model for protein structure ensembles using a set of segments

    If C{X} is the coordinate array of a protein structure ensemble which
    can be decomposed into 2 rigid segments, the segmentation will be found by:

    >>> mixture = SegmentMixture(X, 2)

    The segment membership of each atom is given by:

    >>> mixture.membership
    array([0, 0, 0, ..., 1, 1, 1])
    """
    _axis = 1

    def estimate_T(self):
        from csb.bio.utils import wfit
        for m in range(self.M):
            for k in range(self.K):
                self._R[m, k], self._t[m, k] = wfit(self._X[m], self.means[k], self.scales[k])

    def estimate_means(self):
        # unweighted average over superposed structures: the membership
        # weights already enter through the weighted fits in estimate_T
        self.means = numpy.mean([[self.datapoint(m, k)
            for m in range(self.M)]
            for k in range(self.K)], 1)