Package csb :: Package statistics
[frames] | no frames]

Source Code for Package csb.statistics

  1  """ 
  2  Statistics root package. 
  3   
  4  This package contains a number of common statistical utilities. Sub-packages 
  5  provide more specialized APIs, for example L{csb.statistics.pdf} defines the 
  6  probability density object model. 
  7  """ 
  8   
9 -class Cumulative(object):
10 11 total_mem = 1e8 12
13 - def __init__(self, data):
14 15 self.data = data
16
17 - def __call__(self, x, nchunks=None):
18 19 from numpy import greater, reshape, concatenate 20 21 c = [] 22 x = reshape(x, (-1,)) 23 24 if nchunks is None: 25 26 total_size = len(x) * len(self.data) 27 nchunks = total_size / self.total_mem + int(total_size % self.total_mem != 0) 28 29 size = int(len(x) / nchunks + int(len(x) % nchunks != 0)) 30 31 while len(x): 32 33 y = x[:size] 34 x = x[size:] 35 36 c = concatenate((c, greater.outer(y, self.data).sum(1) / float(len(self.data)))) 37 38 return c
39
40 - def cumulative_density(self, x, nchunks=None):
41 return 1 - self.__call__(x, nchunks)
42
43 -def geometric_mean(x, axis=None):
44 """ 45 @param x: 46 @param axis: compute the geometric mean along this axis 47 48 @return: geometric mean of x 49 """ 50 from numpy import exp, log, clip, mean 51 52 return exp(mean(log(clip(x, 1e-300, 1e300)), axis))
53
54 -def harmonic_mean(x, axis=None):
55 """ 56 @param x: 57 @param axis: compute the harmonic mean along this axis 58 59 @return: harmonic mean of x 60 """ 61 from numpy import mean 62 63 return 1 / mean(1 / x, axis)
64
65 -def kurtosis(x, axis=None):
66 """ 67 @param x: random variables 68 @param axis: compute the kurtosis along this axis 69 70 @return: Sample kurtosis of x 71 """ 72 from numpy import mean, std 73 74 m = x.mean(axis) 75 a = mean((x - m) ** 4, axis) 76 s = std(x, axis) 77 78 return a / s ** 4 - 3
79
80 -def skewness(x, axis=None):
81 """ 82 @param x: random variables 83 @param axis: compute the skewness along this axis 84 85 @return: Sample skewness of x 86 """ 87 from numpy import mean, std 88 89 s = std(x) 90 return mean((x - x.mean()) ** 3, axis) / s ** 3
91
92 -def autocorrelation(x, n):
93 """ 94 auto-correlation of a times series 95 96 @param x: time series 97 @type x: numpy.array 98 @param n: Maximal lag for which to compute the auto-correlation 99 @type n: int 100 """ 101 from numpy import array, mean, std 102 x = x - x.mean() 103 return array([mean(x[i:] * x[:len(x) - i]) for i in range(n)]) / std(x)**2
104
105 -def probabilistic_and(p, axis=0):
106 """ 107 Probabilistic version of AND 108 """ 109 from numpy import array, multiply 110 return multiply.reduce(array(p), axis=axis)
111
112 -def probabilistic_or(p, axis=0):
113 """ 114 Probabilistic version of OR 115 """ 116 from numpy import array 117 return 1 - probabilistic_and(1 - array(p), axis)
118
119 -def probabilistic_xor(p, axis=0):
120 """ 121 Probabilistic version of XOR. 122 Works only for axis=0. 123 """ 124 from numpy import array 125 126 p = array(p) 127 p_not = 1 - p 128 129 P = [] 130 131 for i in range(p.shape[axis]): 132 x = p_not * 1 133 x[i] = p[i] 134 P.append(probabilistic_and(x, 0)) 135 136 return probabilistic_or(P, 0)
137
138 -def principal_coordinates(D, nd=None):
139 """ 140 Reconstruction of a multidimensional configuration that 141 optimally reproduces the input distance matrix. 142 143 See: Gower, J (1966) 144 """ 145 from numpy import clip, sqrt, take, argsort, sort 146 from csb.numeric import reverse 147 from scipy.linalg import eigh 148 149 ## calculate centered similarity matrix 150 151 B = -clip(D, 1e-150, 1e150) ** 2 / 2. 152 153 b = B.mean(0) 154 155 B = B - b 156 B = (B.T - b).T 157 B += b.mean() 158 159 ## calculate spectral decomposition 160 161 v, U = eigh(B) 162 v = v.real 163 U = U.real 164 165 U = take(U, argsort(v), 1) 166 v = sort(v) 167 168 U = reverse(U, 1) 169 v = reverse(v) 170 171 if nd is None: nd = len(v) 172 173 X = U[:, :nd] * sqrt(clip(v[:nd], 0., 1e300)) 174 175 return X
176
177 -def entropy(p):
178 """ 179 Calculate the entropy of p. 180 @return: entropy of p 181 """ 182 from csb.numeric import log 183 from numpy import sum 184 185 return -sum(p * log(p))
186
187 -def histogram2D(x, nbins=100, axes=None, nbatch=1000, normalize=True):
188 """ 189 Non-greedy two-dimensional histogram. 190 191 @param x: input array of rank two 192 @type x: numpy array 193 @param nbins: number of bins 194 @type nbins: integer 195 @param axes: x- and y-axes used for binning the data (if provided this will be used instead of <nbins>) 196 @type axes: tuple of two one-dimensional numpy arrays 197 @param nbatch: size of batch that is used to sort the data into the 2D grid 198 @type nbatch: integer 199 @param normalize: specifies whether histogram should be normalized 200 @type normalize: boolean 201 202 @return: 2-rank array storing histogram, tuple of x- and y-axis 203 """ 204 from numpy import linspace, zeros, argmin, fabs, subtract, transpose 205 206 if axes is None: 207 208 lower, upper = x.min(0), x.max(0) 209 axes = [linspace(lower[i], upper[i], nbins) for i in range(lower.shape[0])] 210 211 H = zeros((len(axes[0]), len(axes[1]))) 212 213 while len(x): 214 215 y = x[:nbatch] 216 x = x[nbatch:] 217 218 I = transpose([argmin(fabs(subtract.outer(y[:, i], axes[i])), 1) for i in range(2)]) 219 220 for i, j in I: H[i, j] += 1 221 222 if normalize: 223 H = H / H.sum() / (axes[0][1] - axes[0][0]) / (axes[1][1] - axes[1][0]) 224 225 return H, axes
226
227 -def histogram_nd(x, nbins=100, axes=None, nbatch=1000, normalize=True):
228 """ 229 Non-greedy n-dimemsional histogram. 230 231 @param x: input array of rank (-1,n) 232 @type x: numpy array 233 @param nbins: number of bins 234 @type nbins: integer 235 @param axes: axes used for binning the data (if provided this will be used instead of <nbins>) 236 @type axes: tuple of two one-dimensional numpy arrays 237 @param nbatch: size of batch that is used to sort the data into the nD grid 238 @type nbatch: integer 239 @param normalize: specifies whether histogram should be normalized 240 @type normalize: boolean 241 242 @return: n-rank array storing histogram, tuple of axes 243 """ 244 import numpy as np 245 246 if len(x.shape) == 1: 247 x = np.reshape(x, (-1,1)) 248 249 d = x.shape[1] 250 251 if axes is None: 252 253 lower, upper = x.min(0), x.max(0) 254 axes = [np.linspace(lower[i], upper[i], nbins) for i in range(d)] 255 256 shape = tuple(map(len, axes)) 257 258 H = np.zeros(shape) 259 ## MH: was like that before... 260 ## s = np.multiply.accumulate(np.array((1,) + H.shape[:-1]))[::-1] 261 s = np.multiply.accumulate(np.array((1,) + H.shape[1:]))[::-1] 262 H = H.flatten() 263 264 while len(x): 265 266 y = x[:nbatch] 267 x = x[nbatch:] 268 269 I = np.transpose([np.argmin(np.fabs(np.subtract.outer(y[:, i], axes[i])), 1) 270 for i in range(d)]) 271 I = np.dot(I, s) 272 I = np.sort(I) 273 274 i = list(set(I.tolist())) 275 n = np.equal.outer(I, i).sum(0) 276 277 H[i] += n 278 279 if normalize: 280 H = H / H.sum() / np.multiply.reduce([axes[i][1] - axes[i][0] for i in range(d)]) 281 282 H = np.reshape(H, shape) 283 284 return H, axes
285
286 -def density(x, nbins, normalize=True):
287 """ 288 Histogram of univariate input data: basically calls numpy's histogram method and 289 does a proper normalization. 290 291 @param x: input numpy array 292 @param nbins: number of bins 293 @type nbins: integer 294 @param normalize: if true, histogram will be normalized 295 """ 296 from numpy import histogram 297 298 hy, hx = histogram(x, nbins) 299 hx = 0.5 * (hx[1:] + hx[:-1]) 300 hy = hy.astype('d') 301 if normalize: 302 hy /= (hx[1] - hx[0]) * hy.sum() 303 304 return hx, hy
305
306 -def circvar(a, axis=None):
307 """ 308 Calculate circular variance of a circular variable. 309 310 @param a: input array 311 @param axis: axis along which mean is calculated 312 @type axis: None or integer 313 """ 314 from numpy import average, cos, sin 315 return 1 - average(cos(a), axis) ** 2 - average(sin(a), axis) ** 2
316
317 -def circmean(a, axis=None):
318 """ 319 Estimate mean of a circular variable 320 321 @param a: input array 322 @param axis: axis along which mean is calculated 323 @type axis: None or integer 324 """ 325 from numpy import sin, cos, arctan2, average 326 return arctan2(average(sin(a), axis), average(cos(a), axis))
327
328 -def running_average(x, w, axis=None):
329 """ 330 Calculates a running average for given window size 331 332 @param x: input array 333 @param w: window size 334 @type w: integer 335 @param axis: axis along which mean is calculated 336 """ 337 from numpy import array, mean 338 return array([mean(x[i:i + w], axis) for i in range(len(x) - w)])
339
340 -def weighted_median(x, w):
341 """ 342 Calculates the weighted median, that is the minimizer of 343 argmin {\sum w_i |x_i - \mu|} 344 345 @param x: input array 346 @param w: array of weights 347 """ 348 from numpy import sum, add, argsort, sort 349 350 w = w / w.sum() 351 w = w[argsort(x)] 352 x = sort(x) 353 j = sum(add.accumulate(w) < 0.5) 354 355 return x[j]
356