1 """
2 Statistics root package.
3
4 This package contains a number of common statistical utilities. Sub-packages
5 provide more specialized APIs, for example L{csb.statistics.pdf} defines the
6 probability density object model.
7 """
8
10
11 total_mem = 1e8
12
16
18
19 from numpy import greater, reshape, concatenate
20
21 c = []
22 x = reshape(x, (-1,))
23
24 if nchunks is None:
25
26 total_size = len(x) * len(self.data)
27 nchunks = total_size / self.total_mem + int(total_size % self.total_mem != 0)
28
29 size = int(len(x) / nchunks + int(len(x) % nchunks != 0))
30
31 while len(x):
32
33 y = x[:size]
34 x = x[size:]
35
36 c = concatenate((c, greater.outer(y, self.data).sum(1) / float(len(self.data))))
37
38 return c
39
42
44 """
45 @param x:
46 @param axis: compute the geometric mean along this axis
47
48 @return: geometric mean of x
49 """
50 from numpy import exp, log, clip, mean
51
52 return exp(mean(log(clip(x, 1e-300, 1e300)), axis))
53
55 """
56 @param x:
57 @param axis: compute the harmonic mean along this axis
58
59 @return: harmonic mean of x
60 """
61 from numpy import mean
62
63 return 1 / mean(1 / x, axis)
64
66 """
67 @param x: random variables
68 @param axis: compute the kurtosis along this axis
69
70 @return: Sample kurtosis of x
71 """
72 from numpy import mean, std
73
74 m = x.mean(axis)
75 a = mean((x - m) ** 4, axis)
76 s = std(x, axis)
77
78 return a / s ** 4 - 3
79
81 """
82 @param x: random variables
83 @param axis: compute the skewness along this axis
84
85 @return: Sample skewness of x
86 """
87 from numpy import mean, std
88
89 s = std(x)
90 return mean((x - x.mean()) ** 3, axis) / s ** 3
91
93 """
94 auto-correlation of a times series
95
96 @param x: time series
97 @type x: numpy.array
98 @param n: Maximal lag for which to compute the auto-correlation
99 @type n: int
100 """
101 from numpy import array, mean, std
102 x = x - x.mean()
103 return array([mean(x[i:] * x[:len(x) - i]) for i in range(n)]) / std(x)**2
104
106 """
107 Probabilistic version of AND
108 """
109 from numpy import array, multiply
110 return multiply.reduce(array(p), axis=axis)
111
118
137
139 """
140 Reconstruction of a multidimensional configuration that
141 optimally reproduces the input distance matrix.
142
143 See: Gower, J (1966)
144 """
145 from numpy import clip, sqrt, take, argsort, sort
146 from csb.numeric import reverse
147 from scipy.linalg import eigh
148
149
150
151 B = -clip(D, 1e-150, 1e150) ** 2 / 2.
152
153 b = B.mean(0)
154
155 B = B - b
156 B = (B.T - b).T
157 B += b.mean()
158
159
160
161 v, U = eigh(B)
162 v = v.real
163 U = U.real
164
165 U = take(U, argsort(v), 1)
166 v = sort(v)
167
168 U = reverse(U, 1)
169 v = reverse(v)
170
171 if nd is None: nd = len(v)
172
173 X = U[:, :nd] * sqrt(clip(v[:nd], 0., 1e300))
174
175 return X
176
178 """
179 Calculate the entropy of p.
180 @return: entropy of p
181 """
182 from csb.numeric import log
183 from numpy import sum
184
185 return -sum(p * log(p))
186
187 -def histogram2D(x, nbins=100, axes=None, nbatch=1000, normalize=True):
188 """
189 Non-greedy two-dimensional histogram.
190
191 @param x: input array of rank two
192 @type x: numpy array
193 @param nbins: number of bins
194 @type nbins: integer
195 @param axes: x- and y-axes used for binning the data (if provided this will be used instead of <nbins>)
196 @type axes: tuple of two one-dimensional numpy arrays
197 @param nbatch: size of batch that is used to sort the data into the 2D grid
198 @type nbatch: integer
199 @param normalize: specifies whether histogram should be normalized
200 @type normalize: boolean
201
202 @return: 2-rank array storing histogram, tuple of x- and y-axis
203 """
204 from numpy import linspace, zeros, argmin, fabs, subtract, transpose
205
206 if axes is None:
207
208 lower, upper = x.min(0), x.max(0)
209 axes = [linspace(lower[i], upper[i], nbins) for i in range(lower.shape[0])]
210
211 H = zeros((len(axes[0]), len(axes[1])))
212
213 while len(x):
214
215 y = x[:nbatch]
216 x = x[nbatch:]
217
218 I = transpose([argmin(fabs(subtract.outer(y[:, i], axes[i])), 1) for i in range(2)])
219
220 for i, j in I: H[i, j] += 1
221
222 if normalize:
223 H = H / H.sum() / (axes[0][1] - axes[0][0]) / (axes[1][1] - axes[1][0])
224
225 return H, axes
226
227 -def histogram_nd(x, nbins=100, axes=None, nbatch=1000, normalize=True):
228 """
229 Non-greedy n-dimemsional histogram.
230
231 @param x: input array of rank (-1,n)
232 @type x: numpy array
233 @param nbins: number of bins
234 @type nbins: integer
235 @param axes: axes used for binning the data (if provided this will be used instead of <nbins>)
236 @type axes: tuple of two one-dimensional numpy arrays
237 @param nbatch: size of batch that is used to sort the data into the nD grid
238 @type nbatch: integer
239 @param normalize: specifies whether histogram should be normalized
240 @type normalize: boolean
241
242 @return: n-rank array storing histogram, tuple of axes
243 """
244 import numpy as np
245
246 if len(x.shape) == 1:
247 x = np.reshape(x, (-1,1))
248
249 d = x.shape[1]
250
251 if axes is None:
252
253 lower, upper = x.min(0), x.max(0)
254 axes = [np.linspace(lower[i], upper[i], nbins) for i in range(d)]
255
256 shape = tuple(map(len, axes))
257
258 H = np.zeros(shape)
259
260
261 s = np.multiply.accumulate(np.array((1,) + H.shape[1:]))[::-1]
262 H = H.flatten()
263
264 while len(x):
265
266 y = x[:nbatch]
267 x = x[nbatch:]
268
269 I = np.transpose([np.argmin(np.fabs(np.subtract.outer(y[:, i], axes[i])), 1)
270 for i in range(d)])
271 I = np.dot(I, s)
272 I = np.sort(I)
273
274 i = list(set(I.tolist()))
275 n = np.equal.outer(I, i).sum(0)
276
277 H[i] += n
278
279 if normalize:
280 H = H / H.sum() / np.multiply.reduce([axes[i][1] - axes[i][0] for i in range(d)])
281
282 H = np.reshape(H, shape)
283
284 return H, axes
285
286 -def density(x, nbins, normalize=True):
287 """
288 Histogram of univariate input data: basically calls numpy's histogram method and
289 does a proper normalization.
290
291 @param x: input numpy array
292 @param nbins: number of bins
293 @type nbins: integer
294 @param normalize: if true, histogram will be normalized
295 """
296 from numpy import histogram
297
298 hy, hx = histogram(x, nbins)
299 hx = 0.5 * (hx[1:] + hx[:-1])
300 hy = hy.astype('d')
301 if normalize:
302 hy /= (hx[1] - hx[0]) * hy.sum()
303
304 return hx, hy
305
307 """
308 Calculate circular variance of a circular variable.
309
310 @param a: input array
311 @param axis: axis along which mean is calculated
312 @type axis: None or integer
313 """
314 from numpy import average, cos, sin
315 return 1 - average(cos(a), axis) ** 2 - average(sin(a), axis) ** 2
316
318 """
319 Estimate mean of a circular variable
320
321 @param a: input array
322 @param axis: axis along which mean is calculated
323 @type axis: None or integer
324 """
325 from numpy import sin, cos, arctan2, average
326 return arctan2(average(sin(a), axis), average(cos(a), axis))
327
329 """
330 Calculates a running average for given window size
331
332 @param x: input array
333 @param w: window size
334 @type w: integer
335 @param axis: axis along which mean is calculated
336 """
337 from numpy import array, mean
338 return array([mean(x[i:i + w], axis) for i in range(len(x) - w)])
339
356