import scipy.stats.distributions as distributions
import numpy as np
import scipy as sp
import scipy.optimize  # needed so that sp.optimize.fmin (used in fitWeibull) resolves
import copy
import pyvision as pv  # pv.Plot / pv.ImageLog are used for plotting in fitWeibull


def pbinom(q,size,prob):
    '''Binomial probabilities - measured from the left (the CDF).'''
    dist = distributions.binom(size,prob)
    return dist.cdf(q)


def qbinom(p,size,prob):
    '''Binomial quantiles: the smallest q such that pbinom(q,size,prob) >= p.'''
    # Bisection search over the integer support [0, size].
    minq = 0
    maxq = size
    minp = pbinom(minq,size,prob)
    maxp = pbinom(maxq,size,prob)

    if minp > p: return minq
    if maxp < p: return maxq

    while maxq > minq+1:
        newq = (minq+maxq)//2  # integer midpoint
        newp = pbinom(newq,size,prob)

        if p > newp:
            minp = newp
            minq = newq
        else:
            maxp = newp
            maxq = newq

    return maxq


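# Usage sketch (values taken from the unit tests at the bottom of this file):
#
#   pbinom(50, 100, 0.5)    # -> ~0.5397946, P(X <= 50) for X ~ Binomial(100, 0.5)
#   qbinom(0.5, 100, 0.5)   # -> 50, smallest q with pbinom(q, 100, 0.5) >= 0.5

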
def cibinom(size,success,alpha=0.05):
    '''Exact (Clopper-Pearson style) confidence interval for a binomial proportion, found by bisection.'''
    goal = 0.5*alpha

    # Upper bound: find the largest probability for which observing
    # 'success' or fewer successes still has probability at least alpha/2.
    lower_prob = 0.0
    upper_prob = 1.0

    for _ in range(32):
        new_prob = (lower_prob + upper_prob)*0.5
        new_p = pbinom(success,size,new_prob)

        if new_p < goal:
            upper_prob = new_prob
        else:
            lower_prob = new_prob

    ub = upper_prob

    # Lower bound: repeat the search on the flipped problem
    # (counting failures instead of successes) and reflect the result.
    success = size - success
    lower_prob = 0.0
    upper_prob = 1.0

    for _ in range(64):
        new_prob = (lower_prob + upper_prob)*0.5
        new_p = pbinom(success,size,new_prob)

        if new_p < goal:
            upper_prob = new_prob
        else:
            lower_prob = new_prob

    lb = 1.0 - upper_prob

    return (lb,ub)


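# Usage sketch (numbers match the unit tests below): the default alpha=0.05
# gives a 95% interval, here for 50 successes out of 100 trials.
#
#   lb, ub = cibinom(100, 50)
#   round(lb, 4), round(ub, 4)   # -> (0.3983, 0.6017)

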
def mcnemar_test(sf,fs):
    '''
    From Zhao and Chellappa, "Face Processing", Chapter 3.

    Compares the output of two algorithms on the same set of trials.
    Notice that trials where they both succeeded or both failed are ignored.
    Output: the two-sided p-value for McNemar's Exact Test. For a one-sided
    test, divide by two.

    If sf+fs is large you may want to use the approximate (chi-squared) test instead.

    Here is an example on a simple classifier. The problem is classifying
    images of bananas, apples, and oranges. Two algorithms are compared by
    running them on the same set of 9 test images. Here are the outcomes.

    |---|--------|--------|--------|-----------|-----------|
    |   | truth  | Alg A  | Alg B  | Success A | Success B |
    |---|--------|--------|--------|-----------|-----------|
    | 1 | banana | banana | banana |     T     |     T     |
    | 2 | apple  | apple  | banana |     T     |     F     |
    | 3 | orange | apple  | orange |     F     |     T     |
    | 4 | orange | apple  | apple  |     F     |     F     |
    | 5 | apple  | apple  | apple  |     T     |     T     |
    | 6 | banana | banana | banana |     T     |     T     |
    | 7 | apple  | apple  | banana |     T     |     F     |
    | 8 | orange | orange | apple  |     T     |     F     |
    | 9 | banana | banana | banana |     T     |     T     |
    |---|--------|--------|--------|-----------|-----------|

    Now you can count the number of times both algorithms succeed, both
    algorithms fail, A succeeds and B fails, and A fails and B succeeds.

    |-------|-----|-----|-------|
    |       | A=T | A=F | Total |
    |-------|-----|-----|-------|
    | B=T   |  4  |  1  |   5   |
    | B=F   |  3  |  1  |   4   |
    |-------|-----|-----|-------|
    | Total |  7  |  2  |   9   |
    |-------|-----|-----|-------|

    From this table you can compute success rates (A=T)/Total...

    > 7.0/9.0 # A Success Rate
    0.77777777777777779

    > 5.0/9.0 # B Success Rate
    0.55555555555555558

    The inputs to McNemar's Test are SF (A=T,B=F) = 3 and FS (A=F,B=T) = 1.

    > mcnemar_test(3,1) # Two-sided p-value
    0.625

    @param sf: the number of trials where algorithm A succeeded and algorithm B failed.
    @param fs: the number of trials where algorithm A failed and algorithm B succeeded.
    '''
    def factorial(n):
        if n <= 1: return 1
        else: return n*factorial(n-1)

    low = min(sf,fs)
    high = max(sf,fs)
    n = low + high

    # Under the null hypothesis the discordant trials are Binomial(n, 0.5);
    # sum both tails of that distribution.
    pvalue = 0.0
    for i in range(0,low+1):
        pvalue += (0.5**n)*factorial(n)/float(factorial(i)*factorial(n-i))

    for i in range(high,n+1):
        pvalue += (0.5**n)*factorial(n)/float(factorial(i)*factorial(n-i))

    # When sf == fs the two tails overlap completely; cap the result at 1.0.
    return min(pvalue,1.0)


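# Equivalent formulation (a sketch added for clarity): under the null
# hypothesis the discordant trials are Binomial(sf+fs, 0.5), so the same
# two-sided p-value can be computed from pbinom() above, e.g. for sf=3, fs=1:
#
#   pbinom(1, 4, 0.5) + (1.0 - pbinom(2, 4, 0.5))   # -> 0.625, same as mcnemar_test(3, 1)

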
def pdfWeibull(x,shape,scale):
    '''Weibull probability density function.'''
    x = np.asarray(x,dtype=np.float64)
    return (shape/scale)*(x/scale)**(shape-1.0)*np.exp(-(x/scale)**shape)


def cdfWeibull(x,shape,scale):
    '''Weibull cumulative distribution function.'''
    x = np.asarray(x,dtype=np.float64)
    return 1.0 - np.exp(-(x/scale)**shape)


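# Property check (illustrative): for any shape parameter the Weibull CDF
# equals 1 - exp(-1) ~ 0.6321 at x equal to the scale parameter.
#
#   cdfWeibull(2.0, 3.7, 2.0)   # -> ~0.6321 for shape=3.7, scale=2.0

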
def fitWeibull(x,ilog=None):
    '''
    Empirically fit a Weibull distribution to x.

    @param x: a list containing the data points.
    @type x: a list of floats
    @param ilog: an image log to save fit information.
    @type ilog: pv.ImageLog
    @returns: (shape,scale) - the Weibull parameters (k, lambda)
    '''
    x = np.array(x)
    assert x.min() >= 0.0

    n = len(x)

    def nll(params):
        '''Negative log likelihood'''
        shape,scale = params

        mask = x > 0
        pdf = pdfWeibull(x, shape, scale)

        ll = np.log(pdf[mask]).sum()
        # Zeros have no density under a continuous model; penalize them
        # with a small constant probability instead.
        t1 = (~mask).sum()
        ll += t1*np.log(0.000001)

        return -ll/len(x)

    # Minimize the negative log likelihood starting from shape=1 (exponential)
    # and scale equal to the sample mean.
    tmp = sp.optimize.fmin(nll,[1.0,np.mean(x)],disp=0)
    shape,scale = tmp

    if ilog is not None:
        # Plot the empirical CDF against the fitted Weibull CDF.
        order = x.argsort()
        x = x[order]
        del order

        points = [[0,0]] + [ [x[i],float(i)/n] for i in range(n)]
        plot = pv.Plot(title="Weibull CDF")
        plot.points(points)

        lines = [[0,0]] + [ [x[i],cdfWeibull(x[i], shape, scale)] for i in range(n)]
        plot.lines(lines)
        ilog(plot,"WeibullCDF")

        # Plot the fitted density against a five-bin histogram of the data.
        plot = pv.Plot(title="Weibull PDF")
        y = pdfWeibull(x, shape, scale)
        points = np.array([x,y]).reshape(2,n).T
        plot.lines(points,color='red',width=3)

        hist, bins = np.histogram(x, 5, density=True)

        t1 = 0.5*(bins[:-1] + bins[1:])
        points = np.array([t1,hist]).reshape(2,5).T
        plot.lines(points,color='blue',width=3)
        ilog(plot,"WeibullPDF")

    return shape,scale


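# Usage sketch (the expected estimates come from the unit tests at the bottom
# of this file and depend on the optimizer's convergence):
#
#   shape, scale = fitWeibull(samples)   # samples: a list of non-negative floats
#   # e.g. the first Weibull test set below gives shape ~ 1.108, scale ~ 1.021

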
def cov(x,y=None):
    '''
    A function that simply computes the covariance: Cov(X,Y). Data points
    should be stored in rows. This function should have similar conventions
    and results as the R function of the same name.

    @param x: a matrix containing data.
    @type x: np.array
    @param y: an optional second matrix. By default y = x.
    @type y: np.array
    '''
    if y is None:
        y = x
    x = copy.deepcopy(x)
    y = copy.deepcopy(y)
    nx,dx = x.shape
    n,dy = y.shape

    assert nx == n

    # Center each column, then form the (dx x dy) cross-covariance matrix.
    mx = x.mean(axis=0).reshape(1,dx)
    my = y.mean(axis=0).reshape(1,dy)
    x -= mx
    y -= my
    s = 1.0/(n-1)
    return s*np.dot(x.T,y)


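# Consistency check (a sketch, not part of the original file): with data
# points in rows, cov() should agree with numpy's column-variable convention.
#
#   x = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 7.0]])
#   np.allclose(cov(x), np.cov(x, rowvar=False))   # -> True

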
def cor(x,y=None):
    '''
    A function that simply computes the correlation matrix: Corr(X,Y). Data points
    should be stored in rows. This function should have similar conventions
    and results as the R function of the same name.

    @param x: a matrix containing data.
    @type x: np.array
    @param y: an optional second matrix. By default y = x.
    @type y: np.array
    '''
    if y is None:
        y = x
    x = copy.deepcopy(x)
    y = copy.deepcopy(y)
    nx,dx = x.shape
    n,dy = y.shape

    assert nx == n

    # Standardize each column (zero mean, unit sample variance); the
    # correlation matrix is the cross-covariance of the standardized data.
    mx = x.mean(axis=0).reshape(1,dx)
    my = y.mean(axis=0).reshape(1,dy)
    sx = x.std(axis=0,ddof=1).reshape(1,dx)
    sy = y.std(axis=0,ddof=1).reshape(1,dy)
    x = (x-mx)/sx
    y = (y-my)/sy
    s = 1.0/(n-1)
    return s*np.dot(x.T,y)


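# Consistency check (a sketch): cor() should agree with numpy's correlation
# of column variables.
#
#   x = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 7.0]])
#   np.allclose(cor(x), np.corrcoef(x, rowvar=False))   # -> True

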
def cov2cor(v):
    '''
    Converts a symmetric positive definite matrix to a correlation matrix by
    normalizing by the diagonal.  Note: the matrix is modified in place and
    also returned.
    '''
    r,c = v.shape
    assert r == c
    # Scale row i and column j by 1/sqrt(v[i,i]) and 1/sqrt(v[j,j]).
    s = 1.0/np.sqrt(np.diag(v))
    v *= s.reshape(r,1)
    v *= s.reshape(1,c)
    return v


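# Consistency check (a sketch): normalizing the covariance matrix should
# reproduce the correlation matrix computed directly by cor().
#
#   x = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 7.0]])
#   np.allclose(cov2cor(cov(x)), cor(x))   # -> True

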
class SummaryStats:

    def __init__(self, x, name="NO_NAME", alpha=0.05):
        '''
        Computes some basic descriptive statistics for the data set x.

        Assumes x is approximately normally distributed; the confidence
        intervals are based on the t distribution.
        '''
        self.name = name

        x = np.array(x,'d')

        self.alpha = alpha
        self.n = len(x)
        self.mean = x.mean()
        self.var = ((x-self.mean)*(x-self.mean)).sum()/(self.n-1)
        self.std = np.sqrt(self.var)
        self.ste = self.std/np.sqrt(self.n)
        self.df = self.n - 1

        # Confidence interval for the mean (uses the standard error).
        tci = distributions.t.ppf(1-alpha/2,self.df)
        lcim = self.mean-tci*self.ste
        ucim = self.mean+tci*self.ste
        self.mean_ci = [lcim,ucim]

        # Interval for the data themselves (uses the standard deviation).
        tci = distributions.t.ppf(1-alpha/2,self.df)
        lci = self.mean-tci*self.std
        uci = self.mean+tci*self.std
        self.ci = [lci,uci]

        self.median = np.median(x)

    def __str__(self):
        '''Renders the summary as a formatted text table.'''
        data = self.asTable()
        title = "Summary of %s"%self.name

        # First pass: format the values and find the column widths.
        key_length = 0
        value_length = 0

        formatted_data = []

        for key,value in data:
            key_length = max(key_length,len(key))

            if isinstance(value,int):
                value = "%d "%value
            elif isinstance(value,str):
                value = "%s "%value
            elif isinstance(value,float):
                if value < 0.0001 and value > 0.0:
                    value = "< 0.0001"
                else:
                    value = "%0.5f"%value
            else:
                raise TypeError("Only int, float, and str are supported")

            value_length = max(value_length,len(value))
            formatted_data.append([key,value])

        # Second pass: render the rows using the computed widths.
        row_format = "| %%-%ds | %%%ds |"%(key_length,value_length)
        row_length = 7+key_length+value_length

        rows = []

        rows.append(title)
        rows.append('-'*row_length)
        for key,value in formatted_data:
            rows.append(row_format%(key,value))
        rows.append('-'*row_length)

        result = ""
        for each in rows:
            result += each
            result += '\n'

        return result

    def asTable(self):
        '''Returns the summary as a list of [label, value] pairs.'''
        data = []
        data.append(['N',self.n])
        data.append(['Mean',self.mean])
        if self.alpha == 0.01:
            data.append(['Mean 99% CI LB',self.mean_ci[0]])
            data.append(['Mean 99% CI UB',self.mean_ci[1]])
        elif self.alpha == 0.05:
            data.append(['Mean 95% CI LB',self.mean_ci[0]])
            data.append(['Mean 95% CI UB',self.mean_ci[1]])
        elif self.alpha == 0.1:
            data.append(['Mean 90% CI LB',self.mean_ci[0]])
            data.append(['Mean 90% CI UB',self.mean_ci[1]])
        elif self.alpha == 0.25:
            data.append(['Mean 75% CI LB',self.mean_ci[0]])
            data.append(['Mean 75% CI UB',self.mean_ci[1]])
        else:
            data.append(['Mean %0.4f CI LB'%(1.-self.alpha,),self.mean_ci[0]])
            data.append(['Mean %0.4f CI UB'%(1.-self.alpha,),self.mean_ci[1]])
        data.append(['Var',self.var])
        data.append(['Std Dev',self.std])
        data.append(['Std Error',self.ste])
        if self.alpha == 0.01:
            data.append(['99% CI LB',self.ci[0]])
            data.append(['99% CI UB',self.ci[1]])
        elif self.alpha == 0.05:
            data.append(['95% CI LB',self.ci[0]])
            data.append(['95% CI UB',self.ci[1]])
        elif self.alpha == 0.1:
            data.append(['90% CI LB',self.ci[0]])
            data.append(['90% CI UB',self.ci[1]])
        elif self.alpha == 0.25:
            data.append(['75% CI LB',self.ci[0]])
            data.append(['75% CI UB',self.ci[1]])
        else:
            data.append(['%0.4f CI LB'%(1.-self.alpha,),self.ci[0]])
            data.append(['%0.4f CI UB'%(1.-self.alpha,),self.ci[1]])

        data.append(['Median',self.median])

        return data


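# Usage sketch (illustrative): a SummaryStats object is printable; __str__
# renders the rows produced by asTable() as a text table.
#
#   stats = SummaryStats([1.0, 2.0, 3.0, 4.0], name="toy")
#   stats.n            # -> 4
#   print(stats)       # prints the "Summary of toy" table

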
import unittest
import pyvision as pv


class _TestStats(unittest.TestCase):
    # Unit tests (the test-case class and method names were reconstructed;
    # the data sets and expected values are original).

    def setUp(self):
        self.normal_data = [1.3139, 5.2441, 0.0756, 4.4679, 2.3845,
                            2.9330, 2.9803, 2.3844, 0.7643, -2.2058,
                            1.9057, -0.1609, 4.4459, -0.0158, 5.9733,
                            2.8994, 0.2282, 1.0099, 2.8802, 2.3120,
                            1.8388, -2.1818, -0.3264, -0.0711, 4.8463,
                            0.6059, 6.1048, 1.7795, 1.2986, 5.4349,
                            2.2219, 3.0162, -1.6250, 2.8928, -6.7314,
                            2.5222, 2.2261, 3.3774, 2.7479, 2.7690,
                            4.6934, 3.0834, 8.9465, 5.5662, 5.1551,
                            -1.6149, -1.2087, 1.8739, 7.6589, 4.9503]

        self.longley = [[83.0, 234.289, 235.6, 159.0, 107.608, 1947, 60.323,],
                        [88.5, 259.426, 232.5, 145.6, 108.632, 1948, 61.122,],
                        [88.2, 258.054, 368.2, 161.6, 109.773, 1949, 60.171,],
                        [89.5, 284.599, 335.1, 165.0, 110.929, 1950, 61.187,],
                        [96.2, 328.975, 209.9, 309.9, 112.075, 1951, 63.221,],
                        [98.1, 346.999, 193.2, 359.4, 113.270, 1952, 63.639,],
                        [99.0, 365.385, 187.0, 354.7, 115.094, 1953, 64.989,],
                        [100.0, 363.112, 357.8, 335.0, 116.219, 1954, 63.761,],
                        [101.2, 397.469, 290.4, 304.8, 117.388, 1955, 66.019,],
                        [104.6, 419.180, 282.2, 285.7, 118.734, 1956, 67.857,],
                        [108.4, 442.769, 293.6, 279.8, 120.445, 1957, 68.169,],
                        [110.8, 444.546, 468.1, 263.7, 121.950, 1958, 66.513,],
                        [112.6, 482.704, 381.3, 255.2, 123.366, 1959, 68.655,],
                        [114.2, 502.601, 393.1, 251.4, 125.368, 1960, 69.564,],
                        [115.7, 518.173, 480.6, 257.2, 127.852, 1961, 69.331,],
                        [116.9, 554.894, 400.7, 282.7, 130.081, 1962, 70.551,],]

        self.longley = np.array(self.longley)

    def test_summarystats(self):
        stats = SummaryStats(self.normal_data, name="Normal Data")

        self.assertEqual(stats.n, 50)
        self.assertAlmostEqual(stats.mean, 2.273416, places=6)
        self.assertAlmostEqual(stats.var, 7.739672, places=6)
        self.assertAlmostEqual(stats.std, 2.782027, places=6)
        self.assertAlmostEqual(stats.ste, 0.393438, places=6)
        self.assertAlmostEqual(stats.ci[0], -3.317276, places=6)
        self.assertAlmostEqual(stats.ci[1], 7.864108, places=6)
        self.assertAlmostEqual(stats.mean_ci[0], 1.482773, places=6)
        self.assertAlmostEqual(stats.mean_ci[1], 3.064059, places=6)
        self.assertAlmostEqual(stats.median, 2.38445, places=5)

    def test_pbinom(self):
        self.assertAlmostEqual(pbinom(50,100,0.5), 0.5397946)
        self.assertAlmostEqual(pbinom(25,100,0.3), 0.1631301)
        self.assertAlmostEqual(pbinom(20,100,0.1), 0.9991924)
        self.assertAlmostEqual(pbinom(8,100,0.05), 0.9369104)
        self.assertAlmostEqual(pbinom(0,100,0.01), 0.3660323)
        self.assertAlmostEqual(pbinom(100,100,0.98), 1.0000000)

    def test_qbinom(self):
        self.assertEqual(qbinom(0.5,100,0.5), 50)
        self.assertEqual(qbinom(0.1,100,0.01), 0)
        self.assertEqual(qbinom(1.0,100,0.9), 100)
        self.assertEqual(qbinom(0.2,100,0.4), 36)
        self.assertEqual(qbinom(0.4,100,0.85), 84)

    def test_cibinom(self):
        self.assertAlmostEqual(cibinom(100,50)[0], 0.3983, places=4)
        self.assertAlmostEqual(cibinom(100,50)[1], 0.6017, places=4)
        self.assertAlmostEqual(cibinom(1000,50)[0], 0.0373, places=4)
        self.assertAlmostEqual(cibinom(1000,50)[1], 0.0654, places=4)
        self.assertAlmostEqual(cibinom(10000,50)[0], 0.0037, places=4)
        self.assertAlmostEqual(cibinom(10000,50)[1], 0.0066, places=4)
        self.assertAlmostEqual(cibinom(1,1)[0], 0.0250, places=4)
        self.assertAlmostEqual(cibinom(1,1)[1], 1.0000, places=4)
        self.assertAlmostEqual(cibinom(10,10)[0], 0.6915, places=4)
        self.assertAlmostEqual(cibinom(10,10)[1], 1.0000, places=4)
        self.assertAlmostEqual(cibinom(100,100)[0], 0.9638, places=4)
        self.assertAlmostEqual(cibinom(100,100)[1], 1.0000, places=4)
        self.assertAlmostEqual(cibinom(1,0)[0], 0.0000, places=4)
        self.assertAlmostEqual(cibinom(1,0)[1], 0.9750, places=4)
        self.assertAlmostEqual(cibinom(10,0)[0], 0.0000, places=4)
        self.assertAlmostEqual(cibinom(10,0)[1], 0.3085, places=4)
        self.assertAlmostEqual(cibinom(100,0)[0], 0.0000, places=4)
        self.assertAlmostEqual(cibinom(100,0)[1], 0.0362, places=4)

    def test_mcnemar(self):
        self.assertAlmostEqual(mcnemar_test(16,24), 0.268, places=3)
        self.assertAlmostEqual(mcnemar_test(64,96), 0.014, places=3)

    def test_fitWeibull(self):
        ilog = None
        # To save the fit plots, pass a pv.ImageLog instance as ilog instead.

        data = [0.39764089, 0.60824086, 0.40285732, 1.54531775, 1.73364323, 1.23747338, 1.12446222, 3.15989785, 0.22271289,
                1.28213280, 1.68005746, 0.58658749, 0.83938237, 1.25577118, 0.64729513, 1.92565971, 0.36610902, 0.10363669,
                0.15618127, 0.02262031, 0.25985175, 0.14230431, 1.54069502, 1.06272791, 0.05364079, 0.93874689, 1.01770360,
                0.40204781, 0.40660520, 0.12017453, 0.73480365, 3.73042281, 1.37838373, 0.17739429, 1.21166837, 3.79022634,
                0.91822186, 1.07417484, 0.37926781, 0.66128749,]
        shape,scale = fitWeibull(data, ilog=ilog)
        self.assertAlmostEqual(shape, 1.1082557, places=4)
        self.assertAlmostEqual(scale, 1.0212356, places=4)

        data = [9.0668007, 7.5244193, 1.3643692, 2.4980839, 3.8229886, 0.7847899, 10.2635502, 6.4853731, 4.1691479,
                4.7222325, 3.6751391, 10.5038682, 1.8489645, 5.5697636, 10.3385587, 1.8399665, 7.8512893, 1.6301032,
                7.1892784, 3.4151212, 2.1018280, 3.0128155, 5.4290304, 3.9759659, 6.4867134, 4.8687895, 1.2671571,
                6.4746843, 3.6922549, 3.6133898, 5.8451979, 5.5435995, 4.2617657, 3.3490959, 6.3412869, 1.3440581,
                2.7830355, 2.1482365, 2.5091446, 9.5137472]
        shape,scale = fitWeibull(data, ilog=ilog)
        self.assertAlmostEqual(shape, 1.8456678, places=4)
        self.assertAlmostEqual(scale, 5.3412324, places=4)

        data = [4.368635e-02, 1.716870e-01, 5.265532e-01, 2.387941e-04, 1.836984e-01, 4.835876e-01, 2.159292e-03, 1.060331e+00,
                1.945628e-02, 4.110887e-01, 1.257612e-01, 2.911412e-02, 6.198067e-01, 5.143289e-01, 1.047416e-01, 3.997763e-01,
                4.596470e-07, 7.417249e-03, 5.209768e-03, 4.370919e-03, 3.816381e-01, 8.640891e-03, 4.125977e-02, 2.129932e-02,
                6.916213e-03, 1.037448e-01, 1.946721e-02, 1.445826e-01, 9.911569e-01, 2.074493e-01, 2.726630e-03, 3.030224e-02,
                1.991381e+00, 1.616899e-01, 1.251923e+00, 4.915620e-01, 1.826906e-01, 8.091978e-04, 7.905816e-03, 5.381265e-02]
        shape,scale = fitWeibull(data, ilog=ilog)
        self.assertAlmostEqual(shape, 0.51119109, places=4)
        self.assertAlmostEqual(scale, 0.15523840, places=4)

        if ilog is not None:
            ilog.show()
