1 '''
2 Created on Nov 23, 2010
3
4 @author: bolme
5 '''
6
7 import numpy as np
8 import itertools as it
9 import copy
10 import pyvision as pv
11 from libsvm import Classifier,Regression
12 import time
13
15
17 ''' Initializing a cross validation algorithm. '''
18 self.n_folds=n_folds
19 self.best_score = None
20 self.best_tuning=None
21 self.training_data = None
22 self.training_labels = None
23 self.folds = None
24 self.options = []
25 self.tuning_data = None
26 self.classification = True
27
28 self._alg_class = None
29
30
32 '''
33
34 '''
35 assert issubclass(alg_class,Classifier) or issubclass(alg_class,Regression)
36 self.classification = issubclass(alg_class,Classifier)
37 self._alg_class = alg_class
38 self._alg_args = args
39 self._alg_kwargs = kwargs
40
42 if self.best_tuning == None:
43 self.tuneAlogirthmExaustive()
44
45 alg = self._alg_class(*self._alg_args,**self.best_tuning)
46 labels = self.training_labels
47 data = self.training_data
48 alg.train(labels,data)
49
50 alg.tuning_data = self.tuning_data
51 alg.best_score = self.best_score
52 alg.best_tuning = self.best_tuning
53
54 return alg
55
56
58 '''
59 Adds training data.
60
61 @param label: a list of labels (int,str) or regression outputs (float)
62 @param data: a matrix of data values each row is a feature vector
63 @keyword fold: a list specifying the folds for validation. If None it will be randomly assigned.
64 '''
65 self.training_labels = np.array(labels)
66 self.training_data = np.array(data)
67 if folds == None:
68 n = len(self.training_labels)
69 reps = n/self.n_folds + 1
70 folds = np.tile(np.arange(self.n_folds),reps)
71 folds = folds[:n]
72
73 np.random.shuffle(folds)
74
75
76 self.folds = np.array(folds)
77
78
80 '''
81 This specifie
82 @param keyword: A keyword to use in the algorithm initializer.
83 @type keyword: str
84 @param values: A list or tuple of values to use for keyword.
85 @type values: list
86 '''
87
88 self.options.append([keyword,values])
89
91 '''
92 This conducts an exaustive search of all tunable parameters to find the best tuning.
93 '''
94 if len(self.options) == 0:
95 score = self.runTest(self._alg_class,self._alg_args,self._alg_kwargs)
96 self.best_score = score
97 self.best_tuning = copy.deepcopy(self._alg_kwargs)
98 results = pv.Table()
99 results[0,'tuning'] = 'None'
100 results[0,'score'] = score
101 self.tuning_data = results
102 else:
103
104 keywords = [key for key,_ in self.options]
105 values = [val for _,val in self.options]
106
107 r = 0
108 results = pv.Table()
109
110
111 for vals in it.product(*values):
112
113
114 kwargs = copy.deepcopy(self._alg_kwargs)
115 for i in range(len(keywords)):
116 kw = keywords[i]
117 val = vals[i]
118 kwargs[kw] = val
119 results[r,kw] = val
120
121
122 score = self.runTest(self._alg_class,self._alg_args,kwargs,verbose=verbose)
123
124
125 if self.classification and (self.best_score == None or score > self.best_score):
126 self.best_score = score
127 self.best_tuning = kwargs
128 if not self.classification and (self.best_score == None or score < self.best_score):
129 self.best_score = score
130 self.best_tuning = kwargs
131
132
133 results[r,'score'] = score
134 r += 1
135
136
137 self.tuning_data = results
138
139
def runTest(self, alg_class, args, kwargs, verbose=True):
    '''
    Run full n-fold cross validation for one algorithm configuration.

    @param alg_class: the classifier/regression class to evaluate.
    @param args: positional args passed to the algorithm constructor.
    @param kwargs: keyword args (the tuning under test) passed to the constructor.
    @keyword verbose: if True, print a one-line summary of this run.
    @returns: accuracy (classification) or RMSE (regression) over all folds.
    '''
    squared_error = 0.0
    successes = 0

    for fold in range(self.n_folds):
        # Fresh algorithm instance per fold, trained on every sample
        # that is NOT in the held-out fold.
        alg = alg_class(*args, **kwargs)
        labels = self.training_labels[fold != self.folds]
        data = self.training_data[fold != self.folds]
        alg.train(labels, data)

        # Evaluate on the held-out fold only.
        for i in range(len(self.folds)):
            if self.folds[i] != fold:
                continue

            prediction = alg(self.training_data[i])
            truth = self.training_labels[i]

            # FIX: branch on self.classification instead of
            # isinstance(prediction, float).  The old per-sample type test
            # disagreed with the final scoring rule below whenever a
            # regressor returned a non-float type (e.g. np.float32 is not a
            # float subclass), which silently produced a zero score.
            if self.classification:
                successes += prediction == truth
            else:
                squared_error += (prediction - truth) ** 2

    if self.classification:
        # Every sample is held out exactly once, so len(self.folds) is the
        # total number of predictions made.
        score = float(successes) / len(self.folds)
    else:
        # Root-mean-squared error over all held-out predictions.
        score = np.sqrt(squared_error / len(self.folds))

    if verbose:
        # Single-argument print() behaves identically under Python 2 and 3
        # (the original py2-only print statement broke on Python 3).
        print("%-40s %8.6f %s / %s" % (str(kwargs)[:40], score, successes, len(self.folds)))
    return score
180
181
182
183
184
185
186
188 '''
189 @returns: the best known tuning.
190 '''
191
192
194
202
203
def runTest(self, alg_class, args, kwargs, verbose=True):
    '''
    Quick single-split evaluation: train on every fold except fold 0 and
    evaluate (with timing) on fold 0 only.  For classification runs this
    also logs the result into self.results and tracks the best algorithm
    seen so far (ties broken by faster evaluation time).

    @param alg_class: the classifier/regression class to evaluate.
    @param args: positional args passed to the algorithm constructor.
    @param kwargs: keyword args (the tuning under test) passed to the constructor.
    @keyword verbose: if True, print a one-line summary of this run.
    @returns: accuracy (classification) or RMSE (regression) on fold 0.
    '''
    squared_error = 0.0
    successes = 0

    # Only fold 0 is held out; this is a fast single split, not full
    # n-fold cross validation.
    fold = 0

    # FIX: the original constructed alg_class(*args,**kwargs) twice and
    # discarded the first instance; build it exactly once.
    alg = alg_class(*args, **kwargs)

    labels = self.training_labels[fold != self.folds]
    data = self.training_data[fold != self.folds]
    alg.train(labels, data)

    # Time only the prediction pass over the held-out fold.
    start = time.time()
    count = 0
    for i in range(len(self.folds)):
        if self.folds[i] != fold:
            continue

        count += 1
        prediction = alg(self.training_data[i])
        truth = self.training_labels[i]

        if isinstance(prediction, float):
            squared_error += (prediction - truth) ** 2
        else:
            successes += prediction == truth
    stop = time.time()

    if self.classification:
        score = float(successes) / count

        # Record this tuning as a new row in the results table.
        row = self.results.nRows()
        # FIX: dict.items() works on Python 2 and 3; the original used
        # the Python-2-only iteritems().
        for key, value in kwargs.items():
            self.results[row, key] = value

        # New best on a strictly better score, or on an equal score
        # achieved in less evaluation time.
        new_best = False
        if self.best_eval < score or (self.best_eval == score and self.best_time > stop - start):
            new_best = True
            self.best_eval = score
            self.best_alg = alg
            self.best_time = stop - start

        self.results[row, 'time'] = stop - start
        self.results[row, 'score'] = score
        self.results[row, 'new_best'] = new_best
        print(self.results)
    else:
        score = np.sqrt(squared_error / count)

    if verbose:
        # Single-argument print() behaves identically under Python 2 and 3.
        print("%-40s %8.6f %s / %s" % (str(kwargs)[:40], score, successes, len(self.folds)))
    return score
269