
Source Code for Module pyvision.ml.crossvalidate

'''
Created on Nov 23, 2010

@author: bolme
'''

import numpy as np
import itertools as it
import copy
import pyvision as pv
from libsvm import Classifier, Regression
import time

class CrossValidation:

    def __init__(self, n_folds=5):
        ''' Initialize a cross validation algorithm. '''
        self.n_folds = n_folds
        self.best_score = None
        self.best_tuning = None
        self.training_data = None
        self.training_labels = None
        self.folds = None
        self.options = []
        self.tuning_data = None
        self.classification = True

        self._alg_class = None

    def setAlgorithm(self, alg_class, args=[], kwargs={}):
        '''
        Set the classifier or regression class to tune, along with any fixed
        positional and keyword arguments passed to its constructor.
        '''
        assert issubclass(alg_class, Classifier) or issubclass(alg_class, Regression)
        self.classification = issubclass(alg_class, Classifier)
        self._alg_class = alg_class
        self._alg_args = args
        self._alg_kwargs = kwargs

    def getTunedAlgorithm(self):
        if self.best_tuning is None:
            self.tuneAlogirthmExaustive()

        # Retrain on the full training set using the best tuning found
        alg = self._alg_class(*self._alg_args, **self.best_tuning)
        labels = self.training_labels
        data = self.training_data
        alg.train(labels, data)

        alg.tuning_data = self.tuning_data
        alg.best_score = self.best_score
        alg.best_tuning = self.best_tuning

        return alg

    def setTraining(self, labels, data, folds=None):
        '''
        Adds training data.

        @param labels: a list of labels (int,str) or regression outputs (float)
        @param data: a matrix of data values where each row is a feature vector
        @keyword folds: a list assigning each sample to a fold.  If None, folds are randomly assigned.
        '''
        self.training_labels = np.array(labels)
        self.training_data = np.array(data)
        if folds is None:
            # Assign folds by tiling 0..n_folds-1 over the samples and shuffling
            n = len(self.training_labels)
            reps = n/self.n_folds + 1
            folds = np.tile(np.arange(self.n_folds), reps)
            folds = folds[:n]
            np.random.shuffle(folds)

        self.folds = np.array(folds)

    def addTunableOption(self, keyword, values):
        '''
        This specifies a keyword argument to tune and the values to try for it.

        @param keyword: A keyword to use in the algorithm initializer.
        @type keyword: str
        @param values: A list or tuple of values to use for keyword.
        @type values: list
        '''
        self.options.append([keyword, values])

    def tuneAlogirthmExaustive(self, verbose=True):
        '''
        This conducts an exhaustive search of all tunable parameters to find the best tuning.
        '''
        if len(self.options) == 0:
            # Nothing to tune: just cross validate the default arguments
            score = self.runTest(self._alg_class, self._alg_args, self._alg_kwargs)
            self.best_score = score
            self.best_tuning = copy.deepcopy(self._alg_kwargs)
            results = pv.Table()
            results[0, 'tuning'] = 'None'
            results[0, 'score'] = score
            self.tuning_data = results
        else:
            # Get keywords and values
            keywords = [key for key, _ in self.options]
            values = [val for _, val in self.options]

            r = 0
            results = pv.Table()

            # Test all possible tuning assignments
            for vals in it.product(*values):

                # Construct kwargs for this assignment
                kwargs = copy.deepcopy(self._alg_kwargs)
                for i in range(len(keywords)):
                    kw = keywords[i]
                    val = vals[i]
                    kwargs[kw] = val
                    results[r, kw] = val

                # Run a cross validation test
                score = self.runTest(self._alg_class, self._alg_args, kwargs, verbose=verbose)

                # Save the best score: higher is better for classification (accuracy),
                # lower is better for regression (root mean squared error)
                if self.classification and (self.best_score is None or score > self.best_score):
                    self.best_score = score
                    self.best_tuning = kwargs
                if not self.classification and (self.best_score is None or score < self.best_score):
                    self.best_score = score
                    self.best_tuning = kwargs

                # Construct a table of tuning information
                results[r, 'score'] = score
                r += 1

            self.tuning_data = results

    def runTest(self, alg_class, args, kwargs, verbose=True):

        squared_error = 0.0
        successes = 0

        for fold in range(self.n_folds):
            alg = alg_class(*args, **kwargs)

            # Train on every sample that is not in the held-out fold
            labels = self.training_labels[fold != self.folds]
            data = self.training_data[fold != self.folds]

            alg.train(labels, data)

            # Evaluate on the held-out fold
            for i in range(len(self.folds)):

                if self.folds[i] != fold:
                    continue

                prediction = alg(self.training_data[i])
                truth = self.training_labels[i]

                if isinstance(prediction, float):
                    squared_error += (prediction - truth)**2
                else:
                    successes += prediction == truth

        if self.classification:
            score = float(successes)/len(self.folds)
        else:
            score = np.sqrt(squared_error/len(self.folds))
        tmp = str(kwargs)
        if verbose:
            print "%-40s %8.6f" % (tmp[:40], score), successes, '/', len(self.folds)
        return score

    def getBestTuning(self):
        '''
        @returns: the best known tuning.
        '''
        return self.best_tuning


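The listing continues with a simplified Validation variant below. Before that, here is a minimal, hypothetical usage sketch of CrossValidation. It is not part of the module source: NearestMeanClassifier, its c keyword, the random data, and the import path for Classifier are illustrative assumptions. Any class passed to setAlgorithm must subclass Classifier or Regression from the libsvm module imported above and provide train(labels, data) plus a __call__(feature_vector) prediction method, since those are what runTest relies on.

# --- Usage sketch (not part of the module source) --------------------------
# NearestMeanClassifier and its 'c' keyword are hypothetical stand-ins.
import numpy as np
from pyvision.ml.crossvalidate import CrossValidation
from libsvm import Classifier    # import path assumed, as in the module above

class NearestMeanClassifier(Classifier):
    ''' Toy classifier: predicts the label of the nearest class mean. '''
    def __init__(self, c=1.0):
        self.c = c               # unused; stands in for a tunable option
        self.means = {}
    def train(self, labels, data):
        for label in set(labels):
            self.means[label] = data[labels == label].mean(axis=0)
    def __call__(self, vector):
        return min(self.means, key=lambda l: np.linalg.norm(self.means[l] - vector))

labels = np.array([0]*50 + [1]*50)
data = np.random.randn(100, 4)
data[50:] += 2.0                 # shift class 1 so the problem is separable

cv = CrossValidation(n_folds=5)
cv.setAlgorithm(NearestMeanClassifier)
cv.setTraining(labels, data)
cv.addTunableOption('c', [0.1, 1.0, 10.0])
alg = cv.getTunedAlgorithm()     # exhaustive tuning, then retrain on all data
print cv.best_tuning, cv.best_score
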
class Validation(CrossValidation):

    def __init__(self, n_folds=3):
        CrossValidation.__init__(self, n_folds=n_folds)

        self.best_alg = None
        self.best_eval = 0.0

        self.results = pv.Table()

    def runTest(self, alg_class, args, kwargs, verbose=True):

        squared_error = 0.0
        successes = 0

        # Only fold 0 is held out for evaluation; the rest is used for training
        fold = 0

        alg = alg_class(*args, **kwargs)

        labels = self.training_labels[fold != self.folds]
        data = self.training_data[fold != self.folds]

        alg.train(labels, data)

        start = time.time()
        count = 0
        for i in range(len(self.folds)):
            if self.folds[i] != fold:
                continue

            count += 1

            prediction = alg(self.training_data[i])
            truth = self.training_labels[i]

            if isinstance(prediction, float):
                squared_error += (prediction - truth)**2
            else:
                successes += prediction == truth
        stop = time.time()

        if self.classification:
            score = float(successes)/count

            row = self.results.nRows()

            for key, value in kwargs.iteritems():
                self.results[row, key] = value

            # Track the best algorithm, breaking score ties by evaluation time
            new_best = False
            if self.best_eval < score:
                new_best = True
                self.best_eval = score
                self.best_alg = alg
                self.best_time = stop - start
            elif self.best_eval == score and self.best_time > stop - start:
                new_best = True
                self.best_eval = score
                self.best_alg = alg
                self.best_time = stop - start

            self.results[row, 'time'] = stop - start
            self.results[row, 'score'] = score
            self.results[row, 'new_best'] = new_best
            print self.results
        else:
            score = np.sqrt(squared_error/count)
        tmp = str(kwargs)
        if verbose:
            print "%-40s %8.6f" % (tmp[:40], score), successes, '/', len(self.folds)
        return score
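
For reference, here is a small standalone sketch (numpy only, nothing pyvision-specific assumed) of the fold bookkeeping used by setTraining and runTest above: folds are assigned by tiling 0..n_folds-1 across the samples and shuffling, and each round trains on the boolean mask folds != fold and scores on its complement.

# --- Standalone sketch of the fold logic used above (not module source) ----
import numpy as np

n, n_folds = 10, 3
reps = n // n_folds + 1                   # floor division, as in setTraining
folds = np.tile(np.arange(n_folds), reps)[:n]
np.random.shuffle(folds)                  # e.g. [2 0 1 1 0 2 0 1 2 0]

data = np.arange(n)
for fold in range(n_folds):
    train_rows = data[folds != fold]      # rows used to train this round
    test_rows = data[folds == fold]       # held-out rows used for scoring
    print fold, train_rows, test_rows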