Package pyvision :: Package vector :: Module VectorClassifier
[hide private]
[frames] | no frames]

Source Code for Module pyvision.vector.VectorClassifier

  1  # PyVision License 
  2  # 
  3  # Copyright (c) 2006-2008 David S. Bolme 
  4  # All rights reserved. 
  5  # 
  6  # Redistribution and use in source and binary forms, with or without 
  7  # modification, are permitted provided that the following conditions 
  8  # are met: 
  9  #  
 10  # 1. Redistributions of source code must retain the above copyright 
 11  # notice, this list of conditions and the following disclaimer. 
 12  #  
 13  # 2. Redistributions in binary form must reproduce the above copyright 
 14  # notice, this list of conditions and the following disclaimer in the 
 15  # documentation and/or other materials provided with the distribution. 
 16  #  
 17  # 3. Neither name of copyright holders nor the names of its contributors 
 18  # may be used to endorse or promote products derived from this software 
 19  # without specific prior written permission. 
 20  #  
 21  #  
 22  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 23  # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 24  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 25  # A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR 
 26  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
 27  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 28  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
 29  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
 30  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 31  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 32  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 33   
 34   
 35   
 36  from numpy import array,mean,std 
 37  import pyvision as pv 
 38   
 39  from pyvision.vector.PCA import PCA 
 40  import unittest 
 41  import os.path 
 42   
 43   
 44  NORM_NONE="NONE" 
 45  NORM_PCA="PCA_WHITEN" 
 46  NORM_VALUE="VALUE" 
 47  NORM_AUTO="AUTO" 
 48   
 49  REG_NORM_NONE="NONE" 
 50  REG_NORM_VALUE="VALUE" 
 51   
 52  TYPE_TWOCLASS="TWOCLASS" 
 53  TYPE_MULTICLASS="MULTICLASS" 
 54  TYPE_REGRESSION="REGRESSION" 
 55   
 56  ## 
 57  # The purpose of this class is to provide common services  
 58  # and a common interface to classifiers.  For the most part 
 59  # this class provides normalization services.  Many  
 60  # classification algorithms assume that the input values have  
 61  # zero mean and a unit variance.  This class also provides  
 62  # PCA based normalization that also reduces dimensionality. 
class VectorClassifier:
    '''
    Base class that collects labeled training vectors, normalizes labels
    and data, and delegates the actual learning and prediction to the
    subclass hooks trainClassifer and predictValue.
    '''

    ##
    # Configure some defaults for the classifier value normalization.
    #
    # <p>This configures some defaults for the classifier such as the
    # type of classifier, and how values are normalized.
    #
    # @param classifer_type one of TYPE_TWOCLASS, TYPE_MULTICLASS or
    #        TYPE_REGRESSION.
    # @param normalization one of NORM_NONE, NORM_VALUE, NORM_PCA or
    #        NORM_AUTO.  NORM_AUTO is resolved in trainNormalization.
    # @param reg_norm stored as self.reg_norm.
    #        NOTE(review): nothing in this class reads reg_norm; regression
    #        labels are always rescaled -- confirm whether REG_NORM_NONE
    #        is meant to disable that.
    # @param pca_basis handed to PCA.train as "number" when greater than
    #        one, otherwise as "energy".
    # @param pca_drop handed to PCA.train as "drop_front".
    def __init__(self, classifer_type, normalization=NORM_AUTO, reg_norm=REG_NORM_VALUE, pca_basis=0.95, pca_drop=0):
        # Setup basic configuration
        self.type = classifer_type
        self.norm = normalization
        self.reg_norm = reg_norm
        self.pca_basis = pca_basis
        self.pca_drop = pca_drop

        # Training samples accumulated by addTraining
        self.labels = []
        self.vectors = []
        self.vector_length = None

        # Identity scaling until regression labels have been seen
        self.reg_mean = 0.0
        self.reg_std = 1.0

    ##
    # Learn the range of values that are expected for labels and data.
    # Then setup for normalization.
    #
    # <p>Class labels are replaced by integer ids in order of first
    # appearance (self.class_map / self.class_inv).  Regression labels
    # are shifted and scaled using their mean and standard deviation.
    # The training vectors are then normalized according to self.norm.
    #
    # Raises AssertionError if fewer than two samples were added, if the
    # number of classes does not fit the classifier type, or if the
    # training vectors are not all one dimensional with equal shape.
    def trainNormalization(self):
        assert len(self.labels) >= 2

        if self.type == TYPE_TWOCLASS or self.type == TYPE_MULTICLASS:
            # Learn the classes: ids are assigned in order of first appearance
            n_classes = 0
            self.class_map = {}

            for label in self.labels:
                # "in" instead of has_key(), which no longer exists in Python 3
                if label not in self.class_map:
                    self.class_map[label] = n_classes
                    n_classes += 1

            if self.type == TYPE_MULTICLASS:
                assert n_classes >= 2
            if self.type == TYPE_TWOCLASS:
                assert n_classes == 2

            # Inverse mapping used by invertClass.  items() instead of
            # iteritems(), which no longer exists in Python 3.
            self.class_inv = {}
            for key, value in self.class_map.items():
                self.class_inv[value] = key

            self.labels = [self.class_map[each] for each in self.labels]

        if self.type == TYPE_REGRESSION:
            self.reg_mean = mean(self.labels)
            self.reg_std = std(self.labels)
            if self.reg_std == 0:
                # All labels are identical: keep a unit scale so that the
                # division below (and invertReg) do not produce NaN.
                self.reg_std = 1.0

            self.labels = [(each - self.reg_mean)/self.reg_std for each in self.labels]

        # test length: every vector must be 1-D and share the same shape
        shape = self.vectors[0].shape
        assert len(shape) == 1

        for each in self.vectors:
            assert shape == each.shape

        # create a data matrix with one row per training vector
        data = array(self.vectors, 'd')
        if self.norm == NORM_AUTO:
            # Value normalization by default, PCA for more than 128 columns
            self.norm = NORM_VALUE
            if data.shape[1] > 128:
                self.norm = NORM_PCA

        # Setup value normalization
        if self.norm == NORM_VALUE:
            self.dmean = data.mean(axis=0)
            self.dstd = data.std(axis=0)
            # A constant feature has zero spread; dividing by it would fill
            # that column with NaN/inf, so such features are scaled by one.
            self.dstd[self.dstd == 0] = 1.0
            self.vectors = (data - self.dmean)/self.dstd

        elif self.norm == NORM_PCA:
            self.pca = PCA()
            for vec in self.vectors:
                self.pca.addFeature(vec)

            # pca_basis above one is passed as "number", otherwise as "energy"
            if self.pca_basis > 1:
                self.pca.train(drop_front=self.pca_drop, number=self.pca_basis)
            else:
                self.pca.train(drop_front=self.pca_drop, energy=self.pca_basis)

            new_vectors = [self.pca.project(each, whiten=True) for each in self.vectors]
            self.vectors = array(new_vectors, 'd')

    ##
    # Apply the normalization selected during training to one data vector.
    #
    # @param data a vector of numbers.
    # @return data unchanged for NORM_NONE, (data - dmean)/dstd for
    #         NORM_VALUE, or the whitened PCA projection for NORM_PCA.
    # Raises NotImplementedError for any other normalization setting.
    def normalizeVector(self, data):
        if self.norm == NORM_NONE:
            return data
        elif self.norm == NORM_VALUE:
            return (data - self.dmean)/self.dstd
        elif self.norm == NORM_PCA:
            return self.pca.project(data, whiten=True)
        else:
            # str() so that a non-string setting still reports this error
            # instead of failing on the concatenation itself.
            raise NotImplementedError("Could not determine nomalization type: " + str(self.norm))

    ##
    # Add a training sample.  Data must be a vector of numbers.
    #
    # @param label the class label, or for regression a value that is
    #        converted with float().
    # @param data a pv.Image (flattened via asMatrix2D) or anything that
    #        numpy.array can convert; it is stored as a flat array of
    #        doubles.
    # @param ilog accepted but not used by this method.
    def addTraining(self, label, data, ilog=None):
        if self.type == TYPE_REGRESSION:
            self.labels.append(float(label))
        else:
            self.labels.append(label)

        if isinstance(data, pv.Image):
            data = data.asMatrix2D().flatten()
        data = array(data, 'd').flatten()

        self.vectors.append(data)

    ##
    # Predict the class or the value for the input data.
    #
    # <p>This function will perform value normalization and then
    # delegate to the subclass to perform classification or
    # regression.
    #
    # @param data a pv.Image or a vector of numbers, converted the same
    #        way as in addTraining.
    # @param ilog forwarded to predictValue.
    # @return the original class label for classifiers, or the value in
    #         the original label scale for regression.  Nothing is
    #         returned (None) for any other classifier type.
    def predict(self, data, ilog=None):
        if isinstance(data, pv.Image):
            data = data.asMatrix2D().flatten()
        data = array(data, 'd').flatten()

        data = self.normalizeVector(data)

        value = self.predictValue(data, ilog=ilog)

        if self.type == TYPE_TWOCLASS or self.type == TYPE_MULTICLASS:
            return self.invertClass(value)
        if self.type == TYPE_REGRESSION:
            return self.invertReg(value)

    ##
    # Override this method in subclasses.
    # Input should be a numpy array of doubles
    #
    # If classifier output is int
    # If regression output is float
    #
    # @param data the normalized data vector.
    # @param ilog passed through from predict, which always supplies it
    #        as a keyword argument.
    def predictValue(self, data, ilog=None):
        raise NotImplementedError("This is an abstract method")

    ##
    # Train the classifier on the training data.
    #
    # This normalizes the data and the labels, and then passes the
    # results to the subclass for training.
    #
    # <p>The stored labels and vectors are deleted afterwards, so
    # addTraining cannot be used on this instance after train.
    #
    # @param ilog forwarded to trainClassifer.
    # @param kwargs forwarded unchanged to trainClassifer.
    def train(self, ilog=None, **kwargs):
        self.trainNormalization()

        self.trainClassifer(self.labels, self.vectors, ilog=ilog, **kwargs)

        # remove training data
        del self.labels
        del self.vectors

    ##
    # This abstract method should be overridden by subclasses.
    #
    # <p> This method is called from {@link train}.  The vectors and values
    # passed to this method will have been normalized.  This method
    # should train a classifier or regression algorithm for that
    # normalized data.
    #
    # <p> Any keyword arguments passed to train will also be passed on to
    # train classifier.  This could allow variations in training or for
    # verbose output.
    def trainClassifer(self, labels, vectors, ilog=None, **kwargs):
        raise NotImplementedError("This is an abstract method")

    ##
    # Convert a normalized regression value back to the original scale
    def invertReg(self, value):
        return value*self.reg_std + self.reg_mean

    ##
    # Convert an integer class value back to the original label values.
    def invertClass(self, value):
        '''Map an integer back into a class label'''
        return self.class_inv[value]

265 -def _mse(a,b):
266 assert len(a) == len(b) 267 ss = 0.0 268 for i in range(len(a)): 269 d = float(a[i])-float(b[i]) 270 ss += d*d 271 return ss/len(a)
272
273 -class _TestVectorClassifier(unittest.TestCase):
274
def setUp(self):
    '''
    Build the three untrained fixtures used by the tests: an XOR
    two-class problem, a regression problem read from
    data/synthetic/regression.dat, and an image two-class problem read
    from data/csuScrapShots/gender.txt.
    '''
    # a simple binary two class
    xor = VectorClassifier(TYPE_TWOCLASS)
    xor.addTraining(0,[0,0])
    xor.addTraining(0,[1,1])
    xor.addTraining(1,[0,1])
    xor.addTraining(1,[1,0])
    self.xor = xor

    # synthetic linear regression
    rega = VectorClassifier(TYPE_REGRESSION)
    filename = os.path.join(pv.__path__[0],'data','synthetic','regression.dat')
    # "with" closes the file even if a line fails to parse
    with open(filename,'r') as reg_file:
        for line in reg_file:
            datapoint = line.split()
            # column 0 is the label, columns 3-5 are the input values
            rega.addTraining(float(datapoint[0]),[float(datapoint[3]),float(datapoint[4]),float(datapoint[5])])
    self.rega = rega

    # image classification
    gender = VectorClassifier(TYPE_TWOCLASS)
    filename = os.path.join(pv.__path__[0],'data','csuScrapShots','gender.txt')
    with open(filename,'r') as f:
        for line in f:
            # each line holds an image file name and its class label
            im_name, class_name = line.split()
            im_name = os.path.join(pv.__path__[0],'data','csuScrapShots',im_name)
            im = pv.Image(im_name)
            # every image is resized to 200x200 before it is added
            im = pv.Image(im.asPIL().resize((200,200)))
            gender.addTraining(class_name,im)
    self.gender = gender
305
306 - def test_vc_create(self):
310
def test_vc_normalize(self):
    '''
    Check trainNormalization on the three fixtures: class mapping with
    value normalization, regression label scaling, and automatic
    selection of PCA normalization for image data.
    '''
    # This should test class normalization
    self.xor.trainNormalization()
    self.assertEqual(self.xor.norm, NORM_VALUE)
    self.assertTrue( _mse(self.xor.dmean, [0.5,0.5]) < 0.0001 )
    self.assertTrue( _mse(self.xor.dstd, [0.5,0.5]) < 0.0001 )
    self.assertEqual(self.xor.class_map, {0:0,1:1})
    self.assertEqual(self.xor.class_inv, {0:0,1:1})

    # This should test value normalization
    self.rega.trainNormalization()
    self.assertEqual(self.rega.norm, NORM_VALUE)
    self.assertAlmostEqual( self.rega.reg_mean, 85.49472, places = 4)
    self.assertAlmostEqual( self.rega.reg_std, 12.20683, places = 4)
    self.assertTrue( _mse(self.rega.dmean, [29.082505, 29.9741642, 30.4516687]) < 0.0001 )
    self.assertTrue( _mse(self.rega.dstd, [11.08164301,11.983678,11.18806686]) < 0.0001 )

    # This should test PCA normalization
    self.gender.trainNormalization()
    self.assertEqual(self.gender.norm, NORM_PCA)
    self.assertEqual(len(self.gender.pca.getValues()), 73)
    self.assertEqual(self.gender.class_map, {'M': 1, 'F': 0})
    self.assertEqual(self.gender.class_inv, {0: 'F', 1: 'M'})
334