1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 '''
35 This module contains functions for reading and writing files
36 for the Biometrics Evaluation Environment (BEE) including distance
37 matrices and sigsets.
38
39 @authors: David S. Bolme (CSU) and C.J. Carey (NIST)
40
41 see: <a href="http://www.bee-biometrics.org">http://www.bee-biometrics.org</a>
42 '''
43
44 import xml.etree.cElementTree as ET
45 import os.path
46 import struct
47 import binascii
48 import numpy as np
49
50 import scipy.io as spio
51 import pyvision as pv
52 import pyvision.analysis.roc as roc
53 import gzip
54
55 BIOMETRIC_SIGNATURE = '{http://www.bee-biometrics.org/schemas/sigset/0.1}biometric-signature'
56 PRESENTATION = '{http://www.bee-biometrics.org/schemas/sigset/0.1}presentation'
57
58 COMPLEX_BIOMETRIC_SIGNATURE = '{http://www.bee-biometrics.org/schemas/sigset/0.1}complex-biometric-signature'
59 COMPLEX_PRESENTATION = '{http://www.bee-biometrics.org/schemas/sigset/0.1}complex-presentation'
60 COMPLEX_COMPONENT = '{http://www.bee-biometrics.org/schemas/sigset/0.1}presentation-component'
61 COMPLEX_DATA = '{http://www.bee-biometrics.org/schemas/sigset/0.1}data'
62
63 BEE_NONMATCH = 0x7f
64 BEE_MATCH = -1
65 BEE_DONTCARE = 0x00
66
67 BEE_CODE_MAP = {
68 0x7f:"NONMATCH",
69 0xff:"MATCH",
70 -1:"MATCH",
71 0x00:"DONTCARE",
72 }
73
74
75
77 '''
78 the format of a sigset is::
79 sigset = [
80 ("subject_id", #biometric-signature
81 [ # multiple presentations
82 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."},
83 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."},
84 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."}
85 ]
86 ),
87 ("subject_id",#biometric-signature
88 [ # multiple presentations
89 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."},
90 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."},
91 {'name':"recording_id", 'modality':"...", 'file-name':"...", 'file-format':"..."}
92 ]
93 )
94 ]
95 '''
96 if isinstance(filename,str) and filename.endswith('.gz'):
97
98 filename = gzip.open(filename,'rb')
99
100 sigset = ET.parse(filename)
101 result = []
102
103
104 for sig in sigset.findall('biometric-signature'):
105 name = sig.get('name')
106 signature = []
107 result.append( (name,signature) )
108 for pres in sig.findall('presentation'):
109 presentation = {}
110 for key in pres.keys():
111 presentation[key] = pres.get(key)
112 signature.append(presentation)
113
114
115 for sig in sigset.findall(BIOMETRIC_SIGNATURE):
116 name = sig.get('name')
117 signature = []
118 result.append( (name, signature ) )
119 for pres in sig.findall(PRESENTATION):
120 presentation = {}
121 for key in pres.keys():
122 presentation[key] = pres.get(key)
123 signature.append(presentation)
124
125
126 for sig in sigset.findall(COMPLEX_BIOMETRIC_SIGNATURE):
127 name = sig.get('name')
128 signature = []
129 result.append( (name, signature) )
130 for pres in sig.findall(COMPLEX_PRESENTATION):
131 presentation = {}
132 for key in pres.keys():
133 presentation[key] = pres.get(key)
134 for comp in pres.findall(COMPLEX_COMPONENT):
135 for data in comp.findall(COMPLEX_DATA):
136 for key in data.keys():
137 presentation[key] = data.get(key)
138
139 signature.append(presentation)
140
141 return result
142
144 '''
145 save a sigset to a file.
146
147 @param ss: a sigset structured list
148 @param filename: a file object or filename
149 '''
150 if isinstance(filename,str) and filename.endswith('.gz'):
151
152 filename = gzip.open(filename,'wb')
153
154 xmlss = sigset2xml(ss)
155 xmlss.write(filename)
156
158 root = ET.Element("biometric-signature-set")
159 root.text="\n "
160 for signature in ss:
161 sig = ET.SubElement(root,"biometric-signature")
162 sig.set('name',signature[0])
163 sig.text="\n "
164 sig.tail="\n "
165 for presentation in signature[1]:
166 pres = ET.SubElement(sig,'presentation')
167 for key,value in presentation.iteritems():
168 pres.set(key,value)
169 pres.tail="\n "
170 tree = ET.ElementTree(root)
171 return tree
172
174 result = []
175 for signature in ss:
176 sub_id = signature[0]
177 if len(signature[1]) != 1:
178 raise TypeError("This function only handles simple sigsets.")
179
180
181 mode = signature[1][0]['modality']
182 file_format = signature[1][0]['file-format']
183 rec_id = signature[1][0]['name']
184 filename = signature[1][0]['file-name']
185 result.append([sub_id,mode,file_format,rec_id,filename])
186 return result
187
188
201
202
def fastROC(sorted_positives, sorted_negatives):
    '''
    Build a two-column curve from positive and negative score arrays.

    For the i-th positive score, column 0 holds i / n_pos and column 1
    holds the fraction of negative scores that sort strictly before it
    (its np.searchsorted insertion index divided by n_neg).

    @param sorted_positives: sequence of positive scores (presumably in
        ascending order, as the name suggests -- the code does not check).
    @param sorted_negatives: sequence of negative scores.  np.searchsorted
        requires it to be sorted ascending, and it must contain more
        elements than sorted_positives.
    @returns: a float array of shape (n_pos, 2).
    '''
    n_pos = len(sorted_positives)
    n_neg = len(sorted_negatives)

    # Precondition carried over unchanged from the original implementation.
    assert n_pos < n_neg

    # With no positives there are no curve points; returning early avoids
    # the division by zero in 1.0 / n_pos below.
    if n_pos == 0:
        return np.zeros((0, 2))

    # Insertion index == number of negatives strictly below each positive.
    indexes = np.searchsorted(sorted_negatives, sorted_positives)

    tp = (1.0 / n_pos) * np.arange(n_pos)
    fn = (1.0 / n_neg) * indexes

    return np.array([tp, fn]).transpose()
232
234
236 '''
237 Creates a BEE distance matrix
238 '''
239 if isinstance(args[0],str):
240 self.loadFile(*args,**kwargs)
241
242 elif isinstance(args[0],np.ndarray):
243 self.loadMatrix(*args,**kwargs)
244
245 else:
246 raise TypeError("Cannot create a BEEDistanceMatrix from an object of type: %s"%type(args[0]))
247
def loadFile(self, filename, sigset_dir=None):
    '''
    Loads a BEE matrix from a file.

    The header is four LF-terminated lines: the matrix format (D2, S2 or
    M2), the target sigset file name, the query sigset file name, and a
    line of the form "MF|MB <n_queries> <n_targets> <magic>".  The rest of
    the file is the raw matrix data (float32 for MF, bytes for MB).

    Sets self.filename, self.shortname, self.is_distance,
    self.target_filename, self.query_filename, self.n_queries,
    self.n_targets, self.magic_number, self.matrix, self.queries and
    self.targets.

    @param filename: path of the matrix file.
    @param sigset_dir: directory holding the sigset files named in the
        header; defaults to the directory of filename.
    @raises ValueError: on a bad first line, unknown format or unknown
        magic number.
    '''
    def _text(raw):
        # The file is read in binary mode.  Where bytes is already the
        # native str type nothing changes; otherwise decode so the value
        # can be mixed with str paths in os.path.join.
        # NOTE(review): utf-8 is an assumption about file-name encoding.
        if isinstance(raw, str):
            return raw
        return raw.decode('utf-8')

    self.filename = filename
    self.shortname = os.path.basename(filename)

    # The context manager guarantees the handle is closed even when the
    # header turns out to be invalid.
    with open(filename, 'rb') as f:
        # Line 1: two format characters followed by a single LF.
        line = f.readline()
        if len(line) != 3 or line[-1:] != b"\x0a":
            raise ValueError("Unsupported line ending. Should two characters followed by LF (0x0A).")

        fmt = _text(line.strip())
        if fmt not in ['D2', 'S2', 'M2']:
            raise ValueError('Unknown matrix Format "%s". Should be D2, S2, or M2.' % fmt)

        # Only the S2 format is treated as a similarity matrix.
        self.is_distance = fmt[0] != 'S'

        # Lines 2 and 3: sigset file names (only the base name is kept).
        self.target_filename = os.path.basename(_text(f.readline().split()[0]))
        self.query_filename = os.path.basename(_text(f.readline().split()[0]))

        # Line 4: storage type, dimensions and the byte-order marker.
        line = f.readline().split()
        storage = _text(line[0])
        assert storage in ['MF', 'MB']
        file_type = storage[1]

        self.n_queries = int(line[1])
        self.n_targets = int(line[2])

        magic = line[3]
        big_endian = struct.pack(">I", 0x12345678)
        little_endian = struct.pack("<I", 0x12345678)

        if magic != big_endian and magic != little_endian:
            print("Warning unsupported magic number is BEE matrix: 0x%s" % _text(binascii.hexlify(magic)))

        # Interpreting the marker in native order tells us whether the
        # data was written with the opposite byte order.
        self.magic_number = struct.unpack_from("=I", magic)[0]
        if self.magic_number == 0x12345678:
            byteswap = False
        elif self.magic_number == 0x78563412:
            byteswap = True
        else:
            raise ValueError("Unknown magic number in similarity matrix.")

        # Everything after the header is matrix data.
        if file_type == 'F':
            self.matrix = np.fromfile(f, dtype=np.float32)
        elif file_type == 'B':
            self.matrix = np.fromfile(f, dtype=np.byte)
        else:
            raise TypeError("Unknown matrix file_type: %s" % file_type)

    # Single bytes have no byte order, so only float data is swapped.
    if file_type == 'F' and byteswap:
        self.matrix = self.matrix.byteswap()
    assert self.matrix.shape[0] == self.n_targets * self.n_queries
    self.matrix = self.matrix.reshape(self.n_queries, self.n_targets)

    if sigset_dir is None:
        sigset_dir = os.path.dirname(self.filename)

    # Sigset loading is best-effort: if a sigset cannot be read the
    # attribute stays None and the matrix is still usable.  As before, the
    # sigset length is not enforced against the header dimensions.
    self.queries = None
    try:
        self.queries = parseSigSet(os.path.join(sigset_dir, self.query_filename))
    except Exception:
        pass

    self.targets = None
    try:
        self.targets = parseSigSet(os.path.join(sigset_dir, self.target_filename))
    except Exception:
        pass
342
343
344
345
346
def loadMatrix(self, mat, query_filename, target_filename, sigset_dir=None, is_distance=True):
    '''
    Creates a bee matrix from a numpy array.

    Sets self.shortname (None), self.is_distance, self.target_filename,
    self.query_filename, self.n_queries, self.n_targets,
    self.magic_number, self.matrix, self.queries and self.targets.

    @param mat: a 2D numpy array with one row per query and one column per
        target.  Anything that is not np.byte is converted to float32.
    @param query_filename: name of the query sigset file.
    @param target_filename: name of the target sigset file.
    @param sigset_dir: directory containing the sigset files.  When None
        no sigsets are read and self.queries / self.targets stay None.
    @param is_distance: True for a distance matrix, False for similarity.
    '''
    def _read_sigset(ss_file, expected, label):
        # Best-effort read of one sigset: any failure is reported as a
        # warning.  Returns whatever was parsed (None if nothing was).
        ss_name = None
        sigset = None
        try:
            ss_name = os.path.join(sigset_dir, ss_file)
            sigset = parseSigSet(ss_name)
            if len(sigset) != expected:
                raise ValueError("sigset size does not match the matrix")
        except Exception:
            # The sigset may still be None here, so its length must not
            # be taken unconditionally while reporting the problem.
            n_read = None if sigset is None else len(sigset)
            print("Warning: cound not read the %s sigset for distance matrix" % label)
            print(" SigSet File: %s" % (ss_name,))
            print(" Expected: %s Read: %s" % (expected, n_read))
        return sigset

    self.shortname = None

    # Byte matrices are kept as-is; everything else is stored as float32.
    if mat.dtype != np.byte:
        mat = mat.astype(np.float32)

    self.is_distance = is_distance
    self.target_filename = target_filename
    self.query_filename = query_filename

    self.n_queries = mat.shape[0]
    self.n_targets = mat.shape[1]
    self.magic_number = 0x12345678

    self.matrix = mat

    self.queries = None
    self.targets = None
    if sigset_dir is not None:
        self.queries = _read_sigset(self.query_filename, self.n_queries, "query")
        self.targets = _read_sigset(self.target_filename, self.n_targets, "target")
396
397
399 for i in range(self.matrix.shape[0]):
400 a = self.matrix[i,:]
401 mn = a.mean()
402 sd = a.std()
403 self.matrix[i,:] = (self.matrix[i,:]-mn)/sd
404
405
407
408
409
410 matches = []
411 if self.queries != None and self.targets != None:
412 queries = np.array([ name for name,_ in self.queries ])
413 targets = np.array([ name for name,_ in self.targets ])
414 for i in range(self.matrix.shape[0]):
415
416 if mask != None:
417 matches.append(self.matrix[i,mask.matrix[i,:] == BEE_MATCH])
418 else:
419 query = queries[i]
420 matches.append(self.matrix[i,query==targets])
421 total = 0
422 for each in matches:
423 total += len(each)
424
425 scores = np.zeros(shape=(total),dtype=np.float32)
426 i = 0
427 for each in matches:
428 s = len(each)
429 scores[i:i+s] = each
430 i += s
431 return scores
432
433
435 assert self.queries != None
436 assert self.targets != None
437
438 matches = {}
439 queries = np.array([ name for name,_ in self.queries ])
440 targets = np.array([ name for name,_ in self.targets ])
441
442 qnames = set(queries)
443
444
445 for name in qnames:
446 rows = np.nonzero(name == queries)[0]
447 cols = np.nonzero(name == targets)[0]
448 tmp = self.matrix[rows][:,cols]
449 if mask != None:
450 m = mask.matrix[rows][:,cols] == BEE_MATCH
451 matches[name] = tmp.flatten()[m.flatten()]
452 else:
453 matches[name] = tmp.flatten()
454
455 if len(matches[name]) == 0:
456 del matches[name]
457
458 return matches
459
460
462
463
464
465 matches = []
466 if self.queries != None and self.targets != None:
467 queries = np.array([ name for name,_ in self.queries ])
468 targets = np.array([ name for name,_ in self.targets ])
469 for i in range(self.matrix.shape[0]):
470 if mask != None:
471 matches.append(self.matrix[i,mask.matrix[i,:] == BEE_NONMATCH])
472 else:
473 query = queries[i]
474 matches.append(self.matrix[i,query!=targets])
475 total = 0
476 for each in matches:
477 total += len(each)
478
479 scores = np.zeros(shape=(total),dtype=np.float32)
480 i = 0
481 for each in matches:
482 s = len(each)
483 scores[i:i+s] = each
484 i += s
485 return scores
486
488 '''query,target,score,type'''
489 r,c = self.matrix.shape
490 result = np.zeros((r*c,4),dtype=np.object)
491 for i in range(r):
492 for j in range(c):
493 result[c*i+j,0] = i
494 result[c*i+j,1] = j
495 result[c*i+j,2] = self.matrix[i,j]
496 if BEE_CODE_MAP.has_key(mask[i,j]):
497 result[c*i+j,3] = BEE_CODE_MAP[mask[i,j]]
498 else:
499 result[c*i+j,3] = "0x%02x"%mask[i,j]
500 return result
501
502
503
504
505
507 print "BEEDistanceMatrix:",self.filename
508 print " is_distance :",self.is_distance
509 print " target_filename :",self.target_filename
510 print " query_filename :",self.query_filename
511 print " n_queries :",self.n_queries
512 print " n_targets :",self.n_targets
513 print " <total size> :",self.n_targets*self.n_queries
514 print " magic_number : %x"%self.magic_number
515 print " matrix.shape :",self.matrix.shape
516
517 - def write(self,filename):
519
def save(self, filename):
    '''
    Writes the BEE distance matrix to file. WARNING: DOES NOT HANDLE MASK MATRICES CORRECTLY!

    The output format is chosen from the file extension:
      - '.mtx': delegated to self.saveBeeFormat.
      - '.mat': written with scipy.io.savemat under the variable name
        'dist_matrix' (distance) or 'sim_matrix' (similarity).

    @param filename: destination path; must end in '.mtx' or '.mat'.
    @raises NotImplementedError: for any other extension.
    '''
    if filename.endswith('.mtx'):
        self.saveBeeFormat(filename)
    elif filename.endswith('.mat'):
        matrix_name = 'dist_matrix' if self.is_distance else 'sim_matrix'
        spio.savemat(filename, {matrix_name: self.matrix})
    else:
        # Raise (rather than return) so an unsupported extension cannot
        # pass silently as a successful save.
        raise NotImplementedError("Unsupported matrix format for filename %s" % filename)
536
568
def histogram(self, value_range=None, bins=100, normed=True, mask=None):
    '''
    Histogram the match and non-match scores over a common set of bins.

    @param value_range: (min, max) range for the bins; defaults to the
        minimum and maximum of the whole matrix.
    @param bins: passed through to np.histogram.
    @param normed: when True the per-bin values are probability densities
        rather than raw counts.
    @param mask: passed through to self.getMatchScores and
        self.getNonMatchScores.
    @returns: a pv.Table with one row per bin and the columns 'min',
        'center', 'max', 'match_count' and 'nonmatch_count'.
    '''
    match_scores = self.getMatchScores(mask=mask)
    nonmatch_scores = self.getNonMatchScores(mask=mask)
    if value_range is None:
        value_range = (self.matrix.min(), self.matrix.max())

    # np.histogram's "normed" keyword has been removed from NumPy;
    # "density" is its replacement, so the public "normed" argument is
    # forwarded under that name.
    match_counts, _ = np.histogram(match_scores, range=value_range, bins=bins, density=normed)
    nonmatch_counts, vals = np.histogram(nonmatch_scores, range=value_range, bins=bins, density=normed)

    # Both histograms share range and bins, so the edges in vals apply to
    # the match histogram as well.
    hist = pv.Table()
    for i in range(len(match_counts)):
        hist[i, 'min'] = vals[i]
        hist[i, 'center'] = 0.5 * (vals[i] + vals[i + 1])
        hist[i, 'max'] = vals[i + 1]
        hist[i, 'match_count'] = match_counts[i]
        hist[i, 'nonmatch_count'] = nonmatch_counts[i]
    return hist
586
587
592
594 rows,_ = self.matrix.shape
595
596 queries = np.array([ name for name,_ in self.queries ])
597 targets = np.array([ name for name,_ in self.targets ])
598
599 success = 0.0
600 count = 0.0
601 for i in range(rows):
602 row = self.matrix[i]
603 if self.is_distance:
604 j = row.argmin()
605 else:
606 j = row.argmax()
607 if queries[i] == targets[j]:
608 success += 1
609 count += 1
610
611
612 return success/count
613
614
615
617 table = pv.Table()
618 table['Mean','Value'] = self.matrix.mean()
619
620 table['Min','Value'] = self.matrix.min()
621 table['Max','Value'] = self.matrix.max()
622 return table
623
624
626 '''
627 Returns a string describing the matrix.
628 '''
629 file_type = {True:"Distance",False:"Similarity"}[self.is_distance]
630 return "BEE[file=%s;type=%s]"%(self.shortname,file_type)
631
633 '''An accessor to quickly read matrix data'''
634 return self.matrix.__getitem__(index)
635
637 '''@returns: the number of rows and columns.'''
638 return self.matrix.shape
639
640
def computeMaskMatrix(target_sigset, query_sigset, target_filename, query_filename, symmetric=True):
    '''
    Computes a mask matrix from two sigsets.

    Every cell starts as pv.BEE_NONMATCH.  Cells whose query and target
    share a subject id become pv.BEE_MATCH, and cells whose query and
    target share a recording id (the 'name' of the first presentation)
    become pv.BEE_DONTCARE.

    @param target_sigset: the target sigset to use (one column each).
    @param query_sigset: the query sigset to use (one row each).
    @param target_filename: target sigset file name stored on the result.
    @param query_filename: query sigset file name stored on the result.
    @param symmetric: if true and the sigsets are equal it assumes that the
        matrix is symmetric and marks the lower left triangle as DONTCARE.
    @returns: a bee mask matrix.
    '''
    assert len(target_sigset) > 0
    assert len(query_sigset) > 0

    cols = len(target_sigset)
    rows = len(query_sigset)

    # Targets are laid out as a row vector and queries as a column vector
    # so that "==" broadcasts to a full (rows, cols) comparison.
    target_subid = np.array([each[0] for each in target_sigset]).reshape(1, cols)
    query_subid = np.array([each[0] for each in query_sigset]).reshape(rows, 1)
    target_recid = np.array([each[1][0]['name'] for each in target_sigset]).reshape(1, cols)
    query_recid = np.array([each[1][0]['name'] for each in query_sigset]).reshape(rows, 1)

    # Default every comparison to a non-match.
    mat = np.zeros((rows, cols), dtype=np.byte)
    mat[:, :] = pv.BEE_NONMATCH

    # Same subject id -> match.
    matches = target_subid == query_subid
    mat[matches] = pv.BEE_MATCH

    # Same recording compared with itself -> don't care.
    duplicates = target_recid == query_recid
    mat[duplicates] = pv.BEE_DONTCARE

    # When both sigsets list the same recordings in the same order, the
    # cells below the diagonal are masked out as well.
    if symmetric and rows == cols:
        same_order = (target_recid.flatten() == query_recid.flatten()).sum() == rows
        if same_order:
            r = np.arange(rows).reshape(rows, 1)
            c = np.arange(cols).reshape(1, cols)
            mat[r > c] = pv.BEE_DONTCARE

    return pv.BEEDistanceMatrix(mat, query_filename, target_filename)
691