pyvision.analysis.bee

234

def __init__(self, *args, **kwargs):
    '''
    Creates a BEE distance matrix.

    The type of the first positional argument selects the loader: a
    string is treated as a filename and forwarded to loadFile(), a numpy
    array is forwarded to loadMatrix().  All positional and keyword
    arguments are passed through to the selected loader unchanged.

    @raise TypeError: if no positional argument is given, or if the first
        argument is neither a string nor a numpy array.
    '''
    if not args:
        # Without this guard args[0] fails with an opaque IndexError.
        raise TypeError("Cannot create a BEEDistanceMatrix without a filename or a numpy array.")

    source = args[0]
    if isinstance(source, str):
        self.loadFile(*args, **kwargs)
    elif isinstance(source, np.ndarray):
        self.loadMatrix(*args, **kwargs)
    else:
        raise TypeError("Cannot create a BEEDistanceMatrix from an object of type: %s"%type(source))

247

def loadFile(self, filename, sigset_dir=None):
    '''
    Loads a BEE matrix from a file.

    The file starts with a four line header, each line terminated by a
    single LF (0x0A):
      1. the format code: D2, S2 or M2
      2. the target sigset filename
      3. the query sigset filename
      4. "MF" or "MB", n_queries, n_targets and a 4 byte magic number
    The rest of the file is the raw matrix data: float32 for "MF" and
    bytes for "MB".

    @param filename: path of the matrix file to read.
    @param sigset_dir: directory containing the sigset files.  Defaults
        to the directory of filename.
    @raise ValueError: if the first line is malformed, the format code is
        unknown, or the magic number is not recognized.
    @raise TypeError: if the storage type on line 4 is neither F nor B.
    '''
    self.filename = filename
    self.shortname = os.path.basename(filename)

    def _text(raw):
        # Header fields are read in binary mode.  Return them as native
        # strings so the comparisons below work whether the file yields
        # str (Python 2) or bytes (Python 3).
        if isinstance(raw, str):
            return raw
        return raw.decode('latin-1')

    # The context manager closes the file even if the header is invalid.
    with open(filename, 'rb') as f:
        # read the distance matrix header (first four lines of the file)
        line = f.readline()

        # Test line endings
        if len(line) != 3 or line[-1:] != b"\x0a":
            # Note: \x0a is the "official" line ending char.
            # \x0d is also supported in the Java and C++ tools but it will
            # cause a failure in this implementation.
            # see IARPA BEST - Challenge Problem Specification and
            # Executable Application Program Interface
            # thanks to Todd Scruggs
            raise ValueError("Unsupported line ending. Should two characters followed by LF (0x0A).")

        # Check Format
        format_code = _text(line.strip())
        if format_code not in ['D2','S2','M2']:
            raise ValueError('Unknown matrix Format "%s". Should be D2, S2, or M2.'%format_code)

        # Only S2 is flagged as a similarity matrix.
        self.is_distance = format_code[0] != 'S'

        # read and process line 2 (target sigset)
        fields = f.readline().split()
        self.target_filename = os.path.basename(_text(fields[0]))

        # read and process line 3 (query sigset)
        fields = f.readline().split()
        self.query_filename = os.path.basename(_text(fields[0]))

        # read and process line 4 (MF n_queries n_targets magic_number)
        fields = f.readline().split()
        storage = _text(fields[0])
        assert storage in ['MF','MB']
        file_type = storage[1]

        self.n_queries = int(_text(fields[1]))
        self.n_targets = int(_text(fields[2]))

        magic = fields[3]
        big_endian = struct.pack(">I",0x12345678)
        little_endian = struct.pack("<I",0x12345678)

        if magic != big_endian and magic != little_endian:
            print("Warning unsupported magic number is BEE matrix: 0x%s"%_text(binascii.hexlify(magic)))

        # Interpreted in native byte order: reading back 0x78563412 means
        # the file was written with the opposite endianness.
        self.magic_number = struct.unpack_from("=I",magic)[0]
        if self.magic_number == 0x12345678:
            byteswap = False
        elif self.magic_number == 0x78563412:
            byteswap = True
        else:
            raise ValueError("Unknown magic number in similarity matrix.")

        # Read the matrix data
        if file_type == 'F':
            self.matrix = np.fromfile(f,dtype=np.float32)
        elif file_type == 'B':
            self.matrix = np.fromfile(f,dtype=np.byte)
        else:
            raise TypeError("Unknown matrix file_type: %s"%file_type)

    # Single byte entries have no byte order, so only floats are swapped.
    if file_type == 'F' and byteswap:
        self.matrix = self.matrix.byteswap()
    assert self.matrix.shape[0] == self.n_targets*self.n_queries
    self.matrix = self.matrix.reshape(self.n_queries,self.n_targets)

    # Try to read the sigsets.  This is best effort: a sigset that cannot
    # be read leaves the attribute as None and no warning is printed.
    if sigset_dir is None:
        sigset_dir = os.path.dirname(self.filename)

    self.queries = None
    try:
        ss_name = os.path.join(sigset_dir,self.query_filename)
        self.queries = parseSigSet(ss_name)
        assert len(self.queries) == self.n_queries
    except Exception:
        pass

    self.targets = None
    try:
        ss_name = os.path.join(sigset_dir,self.target_filename)
        self.targets = parseSigSet(ss_name)
        assert len(self.targets) == self.n_targets
    except Exception:
        pass

def loadMatrix(self, mat, query_filename, target_filename, sigset_dir=None, is_distance=True):
    '''
    Creates a bee matrix from a numpy array.

    @param mat: two dimensional numpy array with one row per query and
        one column per target.  Arrays that are not of dtype np.byte are
        converted to np.float32.
    @param query_filename: name of the query sigset file.
    @param target_filename: name of the target sigset file.
    @param sigset_dir: directory containing the sigset files.  When None
        no sigsets are read and queries/targets stay None.
    @param is_distance: True for a distance matrix, False for similarity.
    '''
    self.shortname = None

    # Byte matrices are kept as they are; everything else is stored as
    # float32.
    if mat.dtype != np.byte:
        mat = mat.astype(np.float32)

    # select distance or similarity
    self.is_distance = is_distance

    # These mirror the header fields that loadFile reads from disk.
    self.target_filename = target_filename
    self.query_filename = query_filename
    self.n_queries = mat.shape[0]
    self.n_targets = mat.shape[1]
    self.magic_number = 0x12345678

    self.matrix = mat

    def _warn(kind, ss_name, expected, sigset):
        # The sigset is still None when parsing itself failed, so its
        # length cannot be taken unconditionally.
        n_read = 0
        if sigset is not None:
            n_read = len(sigset)
        print("Warning: cound not read the %s sigset for distance matrix"%kind)
        print(" SigSet File: %s"%(ss_name,))
        print(" Expected: %s Read: %s"%(expected,n_read))

    # Try to read the sigsets.  Failures are reported but never fatal.
    self.queries = None
    self.targets = None
    if sigset_dir is not None:
        ss_name = os.path.join(sigset_dir,self.query_filename)
        try:
            self.queries = parseSigSet(ss_name)
            assert len(self.queries) == self.n_queries
        except Exception:
            _warn("query", ss_name, self.n_queries, self.queries)

        ss_name = os.path.join(sigset_dir,self.target_filename)
        try:
            self.targets = parseSigSet(ss_name)
            assert len(self.targets) == self.n_targets
        except Exception:
            _warn("target", ss_name, self.n_targets, self.targets)

396 397

def cohort_norm(self):
    '''
    Normalizes every row of the matrix in place by subtracting the row
    mean and dividing by the row standard deviation.

    A row whose standard deviation is zero (all scores equal, or a single
    column) is only centered, which leaves it all zeros instead of
    dividing by zero.
    '''
    for i in range(self.matrix.shape[0]):
        row = self.matrix[i,:]
        mn = row.mean()
        sd = row.std()
        if sd == 0:
            # Avoid 0/0: the centered row is already all zeros.
            sd = 1.0
        self.matrix[i,:] = (row-mn)/sd

404 405

def getMatchScores(self, mask=None):
    '''
    Collects the scores of all matching query/target pairs.

    @param mask: optional mask matrix.  When given, the entries of each
        row where mask.matrix equals BEE_MATCH are selected.  When None,
        an entry is selected if the name of its query equals the name of
        its target, where the name is the first item of a sigset entry.
    @returns: a one dimensional float32 array of the selected scores in
        row order.
    @raise ValueError: if there is no mask and the query or target
        sigsets are not available.
    '''
    n_rows = self.matrix.shape[0]

    if mask is None and n_rows > 0:
        if self.queries is None or self.targets is None:
            # Previously this surfaced as an UnboundLocalError in the loop.
            raise ValueError("Match scores require either a mask or both the query and target sigsets.")

    if mask is None and self.queries is not None and self.targets is not None:
        queries = np.array([ name for name,_ in self.queries ])
        targets = np.array([ name for name,_ in self.targets ])

    matches = []
    for i in range(n_rows):
        if mask is not None:
            matches.append(self.matrix[i,mask.matrix[i,:] == BEE_MATCH])
        else:
            matches.append(self.matrix[i,queries[i] == targets])

    if not matches:
        return np.zeros(shape=(0,),dtype=np.float32)
    return np.concatenate(matches).astype(np.float32)

432 433

def getMatchScoresBySubject(self, mask=None):
    '''
    Groups the match scores by name.

    For every distinct query name, the scores at the rows whose query has
    that name and the columns whose target has that name are collected.
    The name is the first item of a sigset entry.

    @param mask: optional mask matrix.  When given, only the collected
        entries where mask.matrix equals BEE_MATCH are kept.
    @returns: a dictionary mapping a name to a flat array of its scores.
        Names without any score are left out.
    '''
    assert self.queries is not None
    assert self.targets is not None

    queries = np.array([ name for name,_ in self.queries ])
    targets = np.array([ name for name,_ in self.targets ])

    matches = {}
    for name in set(queries):
        rows = np.nonzero(name == queries)[0]
        cols = np.nonzero(name == targets)[0]
        scores = self.matrix[rows][:,cols].flatten()

        if mask is not None:
            keep = (mask.matrix[rows][:,cols] == BEE_MATCH).flatten()
            scores = scores[keep]

        # Only names that actually have scores are reported.
        if len(scores) > 0:
            matches[name] = scores

    return matches

459 460

def getNonMatchScores(self, mask=None):
    '''
    Collects the scores of all non-matching query/target pairs.

    @param mask: optional mask matrix.  When given, the entries of each
        row where mask.matrix equals BEE_NONMATCH are selected.  When
        None, an entry is selected if the name of its query differs from
        the name of its target, where the name is the first item of a
        sigset entry.
    @returns: a one dimensional float32 array of the selected scores in
        row order.
    @raise ValueError: if there is no mask and the query or target
        sigsets are not available.
    '''
    n_rows = self.matrix.shape[0]

    if mask is None and n_rows > 0:
        if self.queries is None or self.targets is None:
            # Previously this surfaced as an UnboundLocalError in the loop.
            raise ValueError("Non-match scores require either a mask or both the query and target sigsets.")

    if mask is None and self.queries is not None and self.targets is not None:
        queries = np.array([ name for name,_ in self.queries ])
        targets = np.array([ name for name,_ in self.targets ])

    nonmatches = []
    for i in range(n_rows):
        if mask is not None:
            nonmatches.append(self.matrix[i,mask.matrix[i,:] == BEE_NONMATCH])
        else:
            nonmatches.append(self.matrix[i,queries[i] != targets])

    if not nonmatches:
        return np.zeros(shape=(0,),dtype=np.float32)
    return np.concatenate(nonmatches).astype(np.float32)

486

def asFlatArray(self, mask=None):
    '''
    query,target,score,type

    Flattens the matrix into one record per entry, in row order.

    @param mask: optional object indexed as mask[i,j] that gives the code
        of each entry.  A code found in BEE_CODE_MAP is replaced by the
        mapped value, any other code is formatted as a hex string.  When
        None the type column is filled with None.
    @returns: an object array of shape (rows*cols, 4) with the columns
        query index, target index, score and type.
    '''
    r,c = self.matrix.shape
    result = np.zeros((r*c,4),dtype=object)
    for i in range(r):
        for j in range(c):
            k = c*i+j
            result[k,0] = i
            result[k,1] = j
            result[k,2] = self.matrix[i,j]

            if mask is None:
                # mask is optional in the signature, so it cannot be
                # indexed unconditionally.
                result[k,3] = None
                continue

            code = mask[i,j]
            if code in BEE_CODE_MAP:
                result[k,3] = BEE_CODE_MAP[code]
            else:
                result[k,3] = "0x%02x"%code
    return result

501 502 503 504 505

def printInfo(self):
    '''
    Prints a summary of the matrix header and shape to standard output.

    The filename is only set by loadFile, so it is printed as None for a
    matrix that was created from a numpy array.
    '''
    filename = getattr(self, 'filename', None)
    lines = [
        "BEEDistanceMatrix: %s"%(filename,),
        " is_distance : %s"%(self.is_distance,),
        " target_filename : %s"%(self.target_filename,),
        " query_filename : %s"%(self.query_filename,),
        " n_queries : %s"%(self.n_queries,),
        " n_targets : %s"%(self.n_targets,),
        " <total size> : %s"%(self.n_targets*self.n_queries,),
        " magic_number : %x"%self.magic_number,
        " matrix.shape : %s"%(self.matrix.shape,),
    ]
    for line in lines:
        print(line)

516

def write(self, filename):
    '''
    Writes the matrix to filename by forwarding the call to save().
    The result of save() is not returned.
    '''
    writer = self.save
    writer(filename)

519

def save(self, filename):
    '''
    Writes the BEE distance matrix to file. WARNING: DOES NOT HANDLE MASK MATRICES CORRECTLY!

    The format is selected by the extension of filename:
      - ".mtx" writes a BEE formatted matrix using saveBeeFormat().
      - ".mat" writes a matlab file that stores the matrix under the name
        "dist_matrix" for a distance matrix and "sim_matrix" otherwise.

    @param filename: path of the file to write.
    @raise NotImplementedError: if the extension is not supported.
    '''
    if filename.endswith('.mtx'):
        # save a BEE formated matrix
        self.saveBeeFormat(filename)
    elif filename.endswith('.mat'):
        # save a matlab formated matrix
        matrix_name = 'dist_matrix' if self.is_distance else 'sim_matrix'
        spio.savemat(filename, {matrix_name:self.matrix})
    else:
        # The exception used to be returned, which silently wrote nothing.
        raise NotImplementedError("Unsupported matrix format for filename %s"%filename)

536

def saveBeeFormat(self, filename):
    '''
    Writes the matrix to filename in the BEE format.

    The header uses the format code M2 and the storage type MB for a
    matrix of dtype np.byte.  Any other matrix is written as float32 with
    the storage type MF and the format code D2 for a distance matrix or
    S2 for a similarity matrix.  The magic number and the data are
    written in native byte order.  An existing file is overwritten.

    @param filename: path of the file to write.
    '''
    def _to_bytes(text):
        # The file is opened in binary mode, so header text has to be
        # bytes.  Byte strings are passed through untouched.
        if isinstance(text, bytes):
            return text
        return text.encode('utf-8')

    # select the file_type and lay the data out the way loadFile reads it
    if self.matrix.dtype == np.byte:
        file_type = 'M'
        storage = 'MB'
        data = np.ascontiguousarray(self.matrix)
    else:
        file_type = 'D' if self.is_distance else 'S'
        storage = 'MF'
        # An MF header promises float32 entries in row order.
        data = np.ascontiguousarray(self.matrix,dtype=np.float32)

    magic_number = struct.pack('=I',0x12345678)
    assert len(magic_number) == 4 # Bug fix: verify the magic number is really 4 bytes

    header = b"".join([
        # line 1 : file_type and version
        _to_bytes(file_type), b"2\x0a",
        # lines 2 and 3 (target and query sigsets)
        _to_bytes(self.target_filename), b"\x0a",
        _to_bytes(self.query_filename), b"\x0a",
        # line 4 (MF n_queries n_targets magic_number)
        _to_bytes("%s %d %d "%(storage,self.n_queries,self.n_targets)),
        magic_number, b"\x0a",
    ])

    # The context manager closes the file even if a write fails.
    with open(filename, "wb") as f:
        f.write(header)
        f.write(data)

568

def histogram(self, value_range=None, bins=100, normed=True, mask=None):
    '''
    Computes histograms of the match and the non-match scores over the
    same bins.

    @param value_range: (min, max) range of the histogram.  Defaults to
        the minimum and maximum of the whole matrix.
    @param bins: passed to np.histogram as the bins argument.
    @param normed: passed to np.histogram as the density argument.
    @param mask: optional mask passed on to getMatchScores() and
        getNonMatchScores().
    @returns: a pv.Table with one row per bin and the columns min,
        center, max, match_count and nonmatch_count.
    '''
    match_scores = self.getMatchScores(mask=mask)
    nonmatch_scores = self.getNonMatchScores(mask=mask)
    if value_range is None:
        value_range = (self.matrix.min(),self.matrix.max())

    # The "normed" keyword was removed from np.histogram; "density" is
    # its replacement.
    match_counts,_ = np.histogram(match_scores,range=value_range,bins=bins,density=normed)
    nonmatch_counts,vals = np.histogram(nonmatch_scores,range=value_range,bins=bins,density=normed)

    # vals holds the bin edges, so bin i spans vals[i] to vals[i+1].
    hist = pv.Table()
    for i in range(len(match_counts)):
        hist[i,'min'] = vals[i]
        hist[i,'center'] = 0.5*(vals[i]+vals[i+1])
        hist[i,'max'] = vals[i+1]
        hist[i,'match_count'] = match_counts[i]
        hist[i,'nonmatch_count'] = nonmatch_counts[i]
    return hist

586 587

def getROC(self, mask=None):
    '''
    Builds a roc.ROC from the match and non-match scores of this matrix.

    @param mask: optional mask passed on to getNonMatchScores() and
        getMatchScores().
    @returns: the roc.ROC object.
    '''
    impostor_scores = self.getNonMatchScores(mask=mask)
    genuine_scores = self.getMatchScores(mask=mask)
    curve = roc.ROC(genuine_scores, impostor_scores, is_distance=self.is_distance)
    return curve

592

def getRank1(self, mask=None):
    '''
    Computes the fraction of queries whose best scoring target has the
    same name as the query.  The best score of a row is the smallest one
    for a distance matrix and the largest one for a similarity matrix.
    The name is the first item of a sigset entry.

    @param mask: accepted but not used by this method.
    @returns: the fraction as a float, or 0.0 for a matrix without rows.
    '''
    rows,_ = self.matrix.shape
    if rows == 0:
        # There is nothing to rank; avoids dividing by zero below.
        return 0.0

    queries = np.array([ name for name,_ in self.queries ])
    targets = np.array([ name for name,_ in self.targets ])

    # Column of the best score in every row.
    if self.is_distance:
        best = self.matrix.argmin(axis=1)
    else:
        best = self.matrix.argmax(axis=1)

    hits = (queries[np.arange(rows)] == targets[best]).sum()
    return float(hits)/rows

613 614 615

def stats(self):
    '''
    Summarizes the matrix.

    @returns: a pv.Table with the rows Mean, Min and Max and the single
        column Value.
    '''
    table = pv.Table()
    # The standard deviation is left out: it is not computed efficiently
    # (self.matrix.flatten().std()).
    summary = (
        ('Mean', self.matrix.mean()),
        ('Min', self.matrix.min()),
        ('Max', self.matrix.max()),
    )
    for label, value in summary:
        table[label,'Value'] = value
    return table

623 624

def __str__(self):
    '''
    Returns a string describing the matrix: its short name and whether it
    holds distances or similarities.
    '''
    labels = {True:"Distance",False:"Similarity"}
    kind = labels[self.is_distance]
    return "BEE[file=%s;type=%s]"%(self.shortname,kind)

631

def __getitem__(self, index):
    '''An accessor to quickly read matrix data: indexes self.matrix with index.'''
    return self.matrix[index]

635

def shape(self):
    '''@returns: the shape of the matrix, the number of rows and columns.'''
    dims = self.matrix.shape
    return dims

Source Code for Module pyvision.analysis.bee