1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35 import math
36
37
40
42 label_set = set(labels)
43
44
45 sums = {}
46 count = 0.0
47 for each in label_set:
48 sums[each] = 0.0
49
50 for each in labels:
51 sums[each] += 1.0
52 count += 1.0
53
54 ent = 0.0
55 for each in sums.values():
56 p_i = each/count
57 ent -= p_i * lg (p_i)
58 return ent
59
60
61
63 label_set = set(labels)
64
65
66 sums = {}
67 count = 0.0
68 for each in label_set:
69 sums[each] = 0.0
70
71 for each in labels:
72 sums[each] += 1.0
73 count += 1.0
74
75 highVal = 0.0
76 highLab = labels[0]
77 for key,value in sums.iteritems():
78 if value > highVal:
79 highVal = value
80 highLab = key
81 return highLab
82
86
87
89 split = {}
90 for label,values in features:
91 key = values[feature]
92 if not split.has_key(key):
93 split[key] = []
94 split[key].append([label,values])
95
96 return split
97
99
101
102 self.training_data = []
103 self.testing_data = []
104 self.labels = set()
105 self.top = None
106
108 '''Training Data'''
109 self.training_data.append((label,feature))
110 self.labels |= self.labels | set([label])
111
113 '''Testing Data'''
114 self.testing_data.append((label,feature))
115
116
118 '''Train the classifier on the current data'''
119 self.top = Node(self.training_data)
120
122 '''Classify the feature vector'''
123 return self.top.classify(feature)
124
def test(self, data=None):
    '''Classify every labelled example and report the accuracy.

    data is an iterable of (label, feature) pairs; when it is omitted
    (None) the examples stored in self.testing_data are used.

    Prints a "Test: correct/total" summary and returns the fraction of
    examples classified correctly as a float (0.0 when there are no
    examples at all).
    '''
    if data is None:
        data = self.testing_data

    correct = 0
    total = 0
    for label, feature in data:
        # classify() returns a pair; only its first element (the predicted
        # label) is compared against the expected label.
        predicted, _ = self.classify(feature)
        if predicted == label:
            correct += 1
        total += 1

    # The parenthesised single-argument form prints the same text whether
    # print is a statement or a function.
    print("Test: %d/%d" % (correct, total))

    if total == 0:
        # Nothing was tested: avoid dividing by zero.
        return 0.0
    return float(correct) / float(total)
139
140
141
144
145 self.cutoff = 2
146 self.min_entropy = 0.2
147
148 self.feature = None
149 self.entropy = None
150 self.label = None
151 self.children = None
152
153 self.train(features)
154
155 - def train(self,features):
190
191
193 '''Classify the feature vector'''
194
195 if self.feature:
196 val = feature[self.feature]
197 if self.children.has_key(val):
198 return self.children[val].classify(feature)
199 return self.label,None
200
201
202
203
205 result = []
206 for _ in range(bits):
207 result.append(val&1)
208 val = val >> 1
209
210 result.reverse()
211
212 return result
213