1
2
3
4
5
6
7
8
9
10 """
11 Cosine Classifier -- Beta version
12 """
13
14 from math import sqrt, pow
15 from nltk_lite.probability import *
16 from nltk_lite.contrib.classify import *
17
class Cosine(AbstractClassify):
    """
    Classify a sample by its cosine similarity to each trained class.

    Every class, and every sample, is represented as a vector of
    feature-value counts.  The score of class C for sample S is the cosine
    of the angle between the two vectors::

                            C . S
        D(C|S) = ---------------------------
                  sqrt(C . C) * sqrt(S . S)

    A cosine classifier needs to be trained with a representative example
    of each class.  From these examples the classifier calculates how
    closely the sample resembles each class.

    Internal data structures:
        _feature_detector:
            holds the feature detector function
        _classes:
            holds a list of the classes supplied during training
        _cls_freq_dist:
            holds a dictionary of frequency distributions (FreqDist, from
            nltk_lite.probability); the dictionary is indexed by
            (class name, feature type) and each frequency distribution is
            indexed by feature value
    """

    def __init__(self, feature_detector):
        """
        @param feature_detector: feature detector produced function, which
            takes a sample of the object to be classified (eg: string or
            list of words) and returns a list of tuples
            (feature_type_name, list of values of this feature type)
        """
        self._feature_detector = feature_detector
        # Start out untrained, so that repr() and classification work (and
        # simply report no classes) before train() has been called.
        self._classes = []
        self._cls_freq_dist = {}

    def train(self, gold):
        """
        Train classifier using representative examples of classes;
        creates frequency distributions of these classes.  Any earlier
        training is discarded.

        @param gold: dictionary mapping class names to representative examples
        """
        self._classes = []
        self._cls_freq_dist = {}
        for cls in gold:
            self._classes.append(cls)
            for fname, dist in self._count_features(gold[cls]).items():
                self._cls_freq_dist[cls, fname] = dist

    def get_class_dict(self, sample):
        """
        @type sample: (any)
        @param sample: sample to be classified
        @return: Dictionary mapping each class name to its cosine
            similarity score
        """
        return self._cosine(sample)

    def _count_features(self, item):
        """
        Run the feature detector over an item and count the detected values.

        @param item: training example or sample to extract features from
        @return: Dictionary mapping each feature type name to a FreqDist of
            the values detected for that feature type
        """
        dists = {}
        for (fname, fvals) in self._feature_detector(item):
            # A feature type reported more than once by the detector
            # restarts from an empty distribution.
            dist = dists[fname] = FreqDist()
            for fval in fvals:
                dist.inc(fval)
        return dists

    def _cosine(self, sample):
        """
        @param sample: sample to be classified
        @return: Dictionary mapping each class name to its score; the score
            is 0 when either the sample vector or the class vector is empty

        The sample is turned into frequency distributions, and the cosine
        similarity is computed between each class's distributions and the
        sample's distributions.
        """
        sample_dist = self._count_features(sample)

        # Squared length of the sample vector; it is the same for every class.
        sample_vector_len = 0
        for dist in sample_dist.values():
            for fval in dist.samples():
                sample_vector_len += dist.count(fval) ** 2

        score = {}
        for cls in self._classes:
            dot_prod = 0
            cls_vector_len = 0
            # Only the feature types present in the sample are considered.
            for fname, dist in sample_dist.items():
                cls_dist = self._cls_freq_dist[cls, fname]
                for fval in dist.samples():
                    # count() reports 0 for a value the class never saw (see
                    # FreqDist), so no membership scan of samples() is needed.
                    dot_prod += dist.count(fval) * cls_dist.count(fval)
                for fval in cls_dist.samples():
                    cls_vector_len += cls_dist.count(fval) ** 2

            if sample_vector_len == 0 or cls_vector_len == 0:
                # An empty vector has no direction; avoid dividing by zero.
                score[cls] = 0
            else:
                score[cls] = float(dot_prod) / (sqrt(sample_vector_len) * sqrt(cls_vector_len))

        return score

    def __repr__(self):
        """
        @return: Short description giving the number of trained classes
        """
        return '<CosineClassifier: classes=%d>' % len(self._classes)
128
129
130
131
132
166
167
168
169
def demo():
    """
    Train on two one-string classes using character-pair ("2-tup")
    features, classify the string "aaababb" and print each class's score.
    """
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})

    classifier = classify.Cosine(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("aaababb")

    for cls in result:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("%s : %s" % (cls, result[cls]))

    # expected values:
    #   class a: 'aa' = 5
    #            'ab' = 1
    #            vector = 5^2 + 1^2 = 26
    #         b: 'bb' = 5
    #            'ba' = 1
    #            vector = 5^2 + 1^2 = 26
    #   sample:  'aa' = 2
    #            'ab' = 2
    #            'ba' = 1
    #            'bb' = 1
    #            vector = 2^2 + 2^2 + 1^2 + 1^2 = 10
    #
    #   dot_prod a: 5*2 + 1*2 = 12
    #            b: 5*1 + 1*1 = 6
    #
    #   score a: 12 / (sqrt(26) * sqrt(10)) = 0.74~
    #   score b:  6 / (sqrt(26) * sqrt(10)) = 0.37~
205
206
207
def demo2():
    """
    Train on two one-string classes using both single-character ("1-tup")
    and character-pair ("2-tup") features, classify the string "aaababb"
    and print each class's score.
    """
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"1-tup": lambda t: list(t),
                         "2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})

    classifier = classify.Cosine(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("aaababb")

    for cls in result:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("%s : %s" % (cls, result[cls]))

    # expected values:
    #   class a: 'a'  = 6
    #            'b'  = 1
    #            'aa' = 5
    #            'ab' = 1
    #            vector = 6^2 + 5^2 + 1 + 1 = 63
    #         b: 'a'  = 1
    #            'b'  = 6
    #            'bb' = 5
    #            'ba' = 1
    #            vector = 6^2 + 5^2 + 1 + 1 = 63
    #   sample:  'a'  = 4
    #            'b'  = 3
    #            'aa' = 2
    #            'ab' = 2
    #            'ba' = 1
    #            'bb' = 1
    #            vector = 4^2 + 3^2 + 2^2 + 2^2 + 1 + 1 = 35
    #
    #   dot_prod a: 4*6 + 3*1 + 2*5 + 2*1 = 39
    #            b: 4*1 + 3*6 + 1*5 + 1*1 = 28
    #
    #   score a: 39 / (sqrt(63) * sqrt(35)) = 0.83~
    #   score b: 28 / (sqrt(63) * sqrt(35)) = 0.59~
251
252
def demo3():
    """
    Train on the first 400 items of the English (KJV), French and Finnish
    Genesis corpora, classify items 150-200 of the English text and print
    the value reported for each language.

    Features are character pairs of the space-joined items ("2-tup") and
    the items themselves ("words").
    """
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    from nltk_lite.corpora import genesis
    from itertools import islice

    def char_pairs(t):
        # Join once; re-joining the whole text for every index is quadratic.
        text = ' '.join(t)
        return [text[n:n+2] for n in range(len(text)-1)]

    fd = detect.feature({"2-tup": char_pairs,
                         "words": lambda t: t})

    classifier = classify.Cosine(fd)
    training_data = {}
    training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400))
    training_data["french"] = list(islice(genesis.raw("french"), 0, 400))
    training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400))

    classifier.train(training_data)

    # NOTE(review): get_class_probs() is not defined in this file; presumably
    # it is inherited from AbstractClassify -- confirm.
    result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200)))

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("english-kjv : %s" % result.prob('english-kjv'))
    print("french : %s" % result.prob('french'))
    print("finnish : %s" % result.prob('finnish'))
276
277
# When executed as a script (rather than imported), run the second demo.
if '__main__' == __name__:
    demo2()
280