1
2
3
4
5
6
7
8
9
10 from nltk_lite.wordnet import *
11 import math
12
13
14
15
16
17
18
19
20
def path_similarity(synset1, synset2, verbose=False):
    """
    Path Distance Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses in the is-a (hypernym/hyponym)
    taxonomy. The score is in the range 0 to 1, except in those cases
    where a path cannot be found (will only be true for verbs as there are
    many distinct verb taxonomies), in which case -1 is returned. A score of
    1 represents identity, i.e. comparing a sense with itself will return 1.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param verbose: Unused; kept for interface consistency with the other
        similarity functions.

    @return: A score denoting the similarity of the two L{Synset}s,
        normally between 0 and 1. -1 is returned if no connecting path
        could be found. 1 is returned if a synset is compared with itself.
    """
    distance = synset1.shortest_path_distance(synset2)
    if distance >= 0:
        # Score decays with path length; identical synsets have distance 0.
        return 1.0 / (distance + 1)
    else:
        # No connecting path exists (distinct taxonomies).
        return -1
45
def lch_similarity(synset1, synset2, verbose=False):
    """
    Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses (as above) and the maximum depth
    of the taxonomy in which the senses occur. The relationship is given as
    -log(p/2d) where p is the shortest path length and d is the taxonomy depth.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param verbose: Unused; kept for interface consistency.

    @return: A score denoting the similarity of the two L{Synset}s,
        normally greater than 0. -1 is returned if no connecting path
        could be found. If a synset is compared with itself, the maximum
        score is returned, which varies depending on the taxonomy depth.

    @raise TypeError: If C{synset1} is neither a noun nor a verb.
    """
    # Fixed maximum depths of the WordNet noun and verb taxonomies.
    taxonomy_depths = {NOUN: 19, VERB: 13}
    if synset1.pos not in taxonomy_depths:
        raise TypeError("Can only calculate similarity for nouns or verbs")
    depth = taxonomy_depths[synset1.pos]

    distance = synset1.shortest_path_distance(synset2)
    if distance >= 0:
        # 2.0 * depth keeps the division in floating point under Python 2.
        return -math.log((distance + 1) / (2.0 * depth))
    else:
        return -1
73
def wup_similarity(synset1, synset2, verbose=False):
    """
    Wu-Palmer Similarity:
    Return a score denoting how similar two word senses are, based on the
    depth of the two senses in the taxonomy and that of their Least Common
    Subsumer (most specific ancestor node). Note that at this time the
    scores given do _not_ always agree with those given by Pedersen's Perl
    implementation of Wordnet Similarity.

    The LCS does not necessarily feature in the shortest path connecting the
    two senses, as it is by definition the common ancestor deepest in the
    taxonomy, not closest to the two senses. Typically, however, it will so
    feature. Where multiple candidates for the LCS exist, that whose
    shortest path to the root node is the longest will be selected. Where
    the LCS has multiple paths to the root, the longer path is used for
    the purposes of the calculation.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param verbose: If True, print debugging information from the LCS search.

    @return: A float score denoting the similarity of the two L{Synset}s,
        normally greater than zero. If no connecting path between the two
        senses can be found, -1 is returned.
    """
    subsumer = _lcs_by_depth(synset1, synset2, verbose)

    # If no least common subsumer was found, there is no connecting path.
    if subsumer is None:
        return -1

    # Depth of the LCS: length of its longest hypernym path to the root.
    depth = subsumer.max_depth()

    # Path length from each synset to the root, routed through the LCS.
    len1 = synset1.shortest_path_distance(subsumer) + depth
    len2 = synset2.shortest_path_distance(subsumer) + depth
    return (2.0 * depth) / (len1 + len2)
113
def res_similarity(synset1, synset2, datafile=""):
    """
    Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node). Note that at this time the scores given do _not_
    always agree with those given by Pedersen's Perl implementation of
    Wordnet Similarity, although they are mostly very similar.

    The required IC values are precomputed and stored in a file, the name
    of which should be passed as the 'datafile' argument. For more
    information on how they are calculated, check brown_ic.py.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param datafile: Path to the precomputed frequency-count data file.

    @return: A float score denoting the similarity of the two L{Synset}s.
        Synsets whose LCS is the root node of the taxonomy will have a
        score of 0 (e.g. N['dog'][0] and N['table'][0]). If no path exists
        between the two synsets, or the part of speech is neither noun nor
        verb, a score of -1 is returned.
    """
    _check_datafile(datafile)

    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    if synset1.pos == NOUN:
        freqs = noun_freqs
    elif synset1.pos == VERB:
        freqs = verb_freqs
    else:
        # Bug fix: previously there was no else-branch, so any other part
        # of speech raised NameError on lcs_ic. Return -1, consistent with
        # jcn_similarity and lin_similarity.
        return -1

    (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, freqs)
    return lcs_ic
148
def jcn_similarity(synset1, synset2, datafile=""):
    """
    Jiang-Conrath Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    Note that at this time the scores given do _not_ always agree with
    those given by Pedersen's Perl implementation of Wordnet Similarity,
    although they are mostly very similar.

    The required IC values are calculated using precomputed frequency
    counts, which are accessed from the 'datafile' file which is supplied
    as an argument. For more information on how they are calculated,
    check brown_ic.py.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param datafile: Path to the precomputed frequency-count data file.

    @return: A float score denoting the similarity of the two L{Synset}s.
        If no path exists between the two synsets a score of -1 is returned.
        Comparing a synset with itself returns C{inf}.
    """
    _check_datafile(datafile)

    # Identical synsets have zero distance, hence infinite similarity.
    if synset1 == synset2:
        return inf

    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    if synset1.pos == NOUN:
        freqs = noun_freqs
    elif synset1.pos == VERB:
        freqs = verb_freqs
    else:
        return -1

    ic1 = synset1.information_content(freqs)
    ic2 = synset2.information_content(freqs)
    (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, freqs)

    # Bug fix: was `ic1 is 0 or ic2 is 0`, an identity test which is always
    # False for float zeros, so the guard never fired. If either synset has
    # zero IC (root node / no frequency data) the distance is undefined.
    if ic1 == 0 or ic2 == 0:
        return 0

    return 1.0 / (ic1 + ic2 - 2 * lcs_ic)
196
def lin_similarity(synset1, synset2, datafile=""):
    """
    Lin Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    Note that at this time the scores given do _not_ always agree with
    those given by Pedersen's Perl implementation of Wordnet Similarity,
    although they are mostly very similar.

    The required IC values are calculated using precomputed frequency
    counts, which are accessed from the 'datafile' file which is supplied
    as an argument. For more information on how they are calculated,
    check brown_ic.py.

    @type  synset1: L{Synset}
    @param synset1: First input synset.
    @type  synset2: L{Synset}
    @param synset2: The synset that C{synset1} is being compared to.
    @param datafile: Path to the precomputed frequency-count data file.

    @return: A float score denoting the similarity of the two L{Synset}s,
        in the range 0 to 1. If no path exists between the two synsets a
        score of -1 is returned.
    """
    _check_datafile(datafile)

    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    # Fixed: part-of-speech comparison now uses equality rather than the
    # fragile `is` identity test.
    if synset1.pos == NOUN:
        freqs = noun_freqs
    elif synset1.pos == VERB:
        freqs = verb_freqs
    else:
        return -1

    ic1 = synset1.information_content(freqs)
    ic2 = synset2.information_content(freqs)
    (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, freqs)

    return (2.0 * lcs_ic) / (ic1 + ic2)
235
236
237
239 """
240 Finds the least common subsumer of two synsets in a Wordnet taxonomy,
241 where the least common subsumer is defined as the ancestor node common
242 to both input synsets whose shortest path to the root node is the longest.
243
244 @type synset1: L{Synset}
245 @param synset1: First input synset.
246 @type synset2: L{Synset}
247 @param synset2: Second input synset.
248 @return: The ancestor synset common to both input synsets which is also the LCS.
249 """
250 subsumer = None
251 max_min_path_length = -1
252
253
254 hypernyms1 = [synset1] + list(synset1.closure(HYPERNYM))
255 hypernyms2 = [synset2] + list(synset2.closure(HYPERNYM))
256 subsumers = [s for s in hypernyms1 if s in hypernyms2]
257
258 if verbose:
259 print "> Subsumers1:", subsumers
260
261
262
263
264 eliminated = set()
265 for s1 in subsumers:
266 for s2 in subsumers:
267 if s2 in s1.closure(HYPERNYM):
268 eliminated.add(s2)
269 if verbose:
270 print "> Eliminated:", eliminated
271
272 subsumers = [s for s in subsumers if s not in eliminated]
273
274 if verbose:
275 print "> Subsumers2:", subsumers
276
277
278
279
280 for candidate in subsumers:
281
282 paths_to_root = candidate.hypernym_paths()
283 min_path_length = -1
284
285 for path in paths_to_root:
286 if min_path_length < 0 or len(path) < min_path_length:
287 min_path_length = len(path)
288
289 if min_path_length > max_min_path_length:
290 max_min_path_length = min_path_length
291 subsumer = candidate
292
293 if verbose:
294 print "> LCS Subsumer by depth:", subsumer
295 return subsumer
296
297 -def _lcs_by_content(synset1, synset2, freqs, verbose=False):
298 """
299 Get the least common subsumer of the two input synsets, where the least
300 common subsumer is defined as the ancestor synset common to both input
301 synsets which has the highest information content (IC) value. The IC
302 value is calculated by extracting the probability of an ancestor synset
303 from a precomputed set.
304
305 @type synset1: L{Synset}
306 @param synset1: First input synset.
307
308 @type synset2: L{Synset}
309 @param synset2: Second input synset.
310
311 @return: The ancestor synset common to both input synsets which is also
312 the LCS.
313 """
314 subsumer = None
315 subsumer_ic = -1
316
317
318
319 subsumers = set()
320 for s1 in synset1.closure(HYPERNYM):
321 for s2 in synset2.closure(HYPERNYM):
322 if s1 == s2:
323 subsumers.add(s1)
324 subsumers.add(synset1)
325 subsumers.add(synset2)
326
327
328
329 for candidate in subsumers:
330 ic = candidate.information_content(freqs)
331 if (subsumer == None and ic > 0) or ic > subsumer_ic:
332 subsumer = candidate
333 subsumer_ic = ic
334
335 if verbose:
336 print "> LCS Subsumer by content:", subsumer, subsumer_ic
337
338 return (subsumer, subsumer_ic)
339