Package nltk_lite :: Package wordnet :: Module similarity
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.wordnet.similarity

  1  # Natural Language Toolkit: Wordnet Similarity 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Oliver Steele <steele@osteele.com> 
  5  #         David Ormiston Smith <daosmith@csse.unimelb.edu.au> 
  6  #         Steven Bird <sb@csse.unimelb.edu.au> 
  7  # URL: <http://nltk.sf.net> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  from nltk_lite.wordnet import * 
 11  import math 
 12   
 13  # Similarity metrics 
 14   
 15  # TODO: Add in the option to manually add a new root node; this will be 
 16  # useful for verb similarity as there exist multiple verb taxonomies. 
 17   
 18  # More information about the metrics is available at 
 19  # http://marimba.d.umn.edu/similarity/measures.html 
 20   
def path_similarity(synset1, synset2, verbose=False):
    """
    Path Distance Similarity:
    Score two word senses by the shortest path linking them in the is-a
    (hypernym/hyponym) taxonomy. Scores normally lie between 0 and 1;
    comparing a sense with itself yields 1. When no connecting path
    exists (only possible for verbs, which occupy several disjoint
    taxonomies), -1 is returned instead.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.

    @return: A score denoting the similarity of the two L{Sense}s,
        normally between 0 and 1. -1 is returned if no connecting path
        could be found. 1 is returned if a L{Sense} is compared with
        itself.
    """
    path_length = synset1.shortest_path_distance(synset2)
    # A negative length signals "no path"; otherwise similarity decays
    # with distance, reaching exactly 1 at distance 0 (identity).
    if path_length < 0:
        return -1
    return 1.0 / (path_length + 1)
45
def lch_similarity(synset1, synset2, verbose=False):
    """
    Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses (as above) and the maximum depth
    of the taxonomy in which the senses occur. The relationship is given as
    -log(p/2d) where p is the shortest path length and d is the taxonomy depth.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.

    @return: A score denoting the similarity of the two L{Sense}s,
        normally greater than 0. -1 is returned if no connecting path
        could be found. If a L{Synset} is compared with itself, the
        maximum score is returned, which varies depending on the taxonomy
        depth.
    @raise TypeError: If synset1's part of speech is neither noun nor verb.
    """
    # Fixed maximum depths of the WordNet noun and verb taxonomies.
    taxonomy_depths = {NOUN: 19, VERB: 13}
    if synset1.pos not in taxonomy_depths:
        raise TypeError("Can only calculate similarity for nouns or verbs")
    depth = taxonomy_depths[synset1.pos]

    distance = synset1.shortest_path_distance(synset2)
    if distance >= 0:
        # 2.0 forces float division so the log argument stays in (0, 1].
        return -math.log((distance + 1) / (2.0 * depth))
    else:
        return -1
73
def wup_similarity(synset1, synset2, verbose=False):
    """
    Wu-Palmer Similarity:
    Return a score denoting how similar two word senses are, based on the
    depth of the two senses in the taxonomy and that of their Least Common
    Subsumer (most specific ancestor node). Note that at this time the
    scores given do _not_ always agree with those given by Pedersen's Perl
    implementation of Wordnet Similarity.

    The LCS does not necessarily feature in the shortest path connecting the
    two senses, as it is by definition the common ancestor deepest in the
    taxonomy, not closest to the two senses. Typically, however, it will so
    feature. Where multiple candidates for the LCS exist, that whose
    shortest path to the root node is the longest will be selected. Where
    the LCS has multiple paths to the root, the longer path is used for
    the purposes of the calculation.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.
    @return: A float score denoting the similarity of the two L{Sense}s,
        normally greater than zero. If no connecting path between the two
        senses can be found, -1 is returned.
    """
    subsumer = _lcs_by_depth(synset1, synset2, verbose)

    # If no LCS was found the senses are unconnected.
    if subsumer is None:
        return -1

    # Get the longest path from the LCS to the root.
    depth = subsumer.max_depth()

    # Get the shortest path from the LCS to each of the synsets it is
    # subsuming. Add this to the LCS path length to get the path length
    # from each synset to the root.
    len1 = synset1.shortest_path_distance(subsumer) + depth
    len2 = synset2.shortest_path_distance(subsumer) + depth
    return (2.0 * depth) / (len1 + len2)
113
def res_similarity(synset1, synset2, datafile="", verbose=False):
    """
    Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node). Note that at this time the scores given do _not_
    always agree with those given by Pedersen's Perl implementation of
    Wordnet Similarity, although they are mostly very similar.

    The required IC values are precomputed and stored in a file, the name
    of which should be passed as the 'datafile' argument. For more
    information on how they are calculated, check brown_ic.py.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.
    @return: A float score denoting the similarity of the two L{Sense}s.
        Synsets whose LCS is the root node of the taxonomy will have a
        score of 0 (e.g. N['dog'][0] and N['table'][0]). If no path exists
        between the two synsets a score of -1 is returned. -1 is also
        returned if the part of speech is neither noun nor verb.
    """
    _check_datafile(datafile)

    # TODO: Once this data has been loaded for the first time preserve it
    # in memory in some way to prevent unnecessary recomputation.
    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    if synset1.pos is NOUN:
        (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, noun_freqs)
    elif synset1.pos is VERB:
        (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, verb_freqs)
    else:
        # Fix: previously fell through and raised NameError on lcs_ic.
        # Return -1 for unsupported parts of speech, consistent with
        # jcn_similarity and lin_similarity.
        return -1

    return lcs_ic
148
def jcn_similarity(synset1, synset2, datafile="", verbose=False):
    """
    Jiang-Conrath Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    Note that at this time the scores given do _not_ always agree with
    those given by Pedersen's Perl implementation of Wordnet Similarity,
    although they are mostly very similar.

    The required IC values are calculated using precomputed frequency
    counts, which are accessed from the 'datafile' file which is supplied
    as an argument. For more information on how they are calculated,
    check brown_ic.py.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.
    @return: A float score denoting the similarity of the two L{Sense}s.
        If no path exists between the two synsets a score of -1 is returned.
    """
    _check_datafile(datafile)

    # Identical synsets have zero IC difference: infinite similarity.
    if synset1 == synset2:
        return inf

    # TODO: Once this data has been loaded for the first time preserve it
    # in memory in some way to prevent unnecessary recomputation.
    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    # Get the correct frequency dict as dependent on the input synsets'
    # pos (Part of Speech) attribute.
    if synset1.pos is NOUN:
        freqs = noun_freqs
    elif synset1.pos is VERB:
        freqs = verb_freqs
    else:
        return -1

    ic1 = synset1.information_content(freqs)
    ic2 = synset2.information_content(freqs)
    (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, freqs)

    # If either of the input synsets are the root synset, or have a
    # frequency of 0 (sparse data problem), return 0.
    # Fix: use == rather than 'is' -- identity tests on ints are an
    # implementation detail of CPython's small-int caching.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_difference = ic1 + ic2 - 2 * lcs_ic
    # Fix: a zero difference (distinct synsets with identical IC to their
    # LCS) previously raised ZeroDivisionError; treat it like the
    # identical-synset case above.
    if ic_difference == 0:
        return inf

    # 1.0 forces float division (the original '1 / ...' truncates under
    # Python 2 if the ICs happen to be ints).
    return 1.0 / ic_difference
196
def lin_similarity(synset1, synset2, datafile="", verbose=False):
    """
    Lin Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    Note that at this time the scores given do _not_ always agree with
    those given by Pedersen's Perl implementation of Wordnet Similarity,
    although they are mostly very similar.

    The required IC values are calculated using precomputed frequency
    counts, which are accessed from the 'datafile' file which is supplied
    as an argument. For more information on how they are calculated,
    check brown_ic.py.

    @type synset2: L{Sense}
    @param synset2: The L{Sense} that this L{Sense} is being compared to.
    @return: A float score denoting the similarity of the two L{Sense}s,
        in the range 0 to 1. If no path exists between the two synsets a
        score of -1 is returned.
    """
    _check_datafile(datafile)

    # TODO: Once this data has been loaded cache it to save unnecessary
    # recomputation.
    (noun_freqs, verb_freqs) = _load_ic_data(datafile)

    if synset1.pos is NOUN:
        freqs = noun_freqs
    elif synset1.pos is VERB:
        freqs = verb_freqs
    else:
        return -1

    ic1 = synset1.information_content(freqs)
    ic2 = synset2.information_content(freqs)
    (lcs, lcs_ic) = _lcs_by_content(synset1, synset2, freqs)

    # Fix: both ICs can be 0 (sparse data / root synsets), which
    # previously raised ZeroDivisionError; return 0, consistent with
    # jcn_similarity's sparse-data handling.
    ic_sum = ic1 + ic2
    if ic_sum == 0:
        return 0

    return (2.0 * lcs_ic) / ic_sum
235 236 # Utility functions 237
238 -def _lcs_by_depth(synset1, synset2, verbose=False):
239 """ 240 Finds the least common subsumer of two synsets in a Wordnet taxonomy, 241 where the least common subsumer is defined as the ancestor node common 242 to both input synsets whose shortest path to the root node is the longest. 243 244 @type synset1: L{Synset} 245 @param synset1: First input synset. 246 @type synset2: L{Synset} 247 @param synset2: Second input synset. 248 @return: The ancestor synset common to both input synsets which is also the LCS. 249 """ 250 subsumer = None 251 max_min_path_length = -1 252 253 # can't use set operations here 254 hypernyms1 = [synset1] + list(synset1.closure(HYPERNYM)) 255 hypernyms2 = [synset2] + list(synset2.closure(HYPERNYM)) 256 subsumers = [s for s in hypernyms1 if s in hypernyms2] 257 258 if verbose: 259 print "> Subsumers1:", subsumers 260 261 # Eliminate those synsets which are ancestors of other synsets in the 262 # set of subsumers. 263 264 eliminated = set() 265 for s1 in subsumers: 266 for s2 in subsumers: 267 if s2 in s1.closure(HYPERNYM): 268 eliminated.add(s2) 269 if verbose: 270 print "> Eliminated:", eliminated 271 272 subsumers = [s for s in subsumers if s not in eliminated] 273 274 if verbose: 275 print "> Subsumers2:", subsumers 276 277 # Calculate the length of the shortest path to the root for each 278 # subsumer. Select the subsumer with the longest of these. 279 280 for candidate in subsumers: 281 282 paths_to_root = candidate.hypernym_paths() 283 min_path_length = -1 284 285 for path in paths_to_root: 286 if min_path_length < 0 or len(path) < min_path_length: 287 min_path_length = len(path) 288 289 if min_path_length > max_min_path_length: 290 max_min_path_length = min_path_length 291 subsumer = candidate 292 293 if verbose: 294 print "> LCS Subsumer by depth:", subsumer 295 return subsumer
296
297 -def _lcs_by_content(synset1, synset2, freqs, verbose=False):
298 """ 299 Get the least common subsumer of the two input synsets, where the least 300 common subsumer is defined as the ancestor synset common to both input 301 synsets which has the highest information content (IC) value. The IC 302 value is calculated by extracting the probability of an ancestor synset 303 from a precomputed set. 304 305 @type synset1: L{Synset} 306 @param synset1: First input synset. 307 308 @type synset2: L{Synset} 309 @param synset2: Second input synset. 310 311 @return: The ancestor synset common to both input synsets which is also 312 the LCS. 313 """ 314 subsumer = None 315 subsumer_ic = -1 316 317 # subsumers = set(synset1.closure(HYPERNYM)) & set(synset2.closure(HYPERNYM)) -- Broken 318 319 subsumers = set() 320 for s1 in synset1.closure(HYPERNYM): 321 for s2 in synset2.closure(HYPERNYM): 322 if s1 == s2: 323 subsumers.add(s1) 324 subsumers.add(synset1) 325 subsumers.add(synset2) 326 327 # For each candidate, calculate its IC value. Keep track of the candidate 328 # with the highest score. 329 for candidate in subsumers: 330 ic = candidate.information_content(freqs) 331 if (subsumer == None and ic > 0) or ic > subsumer_ic: 332 subsumer = candidate 333 subsumer_ic = ic 334 335 if verbose: 336 print "> LCS Subsumer by content:", subsumer, subsumer_ic 337 338 return (subsumer, subsumer_ic)
339