Package nltk_lite :: Module utilities
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.utilities

  1  # Natural Language Toolkit: Utility functions 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8   
  9  ########################################################################## 
 10  # PRETTY PRINTING 
 11  ########################################################################## 
 12   
13 -def pr(data, start=0, end=None):
14 """ 15 Pretty print a sequence of data items 16 17 @param data: the data stream to print 18 @type data: C{sequence} or C{iterator} 19 @param start: the start position 20 @type start: C{int} 21 @param end: the end position 22 @type end: C{int} 23 """ 24 from pprint import pprint 25 from itertools import islice 26 pprint(list(islice(data, start, end)))
27 47
48 -class SortedDict(dict):
49 """ 50 A very rudamentary sorted dictionary, whose main purpose is to 51 allow dictionaries to be displayed in a consistent order in 52 regression tests. keys(), items(), values(), iter*(), and 53 __repr__ all sort their return values before returning them. 54 (note that the sort order for values() does *not* correspond to 55 the sort order for keys(). I.e., zip(d.keys(), d.values()) is not 56 necessarily equal to d.items(). 57 """
58 - def keys(self): return sorted(dict.keys(self))
59 - def items(self): return sorted(dict.items(self))
60 - def values(self): return sorted(dict.values(self))
61 - def iterkeys(self): return iter(sorted(dict.keys(self)))
62 - def iteritems(self): return iter(sorted(dict.items(self)))
63 - def itervalues(self): return iter(sorted(dict.values(self)))
64 - def __iter__(self): return iter(sorted(dict.keys(self)))
65 - def repr(self):
66 items = ['%s=%s' % t for t in sorted(self.items())] 67 return '{%s}' % ', '.join(items)
68 69 # OrderedDict: Written Doug Winter 70 # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/438823 71
72 -class OrderedDict(dict):
73 """ 74 This implementation of a dictionary keeps track of the order 75 in which keys were inserted. 76 """ 77
78 - def __init__(self, d={}):
79 self._keys = d.keys() 80 dict.__init__(self, d)
81
82 - def __delitem__(self, key):
83 dict.__delitem__(self, key) 84 self._keys.remove(key)
85
86 - def __setitem__(self, key, item):
87 dict.__setitem__(self, key, item) 88 # a peculiar sharp edge from copy.deepcopy 89 # we'll have our set item called without __init__ 90 if not hasattr(self, '_keys'): 91 self._keys = [key,] 92 if key not in self._keys: 93 self._keys.append(key)
94
95 - def clear(self):
96 dict.clear(self) 97 self._keys = []
98
99 - def items(self):
100 for i in self._keys: 101 yield i, self[i]
102
103 - def keys(self):
104 return self._keys
105
106 - def popitem(self):
107 if len(self._keys) == 0: 108 raise KeyError('dictionary is empty') 109 else: 110 key = self._keys[-1] 111 val = self[key] 112 del self[key] 113 return key, val
114
115 - def setdefault(self, key, failobj = None):
116 dict.setdefault(self, key, failobj) 117 if key not in self._keys: 118 self._keys.append(key)
119
120 - def update(self, d):
121 for key in d.keys(): 122 if not self.has_key(key): 123 self._keys.append(key) 124 dict.update(self, d)
125
126 - def values(self):
127 for i in self._keys: 128 yield self[i]
129
130 - def move(self, key, index):
131 132 """ Move the specified to key to *before* the specified index. """ 133 134 try: 135 cur = self._keys.index(key) 136 except ValueError: 137 raise KeyError(key) 138 self._keys.insert(index, key) 139 # this may have shifted the position of cur, if it is after index 140 if cur >= index: cur = cur + 1 141 del self._keys[cur]
142
143 - def index(self, key):
144 if not self.has_key(key): 145 raise KeyError(key) 146 return self._keys.index(key)
147 148 149 150 151 ########################################################################## 152 # EDIT DISTANCE (LEVENSHTEIN) 153 ########################################################################## 154
155 -def _edit_dist_init(len1, len2):
156 lev = [] 157 for i in range(len1): 158 lev.append([0] * len2) # initialize 2-D array to zero 159 for i in range(len1): 160 lev[i][0] = i # column 0: 0,1,2,3,4,... 161 for j in range(len2): 162 lev[0][j] = j # row 0: 0,1,2,3,4,... 163 return lev
164
165 -def _edit_dist_step(lev, i, j, c1, c2):
166 a = lev[i-1][j ] + 1 # skipping s1[i] 167 b = lev[i-1][j-1] + (c1 != c2) # matching s1[i] with s2[j] 168 c = lev[i ][j-1] + 1 # skipping s2[j] 169 lev[i][j] = min(a,b,c) # pick the cheapest
170
171 -def edit_dist(s1, s2):
172 """ 173 Calculate the Levenshtein edit-distance between two strings. 174 The edit distance is the number of characters that need to be 175 substituted, inserted, or deleted, to transform s1 into s2. For 176 example, transforming "rain" to "shine" requires three steps, 177 consisting of two substitutions and one insertion: 178 "rain" -> "sain" -> "shin" -> "shine". These operations could have 179 been done in other orders, but at least three steps are needed. 180 181 @param s1, s2: The strings to be analysed 182 @type s1, s2: C{string} 183 @rtype C{int} 184 """ 185 # set up a 2-D array 186 len1 = len(s1); len2 = len(s2) 187 lev = _edit_dist_init(len1+1, len2+1) 188 189 # iterate over the array 190 for i in range(len1): 191 for j in range (len2): 192 _edit_dist_step(lev, i+1, j+1, s1[i], s2[j]) 193 return lev[len1][len2]
194 195 196 ########################################################################## 197 # MINIMAL SETS 198 ########################################################################## 199
200 -class MinimalSet(object):
201 """ 202 Find contexts where more than one possible target value can 203 appear. E.g. if targets are word-initial letters, and contexts 204 are the remainders of words, then we would like to find cases like 205 "fat" vs "cat", and "training" vs "draining". If targets are 206 parts-of-speech and contexts are words, then we would like to find 207 cases like wind (noun) 'air in rapid motion', vs wind (verb) 208 'coil, wrap'. 209 """
210 - def __init__(self, parameters=None):
211 """ 212 Create a new minimal set. 213 214 @param parameters: The (context, target, display) tuples for the item 215 @type parameters: C{list} of C{tuple} of C{string} 216 """ 217 self._targets = set() # the contrastive information 218 self._contexts = set() # what we are controlling for 219 self._seen = {} # to record what we have seen 220 self._displays = {} # what we will display 221 222 if parameters: 223 for context, target, display in parameters: 224 self.add(context, target, display)
225
226 - def add(self, context, target, display):
227 """ 228 Add a new item to the minimal set, having the specified 229 context, target, and display form. 230 231 @param context: The context in which the item of interest appears 232 @type context: C{string} 233 @param target: The item of interest 234 @type target: C{string} 235 @param display: The information to be reported for each item 236 @type display: C{string} 237 """ 238 # Store the set of targets that occurred in this context 239 if context not in self._seen: 240 self._seen[context] = set() 241 self._seen[context].add(target) 242 243 # Keep track of which contexts and targets we have seen 244 self._contexts.add(context) 245 self._targets.add(target) 246 247 # For a given context and target, store the display form 248 self._displays[(context, target)] = display
249
250 - def contexts(self, minimum=2):
251 """ 252 Determine which contexts occurred with enough distinct targets. 253 254 @param minimum: the minimum number of distinct target forms 255 @type minimum: C(int) 256 @rtype C(list) 257 """ 258 return [c for c in self._contexts if len(self._seen[c]) >= minimum]
259
260 - def display(self, context, target, default=""):
261 if self._displays.has_key((context, target)): 262 return self._displays[(context, target)] 263 else: 264 return default
265
266 - def display_all(self, context):
267 result = [] 268 for target in self._targets: 269 x = self.display(context, target) 270 if x: result.append(x) 271 return result
272
273 - def targets(self):
274 return self._targets
275 276 277 ###################################################################### 278 ## Regexp display (thanks to David Mertz) 279 ###################################################################### 280 281 import re
282 -def re_show(regexp, string):
283 """ 284 Search C{string} for substrings matching C{regexp} and wrap 285 the matches with braces. This is convenient for learning about 286 regular expressions. 287 288 @param regexp: The regular expression. 289 @param string: The string being matched. 290 @rtype: C{string} 291 @return: A string with braces surrounding the matched substrings. 292 """ 293 print re.compile(regexp, re.M).sub("{\g<0>}", string.rstrip())
294 295 296 ########################################################################## 297 # READ FROM FILE OR STRING 298 ########################################################################## 299 300 # recipe from David Mertz
301 -def filestring(f):
302 if hasattr(f, 'read'): 303 return f.read() 304 elif isinstance(f, basestring): 305 return open(f).read() 306 else: 307 raise ValueError, "Must be called with a filename or file-like object"
308 309 ########################################################################## 310 # COUNTER, FOR UNIQUE NAMING 311 ########################################################################## 312
313 -class Counter:
314 """ 315 A counter that auto-increments each time its value is read. 316 """
317 - def __init__(self, initial_value=0):
318 self._value = initial_value
319 - def get(self):
320 self._value += 1 321 return self._value
322 323 324 ########################################################################## 325 # TRIES 326 ########################################################################## 327 328 # Trie structure, by James Tauber and Leonardo Maffi (V. 1.2, July 18 2006) 329 # Extended by Steven Bird 330
331 -class Trie:
332 """A Trie is like a dictionary in that it maps keys to 333 values. However, because of the way keys are stored, it allows 334 look up based on the longest prefix that matches. Keys must be 335 strings. 336 """ 337
338 - def __init__(self, trie=None):
339 if trie is None: 340 self._root = [None, {}, 0] 341 else: 342 self._root = trie
343
344 - def clear(self):
345 self._root = [None, {}, 0]
346
347 - def isleaf(self, key):
348 """Return True if the key is present and it's a leaf of the 349 Trie, False otherwise.""" 350 351 curr_node = self._root 352 for char in key: 353 curr_node_1 = curr_node[1] 354 if char in curr_node_1: 355 curr_node = curr_node_1[char] 356 else: 357 return False 358 return curr_node[0] is not None
359
360 - def find_prefix(self, key):
361 """Find as much of the key as one can, by using the longest 362 prefix that has a value. Return (value, remainder) where 363 remainder is the rest of the given string.""" 364 365 curr_node = self._root 366 remainder = key 367 for char in key: 368 if char in curr_node[1]: 369 curr_node = curr_node[1][char] 370 else: 371 return curr_node[0], remainder 372 remainder = remainder[1:] 373 return curr_node[0], remainder
374
375 - def subtrie(self, key):
376 curr_node = self._root 377 for char in key: 378 curr_node = curr_node[1][char] 379 return Trie(trie=curr_node)
380
381 - def __len__(self):
382 return self._root[2]
383
384 - def __eq__(self, other):
385 return self._root == other._root
386
387 - def __setitem__(self, key, value):
388 curr_node = self._root 389 for char in key: 390 curr_node[2] += 1 391 curr_node = curr_node[1].setdefault(char, [None, {}, 0]) 392 curr_node[0] = value 393 curr_node[2] += 1
394
395 - def __getitem__(self, key):
396 """Return the value for the given key if it is present, raises 397 a KeyError if key not found, and return None if it is present 398 a key2 that starts with key.""" 399 400 curr_node = self._root 401 for char in key: 402 curr_node = curr_node[1][char] 403 return curr_node[0]
404
405 - def __contains__(self, key):
406 """Return True if the key is present or if it is present a 407 key2 string that starts with key.""" 408 409 curr_node = self._root 410 for char in key: 411 curr_node_1 = curr_node[1] 412 if char in curr_node_1: 413 curr_node = curr_node_1[char] 414 else: 415 return False 416 return True
417
418 - def __str__(self):
419 return str(self._root)
420
421 - def __repr__(self):
422 return "Trie(%r)" % self._root
423 424 ########################################################################## 425 # Breadth-First Search 426 ########################################################################## 427 428 # Adapted from a Python cookbook entry; original version by David Eppstein
429 -def breadth_first(tree, children=iter, depth=-1):
430 """Traverse the nodes of a tree in breadth-first order. 431 The first argument should be the tree root; children 432 should be a function taking as argument a tree node and 433 returning an iterator of the node's children. 434 """ 435 yield tree 436 last = tree 437 if depth != 0: 438 for node in breadth_first(tree, children, depth-1): 439 for child in children(node): 440 yield child 441 last = child 442 if last == node: 443 return
444 445 ########################################################################## 446 # Guess Character Encoding 447 ########################################################################## 448 449 # adapted from io.py in the docutils extension module (http://docutils.sourceforge.net) 450 # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html 451 452 import locale
453 -def guess_encoding(data):
454 """ 455 Given a byte string, attempt to decode it. 456 Tries the standard 'UTF8' and 'latin-1' encodings, 457 Plus several gathered from locale information. 458 459 The calling program *must* first call 460 locale.setlocale(locale.LC_ALL, '') 461 462 If successful it returns 463 (decoded_unicode, successful_encoding) 464 If unsuccessful it raises a ``UnicodeError`` 465 """ 466 successful_encoding = None 467 # we make 'utf-8' the first encoding 468 encodings = ['utf-8'] 469 # 470 # next we add anything we can learn from the locale 471 try: 472 encodings.append(locale.nl_langinfo(locale.CODESET)) 473 except AttributeError: 474 pass 475 try: 476 encodings.append(locale.getlocale()[1]) 477 except (AttributeError, IndexError): 478 pass 479 try: 480 encodings.append(locale.getdefaultlocale()[1]) 481 except (AttributeError, IndexError): 482 pass 483 # 484 # we try 'latin-1' last 485 encodings.append('latin-1') 486 for enc in encodings: 487 # some of the locale calls 488 # may have returned None 489 if not enc: 490 continue 491 try: 492 decoded = unicode(data, enc) 493 successful_encoding = enc 494 495 except (UnicodeError, LookupError): 496 pass 497 else: 498 break 499 if not successful_encoding: 500 raise UnicodeError( 501 'Unable to decode input data. Tried the following encodings: %s.' 502 % ', '.join([repr(enc) for enc in encodings if enc])) 503 else: 504 return (decoded, successful_encoding)
505