Package nltk_lite :: Package contrib :: Module tree
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.tree

  1  # Natural Language Toolkit: Probability and Statistics 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (additions) 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9   
 10  # 
 11  # tree -- An xml.etree implementation with linguistic tree operations 
 12  #         and parent pointers. 
 13   
 14   
 15  """ 
 16  This module defines three classes, extending the xml.etree.ElementTree 
 17  module: 
 18   
 19    - Tree: Linguistic trees, with support for: 
 20      convenient nested indexing t[1,1,1] instead of t[1][1][1]; 
 21      leaves() to return the trees leaf nodes; 
 22      ... 
 23   
 24    - ParentedTree: Each element keeps track of a single parent 
 25      pointer, returned by the element's `parent()` method.  Elements 
 26      may have at most one parent; i.e., subtrees may not be shared. 
 27      Any attempt to do use a single element as a child for multiple 
 28      parents will generate a ValueError. 
 29   
 30    - MultiParentedTree: Each element keeps track of a list of 
 31      parent pointers, returned by the element's `parents()` method. 
 32      Elements may have zero or more parients; i.e., subtrees may be 
 33      shraed.  If a single Element is used as multiple children of the 
 34      same parent, then that parent will appear multiple times in the 
 35      parents list. 
 36   
 37  Note: Mixing of etree implementations is not supported.  I.e., you 
 38  should never construct a tree that combines elements from ParentedTree 
 39  with elements from MultiParentedTree, or elements from either of these 
 40  implementations with elements from any other implementation.  Doing so 
 41  may result in incorrect parent pointers and ValueError exceptions. 
 42  """ 
 43   
 44  from nltk_lite.etree import ElementTree as ET 
 45   
 46  __all__ = ['Tree', 
 47             'ParentedTree', 
 48             'MultiParentedTree'] 
 49   
 50  ###################################################################### 
 51  # Trees 
 52  ###################################################################### 
 53   
54 -class _AbstractElement(ET._ElementInterface):
55 """ 56 An abstract base class for Elements. 57 - Adds original NLTK functionality. 58 """ 59
60 - def __repr__(self):
61 return "<%s %s at %x>" % (self.__class__.__name__, 62 self.tag, id(self))
63 64 #//////////////////////////////////////////////////////////// 65 # Indexing (with support for tree positions) 66 #//////////////////////////////////////////////////////////// 67
68 - def __getitem__(self, index):
69 if isinstance(index, int): 70 return ET._ElementInterface.__getitem__(self, index) 71 else: 72 if len(index) == 0: 73 return self 74 elif len(index) == 1: 75 return self[int(index[0])] 76 else: 77 return self[int(index[0])][index[1:]]
78
79 - def __setitem__(self, index, value):
80 if isinstance(index, int): 81 return ET._ElementInterface.__setitem__(self, index, value) 82 else: 83 if len(index) == 0: 84 raise IndexError('The tree position () may not be ' 85 'assigned to.') 86 elif len(index) == 1: 87 self[index[0]] = value 88 else: 89 self[index[0]][index[1:]] = value
90
91 - def __delitem__(self, index):
92 if isinstance(index, int): 93 return ET._ElementInterface.__delitem__(self, index) 94 else: 95 if len(index) == 0: 96 raise IndexError('The tree position () may not be deleted.') 97 elif len(index) == 1: 98 del self[index[0]] 99 else: 100 del self[index[0]][index[1:]]
101 102 #//////////////////////////////////////////////////////////// 103 # Basic tree operations 104 #//////////////////////////////////////////////////////////// 105
106 - def leaves(self):
107 """ 108 @return: a list containing this tree's leaves. 109 @rtype: C{list} 110 """ 111 leaves = [] 112 for child in self: 113 if isinstance(child, _AbstractElement): 114 leaves.extend(child.leaves()) 115 if child.tail: 116 leaves.append(child.tail) 117 else: 118 leaves.append(child) 119 if self.text: 120 leaves.append(self.text) 121 return leaves
122
123 - def flatten(self):
124 raise NotImplementedError
125
126 - def height(self):
127 """ 128 @return: The height of this tree. The height of a tree 129 containing no children is 1; the height of a tree 130 containing only leaves is 2; and the height of any other 131 tree is one plus the maximum of its children's 132 heights. 133 @rtype: C{int} 134 """ 135 max_child_height = 0 136 for child in self: 137 try: 138 max_child_height = max(max_child_height, child.height()) 139 except AttributeError: 140 max_child_height = max(max_child_height, 1) 141 return 1 + max_child_height
142
143 - def treepositions(self, order='preorder'):
144 raise NotImplementedError
145
146 - def productions(self):
147 raise NotImplementedError
148
149 - def subtrees(self, filter=None):
150 """ 151 Generate all the subtrees of this tree, optionally restricted 152 to trees matching the filter function. 153 @type: filter: C{function} 154 @param: filter: the function to filter all local trees 155 """ 156 if not filter or filter(self): 157 yield self 158 for child in self: 159 try: 160 for subtree in child.subtrees(filter): 161 yield subtree 162 except AttributeError: 163 pass
164
165 - def copy(self, deep=False):
166 raise NotImplementedError
167
168 - def freeze(self, leaf_freezer=None):
169 raise NotImplementedError
170 171 ###################################################################### 172 # Parented Trees 173 ###################################################################### 174
175 -class _AbstractParentedElement(_AbstractElement):
176 """ 177 An abstract base class for (multi)parented element tree Elements. 178 179 - Whenever a new child is added, L{_setparent} is called, 180 which should update that child's parent pointer to point 181 at self. 182 183 - Whenever a child is removed, L{_delparent} is called, which 184 should remove the child's parent pointer to self. 185 """
186 - def __repr__(self):
187 return "<%s %s at %x>" % (self.__class__.__name__, 188 self.tag, id(self))
189 190 #//////////////////////////////////////////////////////////// 191 # Parent management 192 #//////////////////////////////////////////////////////////// 193
194 - def _setparent(self, child):
195 """ 196 Update C{child}'s parent pointer to point to self. 197 """ 198 raise AssertionError, 'Abstract base class'
199
200 - def _delparent(self, child):
201 """ 202 Remove self from C{child}'s parent pointer. 203 """ 204 raise AssertionError, 'Abstract base class'
205 206 #//////////////////////////////////////////////////////////// 207 # Methods that add/remove children 208 #//////////////////////////////////////////////////////////// 209 # Every method that adds or removes a child must make 210 # appropriate calls to _setparent() and _delparent(). 211
212 - def __delitem__(self, index):
213 self._delparent(self[index]) 214 _AbstractElement.__delitem__(self, index)
215
216 - def __delslice__(self, start, stop):
217 for index in range(start, stop): self._delparent(self[index]) 218 _AbstractElement.__delslice__(self, start, stop)
219
220 - def __setitem__(self, index, element):
221 self._delparent(self[index]) 222 self._setparent(element) 223 _AbstractElement.__setitem__(self, index, element)
224
225 - def __setslice__(self, start, stop, elements):
226 for index in range(start, stop): self._delparent(self[index]) 227 for val in elements: self._setparent(val) 228 _AbstractElement.__setslice__(self, start, stop, elements)
229
230 - def append(self, element):
231 self._setparent(element) 232 _AbstractElement.append(self, element)
233
234 - def extend(self, elements):
235 for val in elements: self._setparent(val) 236 _AbstractElement.extend(self, elements)
237
238 - def insert(self, index, element):
239 self._setparent(element) 240 _AbstractElement.insert(self, index, element)
241
242 - def pop(self):
243 self._delparent(self[-1]) 244 return _AbstractElement.pop(self, )
245
246 - def remove(self, element):
247 index = self.index(element) 248 self._delparent(self[index]) 249 _AbstractElement.remove(self, element)
250 251
252 -class _ParentedElement(_AbstractParentedElement):
253 """ 254 A specialized version of etree.ElementTree.Element that keeps 255 track of a single parent pointer per element. 256 257 Each _ParentedElement may have at most one parent. In particular, 258 subtrees may not be shared. Any attempt to reuse a single 259 _ParentedElement as a child of more than one parent (or as 260 multiple children of the same parent) will cause a ValueError 261 exception to be raised. 262 263 _ParentedElements should never be used in the same tree as other 264 Element implementation classes. Mixing Element implementations 265 may result in incorrect parent pointers and in C{ValueError} 266 exceptions. 267 """
268 - def __init__(self, tag, attrib):
269 ET._ElementInterface.__init__(self, tag, attrib) 270 self._parent = None
271
272 - def parent(self):
273 return self._parent
274
275 - def _setparent(self, element):
276 assert is_parented_element(element) 277 if element._parent is not None: 278 raise ValueError, '%r already has a parent' % element 279 element._parent = self
280
281 - def _delparent(self, element):
282 assert is_parented_element(element) 283 assert element._parent == self 284 element._parent = None
285
286 - def makeelement(self, tag, attrib):
288
289 -def is_parented_element(element):
290 return ET.iselement(element) and hasattr(element, '_parent')
291
292 -class _MultiParentedElement(_AbstractParentedElement):
293 """ 294 A specialized version of etree.ElementTree.Element that keeps 295 track of a list of parent pointers for each element. 296 297 Each _ParentedElement may have zero or more parents. In 298 particular, subtrees may be shared. If a single 299 _MultiParentedElement is used as multiple children of the same 300 parent, then that parent will appear multiple times in the parents 301 list. 302 303 _MultiParentedElements should never be used in the same tree as 304 other Element implementation classes. Mixing Element 305 implementations may result in incorrect parent pointers and in 306 C{ValueError} exceptions. 307 """
308 - def __init__(self, tag, attrib):
309 ET._ElementInterface.__init__(self, tag, attrib) 310 self._parents = []
311
312 - def parents(self):
313 return tuple(self._parents)
314
315 - def _setparent(self, element):
316 assert is_multiparented_element(element) 317 element._parent.append(self)
318
319 - def _delparent(self, element):
320 assert is_multiparented_element(element) 321 assert self in element._parents 322 element._parents.remove(self)
323
324 - def makeelement(self, tag, attrib):
326
327 -def is_multiparented_element(element):
328 return ET.iselement(element) and hasattr(element, '_parents')
329 330 331 332
333 -class ElementTreeImplementation(object):
334 """ 335 Instances of this class can be used as drop-in replacements for 336 the xml.etree.ElementTree module. 337 """
338 - def __init__(self, ElementClass):
339 self._Element = ElementClass
340
341 - def Element(self, tag, attrib={}, **extra):
342 attrib = attrib.copy() 343 attrib.update(extra) 344 return self._Element(tag, attrib)
345 346 ################################################################## 347 # Subclasses w/ new constructors 348 #----------------------------------------------------------------- 349 # These classes needed to have their constructer overridden, to 350 # change the default values of tree-building functions to point 351 # to the versions in *this file, not in ElementTree. 352
353 - def iterparse(self, source, events=None):
354 # [XXX] We can't do this without poking around in other 355 # people's internals! 356 iterator = ET.iterparse(source, events) 357 iterator._parser._target = TreeBuilder() 358 return iterator
359 360 ################################################################## 361 # No dependencies on Element class 362 #----------------------------------------------------------------- 363 # These functions & classes do not depend, directly or indirectly, 364 # on the class used to implement elements; so we can just copy 365 # them. 366 367 SubElement = staticmethod(ET.SubElement) 368 QName = ET.QName 369 iselement = staticmethod(ET.iselement) 370 dump = staticmethod(ET.dump) 371 tostring= staticmethod(ET.tostring) 372 373 ################################################################## 374 # Modified Constructor Defaults 375 #----------------------------------------------------------------- 376 # These classes have default constructor parameter that depend on 377 # the class used to implement elements; so we wrap them in 378 # functions that provide new defaults. 379
380 - def TreeBuilder(self, element_factory=None):
381 if element_factory is None: 382 element_factory = self._Element 383 return ET.TreeBuilder(element_factory)
384
385 - def XMLTreeBuilder(self, html=0, target=None):
386 if target is None: 387 target = self.TreeBuilder() 388 return ET.XMLTreeBuilder(html, target)
389 390 XMLParser = XMLTreeBuilder 391 392 ################################################################## 393 # Modified Method Defaults 394 #----------------------------------------------------------------- 395 # The ElementTree class has a method parameter whose default value 396 # depends on the class used to implement elements; so we replace 397 # it with a new subclass (_ElementTree) that stores a private 398 # pointer back to the ElementTreeImplementation instance; and uses it 399 # to construct an appropriate default for the method parameter. 400
401 - def ElementTree(self, element=None, file=None):
402 return self._ElementTree(self, element, file)
403 - class _ElementTree(ET.ElementTree):
404 - def __init__(self, etbase, element, file):
405 self.__default_parser_class = etbase.XMLTreeBuilder 406 ET.ElementTree.__init__(self, element, file)
407 - def parse(self, source, parser=None):
408 if not parser: 409 parser = self.__default_parser_class() 410 ET.ElementTree.parse(self, source, parser)
411 412 ################################################################## 413 # Indirect dependencies on Element class 414 #----------------------------------------------------------------- 415 # These functions depend indirectly on the class used to implement 416 # elements; so we need to redefine them. Each method in this 417 # section is copied verbatim from etree/ElementTree.py, with the 418 # following exceptions: a 'self' parameter is added. And global 419 # variables that depend on the Element class are replaced with 420 # local versions. E.g., "XMLTreeBuilder" is replaced with 421 # "self.XMLTreeBuilder". 422
423 - def Comment(self, text=None):
424 element = self.Element(Comment) 425 element.text = text 426 return element
427
428 - def ProcessingInstruction(self, target, text=None):
429 element = self.Element(self.ProcessingInstruction) 430 element.text = target 431 if text: 432 element.text = element.text + " " + text 433 return element
434 435 PI = ProcessingInstruction 436
437 - def parse(self, source, parser=None):
438 tree = self.ElementTree() 439 tree.parse(source, parser) 440 return tree
441
442 - def XML(self, text):
443 parser = self.XMLTreeBuilder() 444 parser.feed(text) 445 return parser.close()
446
447 - def XMLID(self, text):
448 parser = self.XMLTreeBuilder() 449 parser.feed(text) 450 tree = parser.close() 451 ids = {} 452 for elem in tree.getiterator(): 453 id = elem.get("id") 454 if id: 455 ids[id] = elem 456 return tree, ids
457 458 fromstring = XML
459 460 461 Tree = ElementTreeImplementation(_AbstractElement) 462 ParentedTree = ElementTreeImplementation(_ParentedElement) 463 MultiParentedTree = ElementTreeImplementation(_MultiParentedElement) 464 465
466 -def demo():
467 import nltk_lite.contrib.tree 468 reload(nltk_lite.contrib.tree) 469 from nltk_lite.contrib.tree import ParentedTree as PT 470 471 TREE = ("<s><np><jj>Snow</jj><nn>flakes</nn></np>" 472 "<vp tense='past'><v>fell</v>" 473 "<pp><p>on</p><np><dt>the</dt><nn>table</nn></np></pp>" 474 "</vp></s>") 475 476 sent = PT.fromstring(TREE) 477 table = sent[1,1,1,1] 478 print PT.tostring(table) 479 print PT.tostring(table.parent()) 480 481 print "leaves:", sent.leaves() 482 print "height:", sent.height() 483 print "subtrees:", list(sent.subtrees()) 484 485 def spine(elt): 486 if elt is None: return [] 487 return [elt.tag] + spine(elt.parent())
488 print spine(table) 489 490 def spine_with_attribs(elt): 491 if elt is None: return [] 492 label = elt.tag 493 if elt.attrib: 494 attrib = ['%s=%s' % i for i in elt.attrib.items()] 495 label += '<%s>' % ', '.join(attrib) 496 return [label] + spine_with_attribs(elt.parent()) 497 print spine_with_attribs(table) 498 499 print PT.tostring(sent) 500 del sent[1][1][1] 501 # del sent[1,1,1] BROKEN 502 print PT.tostring(sent) 503 504 505 if __name__ == '__main__': 506 demo() 507