Source code for libutilitaspy.data_structures.tries

"""
Implements a trie data-structure (see http://en.wikipedia.org/wiki/Trie)

Typical use:
    
* For tries without data::
    
    t = Trie(['ab', 'ac', 'abc'])
    try:
        value = t.search('ac')
    except KeyError, e:
        print 'not found'

* For tries with data::
    
    t = Trie({'ab':'data1', 'ac':'data2', 'abc':'data3'})
    data = t.search('ac')
    
    t.assign('ac', 'data4')    # changes the data associated to 'ac'
    t.assign('acd', 'data5')  # adds 'acd' to t and associates 'data5' to it.
    
* Trees with data can also be constructed with lists of tuples, or generator expressions::
    
    t = Trie([('ab',data1), ('ac', data2), ('abc', data3)])
    
  or ::
  
    t = Trie((k, data) for k in ['ab','ac',abc'])
    
* There is a mapping type syntax for these methods::
    
    t['ac']
    
  is equivalent to ::
    
    t.search('ac')
        
  and ::
    
    t['ac'] = 'data6'
        
  is equivalent to ::
    
    t.assign('ac', 'data6')
        
"""

from collections import MutableMapping
import pprint

pp = pprint.PrettyPrinter()


[docs]class Trie(MutableMapping): """ A trie is a tree data-structure that is used to implement associative arrays or maps from sequences to some data. Typically the keys are strings, but they can be any sequence of hashable objects. The data is stored in the tree's nodes, and the branches are labelled with the objects in the key sequence. Thus, searching for an item with a key `k = (k_1,k_2, .., k_n)` is done by following the path `k` from the root, and at each node `i` choosing the branch labelled with `k_i'. Nodes in the tree are tuples of the form (`id`, `arrows`, `data`) where `id` is a unique identifier within the trie (a natural number,) `arrows` is a dictionary with keys being the objects labelling the arrows from the node, and values being the target nodes, i.e. a pair (`key`, `value`) in this dictionary is really a pair (`obj`, `target-node`), so there is an arrow from this node to the target node labelled `obj`; and `data` is any additional data associated with this node. """ def __init__(self, sequences={}): """ The constructor creates a new trie with the given dictionary indexed by sequences. The keys are expected to be the sequences of hashable objects and the values are the data to be stored at the end node for the corresponding sequence. :param sequences: A dictionary mapping sequences to values to put in the trie. :type sequences: (`sequence`, 'a) dict, where `sequence` is either a tuple, a list or any other sequence object, and 'a is any type. """ self.sequences = sequences self.counter = 0 self.root = (0, {}, None) if type(sequences) is dict: for seq in sequences: self.assign(seq, sequences[seq]) else: for item in sequences: if type(item) is str: self.assign(item) elif type(item) is tuple or type(item) is list and len(item) >= 1: self.assign(item[0], item[1:]) self.pointer = self.root def __getitem__(self, key): """Same as search.""" return self.search(key) def __setitem__(self, key, value): """Same as assign.""" self.assign(key, value)
[docs] def assign(self, key, value=None): """ Adds the (key,value) association to the trie. :param seq: The key to store :type key: hashable seq, a sequence (tuple, list, etc.) of hashable objects :param value: The associated value (any object). """ node = self.root arrows = node[1] for obj in key: if obj in arrows: node = arrows[obj] id, arrows, _ = node else: self.counter += 1 node = (self.counter, {}, None) arrows[obj] = node id, arrows, _ = node node[2] = value
[docs] def search(self, key): """ Looks for the key in the trie. It returns the data associated with the key, or raises a KeyError exception if the key is not in the trie. :param key: A key to search. :type key: hashable seq, a sequence (tuple, list, etc.) of hashable objects :returns: the value object associated to the key if it is in the trie. :raises: KeyError if the key is not in the trie. """ pointer = self.root[1] try: for obj in key: id, pointer, data = pointer[obj] except KeyError: raise KeyError(key) return data
[docs] def reset(self): """ Resets the trie's pointer to the root node. :post: self.pointer == self.root """ self.pointer = self.root
[docs] def step(self, obj): """ Moves the trie's pointer to the node following the branch labelled with the given object. :param obj: an label in one of the trie's branches :type obj: hashable :returns: the data contained in the node after the step is taken. """ id, arrows, _ = self.pointer new_node = arrows[obj] self.pointer = new_node _, _, new_data = new_node return new_data #~ def incsearch(self, string): #~ node = self.root #~ pointer = node[1] #~ try: #~ for char in string: #~ node = pointer[char] #~ id, pointer, data = node #~ yield (True, id,data) #~ except KeyError: #~ yield (False, id, data) #~ #yield node[0],node[2] #~ def old_search(self, string): #~ pointer = self.root[1] #~ try: #~ for char in string: #~ id, pointer, data = pointer[char] #~ except KeyError: #~ if type(self.strings) is DictType: #~ return (False, None) #~ else: #~ return False #~ if type(self.strings) is DictType: #~ return (True, data) #~ else: #~ return True #~ def old_step(self, char): #~ try: #~ id, arrows, data = self.pointer #~ new_node = arrows[char] #~ self.pointer = new_node #~ new_id, new_arrows, new_data = new_node #~ return new_data #~ except KeyError: #~ raise KeyError()
def __str__(self): return pp.pformat(self.root)