Package nltk_lite :: Package contrib :: Package dependency :: Module tag2tab
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.dependency.tag2tab

 1  """ 
 2  Utilities for converting a chunked treebank into a format that can be  
 3  used as input to Nivre's MaltParser. 
 4  """ 
 5  from nltk_lite.corpora import get_basedir 
 6  from nltk_lite import tokenize 
 7  from itertools import islice 
 8  import os 
 9   
def tag2tab(s, sep='/'):
    """
    Convert a single C{word<sep>tag} token into one line of Malt-TAB input.

    The token is split at the LAST occurrence of C{sep}, so the word part
    may itself contain the separator (e.g. C{1/2/CD}).  Parentheses in the
    tag are rewritten as C{-LRB-} and C{-RRB-}.

    @param s: The tagged token, e.g. C{'dog/NN'}
    @type s: L{string}
    @param sep: The separator between word and tag; may be longer than
        one character
    @type sep: L{string}
    @return: C{'word<TAB>tag<NEWLINE>'} when C{sep} occurs in C{s};
        otherwise the tuple C{(s, None)}.  The tuple form is kept for
        backward compatibility, so callers must be prepared for it.
    """
    loc = s.rfind(sep)
    if loc < 0:
        # No separator in the token: hand it back untouched, with no tag.
        return (s, None)

    word = s[:loc]
    # Skip the whole separator, not just one character, so that
    # multi-character separators do not leak into the tag.
    tag = s[loc + len(sep):]
    tag = tag.replace('(', '-LRB-').replace(')', '-RRB-')
    return "%s\t%s\n" % (word, tag)
19
def tabtagged(files='chunked', basedir=None):
    """
    Convert chunked treebank files into Malt-TAB input, sentence by sentence.

    Each file is read from C{<basedir>/treebank/<file>}, split into
    sentences on blank lines and into tokens on whitespace.  Chunk
    brackets (C{[} and C{]}) are dropped; every other token is passed
    through C{tag2tab}.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} directory;
        when empty or C{None}, the result of C{get_basedir()} is used
    @type basedir: L{string}
    @return: iterator over sentences; each sentence is a list holding the
        C{tag2tab} result for every token, followed by a C{'\\n'} entry
        that acts as the blank-line sentence separator
    """
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Honour the caller-supplied basedir instead of silently falling
        # back to the default corpus location.
        path = os.path.join(basedir, "treebank", filename)

        # try/finally rather than 'with' so the handle is always closed
        # without requiring a newer Python than the rest of the module.
        handle = open(path)
        try:
            text = handle.read()
        finally:
            handle.close()

        for sent in tokenize.blankline(text):
            lines = []
            for token in tokenize.whitespace(sent):
                # Skip the chunk brackets; only word/tag tokens are kept.
                if token != '[' and token != ']':
                    lines.append(tag2tab(token))
            # add a blank line as sentence separator
            lines.append('\n')
            yield lines
42
43 -def demo():
44 from nltk_lite.corpora import treebank 45 #f = open('ptb_input.tab', 'w') 46 #s = '' 47 for sent in islice(tabtagged(), 3): 48 for line in sent: 49 print line,
50 #s += ''.join(sent) 51 #print >>f, s 52 #f.close() 53 54 55 56 if __name__ == '__main__': 57 demo() 58