Home | Trees | Indices | Help |
|
---|
|
1 # Natural Language Toolkit: CONLL Corpus Reader 2 # 3 # Copyright (C) 2001-2007 University of Pennsylvania 4 # Author: Steven Bird <sb@ldc.upenn.edu> 5 # Edward Loper <edloper@gradient.cis.upenn.edu> 6 # URL: <http://nltk.sf.net> 7 # For license information, see LICENSE.TXT 8 9 """ 10 Read chunk structures from the CONLL-2000 Corpus 11 """ 12 13 from nltk_lite.corpora import get_basedir 14 from nltk_lite import tokenize, chunk 15 from nltk_lite.parse import tree 16 import os 17 18 items = ['train', 'test'] 19 20 item_name = { 21 'train': 'training set', 22 'test': 'test set' 23 } 24 2729 if type(files) is str: files = (files,) 30 for file in files: 31 path = os.path.join(get_basedir(), "conll2000", file + ".txt") 32 s = open(path).read() 33 for sent in tokenize.blankline(s): 34 yield [word for (word, tag, chunk) in _list_sent(sent)]3537 if type(files) is str: files = (files,) 38 for file in files: 39 path = os.path.join(get_basedir(), "conll2000", file + ".txt") 40 s = open(path).read() 41 for sent in tokenize.blankline(s): 42 yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]4345 if type(files) is str: files = (files,) 46 for file in files: 47 path = os.path.join(get_basedir(), "conll2000", file + ".txt") 48 s = open(path).read() 49 for sent in tokenize.blankline(s): 50 yield chunk.conllstr2tree(sent, chunk_types)5153 from nltk_lite.corpora import conll2000 54 from itertools import islice 55 56 print "CONLL Chunked data\n" 57 58 print "Raw text:" 59 for sent in islice(conll2000.raw(), 0, 5): 60 print sent 61 print 62 63 print "Tagged text:" 64 for sent in islice(conll2000.tagged(), 0, 5): 65 print sent 66 print 67 68 print "Chunked text:" 69 for tree in islice(conll2000.chunked(chunk_types=('NP','PP')), 0, 5): 70 print tree.pp() 71 print72 73 74 if __name__ == '__main__': 75 demo() 76
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0beta1 on Wed May 16 22:47:52 2007 | http://epydoc.sourceforge.net |