Package nltk_lite :: Package corpora :: Module conll2002
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.corpora.conll2002

  1  # Natural Language Toolkit: CONLL 2002 Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #              Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  #              Ewan Klein <ewan@inf.ed.ac.uk> 
  7  # URL: <http://nltk.sf.net> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  """ 
 11  Read Named Entity tagged data as chunk structures from the CONLL-2002 Corpus 
 12  """        
 13   
 14  from nltk_lite.corpora import get_basedir 
 15  from nltk_lite import tokenize, chunk 
 16  from nltk_lite.parse import tree 
 17  import os 
 18   
 19  esp = ['esp.train', 'esp.testa', 'esp.testb']    
 20  ned = ['ned.train', 'ned.testa', 'ned.testb'] 
 21  items = esp + ned 
 22   
 23  item_name = { 
 24      'ned.train': 'Dutch training set', 
 25      'ned.testa': 'Dutch test set a', 
 26      'ned.testb': 'Dutch test set b', 
 27      'esp.train': 'Spanish training set', 
 28      'esp.testa': 'Spanish test set a', 
 29      'ned.testb': 'Spanish test set b', 
 30      } 
 31   
def _list_sent(sent):
    """
    Split one blank-line-delimited sentence block into a list of
    per-line token sequences (each line holds word, POS tag, NE tag).
    """
    rows = []
    for line in tokenize.line(sent):
        rows.append(tokenize.whitespace(line))
    return rows
def raw(files = ['ned.train', 'esp.train']):
    """
    Yield each sentence of the named CONLL-2002 corpus files as a list
    of words (POS and NE columns are discarded).

    files may be a single filename or a sequence of filenames, resolved
    relative to the conll2002 directory under the corpus base directory.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        # Close the file explicitly instead of relying on refcounting.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        # Remove the initial "-DOCSTART- -DOCSTART- O" header line.
        # Note: prefix made consistent with pos_tagged (was '-DOCSTART').
        if s.startswith('-DOCSTART-'):
            # len('-DOCSTART- -DOCSTART- O') == 23; the trailing newline stays.
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield [word for (word, tag, ner) in _list_sent(sent)]
def pos_tagged(files = ['ned.train', 'esp.train']):
    """
    Yield each sentence of the named CONLL-2002 corpus files as a list
    of (word, pos_tag) pairs (the NE column is discarded).

    files may be a single filename or a sequence of filenames, resolved
    relative to the conll2002 directory under the corpus base directory.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        # Close the file explicitly instead of relying on refcounting.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        # Remove the initial "-DOCSTART- -DOCSTART- O" header line.
        if s.startswith('-DOCSTART-'):
            # len('-DOCSTART- -DOCSTART- O') == 23; the trailing newline stays.
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, ner) in _list_sent(sent)]
def ne_chunked(files = ['ned.train', 'esp.train'], chunk_types=('LOC','ORG','PER')):
    """
    Yield each sentence of the named CONLL-2002 corpus files as a
    named-entity chunk tree, keeping only the given chunk_types.

    MISC has been omitted from the default chunk_types.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        # Close the file explicitly instead of relying on refcounting.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        # Remove the initial "-DOCSTART- -DOCSTART- O" header line.
        # Note: prefix made consistent with pos_tagged (was '-DOCSTART').
        if s.startswith('-DOCSTART-'):
            # len('-DOCSTART- -DOCSTART- O') == 23; the trailing newline stays.
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield chunk.conllstr2tree(sent, chunk_types)
71 -def demo():
72 from nltk_lite.corpora import conll2002 73 from itertools import islice 74 75 print "CONLL2002 NE data\n" 76 77 print "Raw text -- Dutch:" 78 for sent in islice(conll2002.raw(files = ['ned.train']), 0, 5): 79 print sent 80 print 81 82 print "Raw text --Spanish:" 83 for sent in islice(conll2002.raw(files = ['esp.train']), 0, 5): 84 print sent 85 print 86 87 print "POS Tagged text -- Dutch:" 88 for sent in islice(conll2002.pos_tagged(files = ['ned.train']), 0, 5): 89 print sent 90 print 91 92 print "POS Tagged text --Spanish:" 93 for sent in islice(conll2002.pos_tagged(files = ['esp.train']), 0, 5): 94 print sent 95 print 96 97 print "Named Entity chunked text -- Dutch:" 98 for tree in islice(conll2002.ne_chunked(files = ['ned.train']), 0, 5): 99 print tree.pp() 100 print 101 102 print "Named Entity chunked text --Spanish:" 103 for tree in islice(conll2002.ne_chunked(files = ['esp.train']), 0, 5): 104 print tree.pp() 105 print
# Run the demo when this module is executed as a script.
if __name__ == '__main__':
    demo()