Package nltk_lite :: Package corpora :: Module sinica_treebank
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.corpora.sinica_treebank

  1  # Natural Language Toolkit: Sinica Treebank Reader 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  from nltk_lite.corpora import get_basedir 
  9  from nltk_lite import tokenize 
 10  from nltk_lite.tag import tag2tuple 
 11  from nltk_lite.parse import tree 
 12  import os, re 
 13   
 14  """ 
 15  Sinica Treebank Corpus Sample 
 16   
 17  http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm 
 18   
 19  10,000 parsed sentences, drawn from the Academia Sinica Balanced 
 20  Corpus of Modern Chinese.  Parse tree notation is based on 
 21  Information-based Case Grammar.  Tagset documentation is available 
 22  at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html 
 23   
 24  Language and Knowledge Processing Group, Institute of Information 
 25  Science, Academia Sinica 
 26   
 27  It is distributed with the Natural Language Toolkit under the terms of 
 28  the Creative Commons Attribution-NonCommercial-ShareAlike License 
 29  [http://creativecommons.org/licenses/by-nc-sa/2.5/]. 
 30   
 31  References: 
 32   
 33  Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) 
 34  The Construction of Sinica Treebank. Computational Linguistics and 
 35  Chinese Language Processing, 4, pp 87-104. 
 36   
 37  Huang Chu-Ren, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming 
 38  Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria, 
 39  Annotation Guidelines, and On-line Interface. Proceedings of 2nd 
 40  Chinese Language Processing Workshop, Association for Computational 
 41  Linguistics. 
 42   
 43  Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar 
 44  Extraction, Proceedings of IJCNLP-04, pp560-565. 
 45  """ 
 46   
# Leading sentence identifier: a '#' at the start of the line, a run of
# non-whitespace characters, and the single whitespace character after it.
IDENTIFIER = re.compile(r'^#\S+\s')
# Trailing appendix: a '#' that immediately follows a ')' together with
# everything after it up to the end of the line.
APPENDIX = re.compile(r'(?<=\))#.*$')
# Two consecutive colon-introduced fields, neither of which may contain
# ':', '(', ')' or '|'; the two fields are captured as groups 1 and 2.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
 50   
def raw(files = 'raw'):
    """
    Iterate over the lines of the given treebank files as lists of
    whitespace-separated strings.  The first field of every line is
    dropped (presumably the sentence identifier -- confirm against the
    corpus data).

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate.
    # isinstance() also accepts subclasses of str, unlike an exact type test.
    if isinstance(files, str):
        files = (files,)

    for filename in files:
        path = os.path.join(get_basedir(), "sinica_treebank", filename)
        # The with-block closes the file once it is exhausted, and also
        # when the generator is closed or discarded before reaching the end.
        with open(path) as stream:
            for line in stream:
                yield line.split()[1:]
65
def tagged(files = 'parsed'):
    """
    Iterate over the lines of the given treebank files as lists of
    C{(token, tag)} pairs.  The leading identifier is stripped from each
    line, then every match of C{TAGWORD} contributes one pair: the
    second captured field becomes the token and the first the tag.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(tuple(string, string))}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate.
    # isinstance() also accepts subclasses of str, unlike an exact type test.
    if isinstance(files, str):
        files = (files,)

    for filename in files:
        path = os.path.join(get_basedir(), "sinica_treebank", filename)
        # The with-block closes the file once it is exhausted, and also
        # when the generator is closed or discarded before reaching the end.
        with open(path) as stream:
            for sent in stream:
                sent = IDENTIFIER.sub('', sent)
                # TAGWORD captures (tag, token); callers get (token, tag).
                yield [(token, tag) for (tag, token) in TAGWORD.findall(sent)]
82
def parsed(files = 'parsed'):
    """
    Iterate over the lines of the given treebank files as parse trees.
    The leading identifier and the trailing C{#...} appendix are removed
    from each line before it is handed to C{tree.sinica_parse}.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate.
    # isinstance() also accepts subclasses of str, unlike an exact type test.
    if isinstance(files, str):
        files = (files,)

    for filename in files:
        path = os.path.join(get_basedir(), "sinica_treebank", filename)
        # The with-block closes the file once it is exhausted, and also
        # when the generator is closed or discarded before reaching the end.
        with open(path) as stream:
            for sent in stream:
                sent = IDENTIFIER.sub('', sent)
                sent = APPENDIX.sub('', sent)
                yield tree.sinica_parse(sent)
99 100
def demo():
    """
    Print the first ten raw, tagged and parsed sentences of the corpus,
    then pass the ten parsed sentences to C{draw_trees}.
    """
    from nltk_lite.corpora import sinica_treebank
    from nltk_lite.draw.tree import draw_trees
    from itertools import islice

    # Single-argument print(...) calls produce the same output whether
    # print is a statement (Python 2) or a function (Python 3);
    # print("") emits the blank separator line on both.
    print("Raw:")
    for sent in islice(sinica_treebank.raw(), 10):
        print(sent)
    print("")

    print("Tagged:")
    for sent in islice(sinica_treebank.tagged(), 10):
        print(sent)
    print("")

    print("Parsed:")
    trees = list(islice(sinica_treebank.parsed(), 10))
    # Named parse_tree so the loop does not reuse the name of the
    # module-level 'tree' import.
    for parse_tree in trees:
        print(parse_tree)
    print("")

    draw_trees(*trees)
# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo()