1
2
3
4
5
6
7
8 from nltk_lite.corpora import get_basedir
9 from nltk_lite import tokenize
10 from nltk_lite.tag import tag2tuple
11 from nltk_lite.parse import tree
12 import os, re
13
14 """
15 Sinica Treebank Corpus Sample
16
17 http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
18
19 10,000 parsed sentences, drawn from the Academia Sinica Balanced
20 Corpus of Modern Chinese. Parse tree notation is based on
21 Information-based Case Grammar. Tagset documentation is available
22 at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
23
24 Language and Knowledge Processing Group, Institute of Information
25 Science, Academia Sinica
26
27 It is distributed with the Natural Language Toolkit under the terms of
28 the Creative Commons Attribution-NonCommercial-ShareAlike License
29 [http://creativecommons.org/licenses/by-nc-sa/2.5/].
30
31 References:
32
33 Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
34 The Construction of Sinica Treebank. Computational Linguistics and
35 Chinese Language Processing, 4, pp 87-104.
36
Huang Chu-Ren, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
39 Annotation Guidelines, and On-line Interface. Proceedings of 2nd
40 Chinese Language Processing Workshop, Association for Computational
41 Linguistics.
42
43 Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp. 560-565.
45 """
46
# Module-level patterns, compiled once at import time.

# A leading '#' followed by one or more non-whitespace characters and a
# single whitespace character, anchored at the start of the string.
# NOTE(review): presumably the sentence identifier that prefixes each
# treebank line -- confirm against the corpus files.
IDENTIFIER = re.compile(r'^#\S+\s')

# A '#' that immediately follows a ')' (lookbehind, so the ')' itself is
# not part of the match), together with everything up to the end of the
# string.
# NOTE(review): presumably trailing material after the final closing
# parenthesis of a parse -- confirm against the corpus files.
APPENDIX = re.compile(r'(?<=\))#.*$')

# Two consecutive colon-introduced fields, each a non-empty run of
# characters other than ':', '(', ')' and '|'; the fields are captured as
# groups 1 and 2.
# NOTE(review): the name suggests group 1 is a tag and group 2 a word --
# verify against the code that uses this pattern.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
50
def raw(files='raw'):
    """
    Read Sinica Treebank files and yield each line as a list of tokens.

    Every line of every file is split on whitespace, the first token is
    discarded, and the remaining tokens are yielded as a list.  A line
    with fewer than two tokens therefore yields an empty list.
    (The discarded first token is presumably the sentence identifier --
    confirm against the corpus format.)

    Files are looked up in the "sinica_treebank" directory beneath the
    corpus base directory returned by get_basedir().

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Accept a lone file name as shorthand for a one-element tuple.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "sinica_treebank", name)
        # The with-block closes the file as soon as it is exhausted, or
        # when the generator is closed early, instead of leaking the
        # handle until garbage collection.
        with open(path) as stream:
            for line in stream:
                yield line.split()[1:]
65
82
99
100
123
# Script entry point: call demo() only when this module is executed
# directly, not when it is imported.
if __name__ == '__main__':
    demo()
126