1
2
3
4
5
6
7
8
9 """
10 Read tokens from the Brown Corpus.
11
12 Brown Corpus: A Standard Corpus of Present-Day Edited American
13 English, for use with Digital Computers, by W. N. Francis and
14 H. Kucera (1964), Department of Linguistics, Brown University,
15 Providence, Rhode Island, USA. Revised 1971, Revised and
16 Amplified 1979. Distributed with NLTK with the permission of the
17 copyright holder. Source: http://www.hit.uib.no/icame/brown/bcm.html
18
19 The Brown Corpus is divided into the following files:
20
21 a. press: reportage
22 b. press: editorial
23 c. press: reviews
24 d. religion
25 e. skill and hobbies
26 f. popular lore
27 g. belles-lettres
28 h. miscellaneous: government & house organs
29 j. learned
30 k. fiction: general
31 l. fiction: mystery
32 m. fiction: science
33 n. fiction: adventure
34 p. fiction: romance
35 r. humor
36 """
37
38 from nltk_lite.corpora import get_basedir
39 from nltk_lite import tokenize
40 from nltk_lite.tag import string2tags, string2words
41 import os
42
# Single-letter identifiers of the Brown Corpus sections, in alphabetical
# order.  The letters 'i', 'o' and 'q' are not used as section identifiers.
items = list('abcdefghjklmnpr')

# Human-readable genre description for each section identifier in ``items``.
item_name = {
    'a': 'press: reportage',
    'b': 'press: editorial',
    'c': 'press: reviews',
    'd': 'religion',
    'e': 'skill and hobbies',
    'f': 'popular lore',
    'g': 'belles-lettres',
    'h': 'miscellaneous: government & house organs',
    'j': 'learned',
    'k': 'fiction: general',
    'l': 'fiction: mystery',
    'm': 'fiction: science',
    'n': 'fiction: adventure',
    'p': 'fiction: romance',
    'r': 'humor',
}
62
63 -def _read(files, conversion_function):
72
75
78
87
# Run the demonstration only when this file is executed directly as a
# script; importing the module must not trigger it.
if __name__ == '__main__':
    demo()
90