Package nltk_lite :: Package corpora :: Module brown
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.brown

 1  # Natural Language Toolkit: Brown Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read tokens from the Brown Corpus. 
11   
12  Brown Corpus: A Standard Corpus of Present-Day Edited American 
13  English, for use with Digital Computers, by W. N. Francis and 
14  H. Kucera (1964), Department of Linguistics, Brown University, 
15  Providence, Rhode Island, USA.  Revised 1971, Revised and 
16  Amplified 1979.  Distributed with NLTK with the permission of the 
17  copyright holder.  Source: http://www.hit.uib.no/icame/brown/bcm.html 
18   
19  The Brown Corpus is divided into the following files: 
20   
21  a. press: reportage 
22  b. press: editorial 
23  c. press: reviews 
24  d. religion 
25  e. skill and hobbies 
26  f. popular lore 
27  g. belles-lettres 
28  h. miscellaneous: government & house organs 
29  j. learned 
 30  k. fiction: general 
 31  l. fiction: mystery 
 32  m. fiction: science 
 33  n. fiction: adventure 
34  p. fiction: romance 
35  r. humor 
36  """        
37   
38  from nltk_lite.corpora import get_basedir 
39  from nltk_lite import tokenize 
40  from nltk_lite.tag import string2tags, string2words 
41  import os 
42   
43  items = list('abcdefghjklmnpr') 
44   
45  item_name = { 
46      'a': 'press: reportage', 
47      'b': 'press: editorial', 
48      'c': 'press: reviews', 
49      'd': 'religion', 
50      'e': 'skill and hobbies', 
51      'f': 'popular lore', 
52      'g': 'belles-lettres', 
53      'h': 'miscellaneous: government & house organs', 
54      'j': 'learned', 
55      'k': 'fiction: general', 
56      'l': 'fiction: mystery', 
57      'm': 'fiction: science', 
58      'n': 'fiction: adventure', 
59      'p': 'fiction: romance', 
60      'r': 'humor' 
61      } 
62   
63 -def _read(files, conversion_function):
64 if type(files) is str: files = (files,) 65 66 for file in files: 67 path = os.path.join(get_basedir(), "brown", file) 68 f = open(path).read() 69 for sent in tokenize.line(f): 70 if sent: 71 yield conversion_function(sent)
72
73 -def raw(files = items):
74 return _read(files, string2words)
75
76 -def tagged(files = items):
77 return _read(files, string2tags)
78
79 -def demo():
80 from nltk_lite.corpora import brown 81 from itertools import islice 82 from pprint import pprint 83 84 pprint(list(islice(brown.raw('a'), 0, 5))) 85 86 pprint(list(islice(brown.tagged('a'), 0, 5)))
87 88 if __name__ == '__main__': 89 demo() 90