Package nltk_lite :: Package corpora :: Module names
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.names

 1  # Natural Language Toolkit: Names Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Names Corpus, Version 1.3 (1994-03-29) 
11  Copyright (C) 1991 Mark Kantrowitz 
12  Additions by Bill Ross 
13   
14  This corpus contains 5001 female names and 2943 male names, sorted 
15  alphabetically, one per line. 
16  (Used in NLTK with permission.  See the README file for details.) 
17  """        
18   
19  from nltk_lite.corpora import get_basedir 
20  import os 
21   
# Identifiers of the items that make up this corpus.
items = ['female', 'male']

# Human-readable label for each corpus item, keyed by item identifier.
item_name = dict(female='Female names', male='Male names')
28   
def raw(files=['female', 'male']):
    """
    Generate the entries stored in one or more of the names corpus files.

    @param files: either a single item name given as a string (for
        example 'female'), or an iterable of item names.  Each name
        selects the file "names/<name>.txt" below the directory
        returned by get_basedir().
    @return: a generator that yields one string per line of each
        selected file, in the order given, with leading and trailing
        whitespace stripped.
    """
    # The default list is only ever iterated, never mutated, so sharing
    # it between calls is harmless.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "names", name + ".txt")

        # Read the file and close it before yielding anything: no file
        # handle is left open if the caller abandons the generator
        # part-way through, and no 'yield' sits inside the try/finally.
        stream = open(path)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        for line in lines:
            yield line.strip()
36
def demo():
    """
    Print a heading followed by up to 20 shuffled female names, then
    the same for the male names.
    """
    from nltk_lite.corpora import names
    from random import shuffle
    from pprint import pprint

    sections = (
        ("20 female names", 'female'),
        ("20 male names", 'male'),
    )
    for heading, item in sections:
        # A single parenthesised argument prints the same text whether
        # 'print' is a statement or a function.
        print(heading)
        sample = list(names.raw(item))
        shuffle(sample)
        pprint(sample[:20])
# Run the demonstration when this module is executed as a script
# rather than imported.
if __name__ == '__main__':
    demo()