Package nltk_lite :: Package corpora :: Module stopwords
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.stopwords

 1  # Natural Language Toolkit: Stopwords Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read tokens from the Stopwords Corpus. 
11  """        
12   
13  from nltk_lite.corpora import get_basedir 
14  import os 
15   
# Identifiers of the stopword lists in this corpus, one per language.
items = [
    'danish',
    'dutch',
    'english',
    'french',
    'german',
    'italian',
    'norwegian',
    'portuguese',
    'russian',
    'spanish',
    'swedish',
]

# Human-readable description of each item, keyed by its identifier
# (for example 'danish' -> 'Danish stopwords').
item_name = dict((language, '%s stopwords' % language.capitalize())
                 for language in items)
32   
def raw(files = 'english'):
    """
    Generate the stopwords stored in one or more stopword lists.

    @param files: The name of a single stopword list, or an iterable of
        such names.  Each name is joined onto the "stopwords" directory
        beneath the path returned by get_basedir().
    @return: A generator that yields every line of each requested list,
        in order, with leading and trailing whitespace stripped.
    @raise IOError: If a requested list cannot be opened; raised when the
        generator is iterated, not when raw() is called.
    """
    # Wrap a lone name so the loop below always sees a collection.
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "stopwords", name)
        # The with block closes the file once it is exhausted, and also
        # when the generator is closed before reaching the end (as happens
        # when a caller only takes the first few words).
        with open(path) as stream:
            for line in stream:
                yield line.strip()
40
def demo():
    """
    Print up to 20 English stopwords followed by up to 20 Danish
    stopwords, each group preceded by a heading line.
    """
    from nltk_lite.corpora import stopwords
    from itertools import islice
    from pprint import pprint

    # print(...) with a single argument is parsed as a print statement on
    # Python 2 and as a function call on Python 3, with the same output.
    print("20 English stopwords")
    pprint(list(islice(stopwords.raw(), 0, 20)))

    print("20 Danish stopwords")
    pprint(list(islice(stopwords.raw('danish'), 0, 20)))
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    demo()