Package nltk_lite :: Package corpora :: Module indian
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.indian

 1  # Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Indian Language POS-Tagged Corpus 
11  Collected by A Kumaran, Microsoft Research, India 
12  Distributed with permission 
13   
14  Contents: 
15  - Bangla: IIT Kharagpur 
16  - Hindi: Microsoft Research India 
17  - Marathi: IIT Bombay 
18  - Telugu: IIIT Hyderabad 
19  """        
20   
21  from nltk_lite.corpora import get_basedir 
22  from nltk_lite import tokenize 
23  from nltk_lite.tag import string2tags, string2words 
24  import os 
25   
# Names of the corpus files (without the ".pos" extension); used as the
# default `files` argument of the reader functions in this module.
items = ['bangla', 'hindi', 'marathi', 'telugu']
27   
def _read(files, conversion_function):
    """
    Yield the converted sentences of one or more corpus files.

    `files` is either a single corpus name (a string) or an iterable of
    corpus names.  Each name is resolved to the path
    <get_basedir()>/indian/<name>.pos.

    `conversion_function` is called on every retained piece of text and
    its return value is what this generator yields.
    """
    # Wrap a lone name so the loop below handles both call styles.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "indian", name + ".pos")

        # Read the whole file and close the handle immediately rather than
        # leaving it open until the garbage collector reclaims it.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        # tokenize.line presumably splits the text into lines -- confirm
        # against nltk_lite.tokenize.
        for sent in tokenize.line(text):
            # Skip empty pieces and pieces that begin with "<".
            if sent and sent[0] != "<":
                yield conversion_function(sent)
37
def xreadlines(files=items):
    """
    Yield every line of the given corpus files, unmodified.

    `files` is either a single corpus name (a string) or an iterable of
    corpus names; it defaults to all names in `items`.  Each name is
    resolved to the path <get_basedir()>/indian/<name>.pos.
    """
    # Wrap a lone name so the loop below handles both call styles.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "indian", name + ".pos")
        stream = open(path)
        try:
            for line in stream:
                yield line
        finally:
            # Runs when the file is exhausted and also when the caller
            # abandons or closes the generator early, so the handle is
            # never left open.
            stream.close()
44
def raw(files=items):
    """
    Return a generator over the corpus sentences, each converted by
    string2words with "_" as the separator.

    `files` is a single corpus name or an iterable of names; it defaults
    to all names in `items`.
    """
    def to_words(sent):
        # Presumably drops the tag from each "word_tag" token -- confirm
        # against nltk_lite.tag.string2words.
        return string2words(sent, sep="_")

    return _read(files, to_words)
47
def tagged(files=items):
    """
    Return a generator over the corpus sentences, each converted by
    string2tags with "_" as the separator.

    `files` is a single corpus name or an iterable of names; it defaults
    to all names in `items`.
    """
    def to_tagged(sent):
        # Presumably splits each "word_tag" token into a (word, tag)
        # pair -- confirm against nltk_lite.tag.string2tags.
        return string2tags(sent, sep="_")

    return _read(files, to_tagged)
50 51
def sample(language):
    """
    Print a one-line sample of the tagged corpus for `language`.

    The line starts with the capitalized language name and a colon,
    followed by space-separated "word/<repr of tag>" entries for every
    (word, tag) pair in the value returned by
    extract(8, indian.tagged(language)).
    """
    # Imported here, as in the original, so the module-level import list
    # stays untouched.
    from nltk_lite.corpora import indian, extract

    # Build the whole line first and emit it with a single print call.
    # The output matches the original sequence of comma-terminated print
    # statements, but this form is valid on both Python 2 and Python 3.
    pieces = [language.capitalize() + ":"]
    # extract(8, ...) presumably picks one tagged sentence from the
    # stream -- confirm against nltk_lite.corpora.extract.
    for word, tag in extract(8, indian.tagged(language)):
        # repr() replaces the deprecated backtick syntax; same result.
        pieces.append(word + "/" + repr(tag))
    print(" ".join(pieces))
58
def demo():
    """Print a sample line for each of the four corpus languages."""
    for language in ('bangla', 'hindi', 'marathi', 'telugu'):
        sample(language)
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()