
Source Code for Module nltk_lite.corpora.senseval

# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Steven Bird <sb@csse.unimelb.edu.au> (modifications)
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

"""
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""

from nltk_lite.corpora import get_basedir
from nltk_lite import tokenize
import os, re, xml.sax

items = ["hard", "interest", "line", "serve"]

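# Each instance yielded by SensevalParser.parse() (and by raw() below) is a
# tuple (senses, position, context): the sense identifiers assigned to the
# instance, the position of the ambiguous (head) word, and the context as a
# list of (word, pos) pairs.  For example, an instance of "hard" might look
# roughly like (('HARD1',), 18, [('``', '``'), ('it', 'PRP'), ...]); the
# values shown here are illustrative only.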
class SensevalParser(xml.sax.ContentHandler):

    def __init__(self, buffer_size=1024):
        xml.sax.ContentHandler.__init__(self)
        self._lemma = ''
        self._buffer_size = buffer_size
        self.reset()

    def parse(self, text):
        # accept either a string or an iterator over strings
        if hasattr(text, '__iter__') and hasattr(text, 'next'):
            text = ''.join(text)
        parser = xml.sax.make_parser()
        parser.setContentHandler(self)
        current = 0
        # feed the text to the SAX parser in fixed-size chunks, yielding
        # each instance as soon as it has been completed
        while current < len(text):
            buffer = text[current : current + self._buffer_size]
            parser.feed(buffer)
            for instance in self._instances:
                yield instance
            self.reset(True, False)
            current += self._buffer_size
        parser.close()

    def characters(self, ch):
        self._data += _to_ascii(ch)

    def startElement(self, tag, attr):
        if tag == 'wf':
            self._pos = _to_ascii(attr.getValueByQName('pos'))
        elif tag == 'answer':
            instance_id = _to_ascii(attr.getValueByQName('instance'))
            self._senses.append(_to_ascii(attr.getValueByQName('senseid')))
            self._iloc = instance_id

        elif tag == 'context':
            self._data = ''
        elif tag == 'lexelt':
            self._lemma = _to_ascii(attr.getValueByQName('item'))
        elif tag == 'head':
            # record the position of the ambiguous (head) word
            self._head = self._wnum - 1

    def endElement(self, tag):
        if tag == 'wf':
            text = self._data.strip()
            pos = self._pos
            self._tokens.append((text, pos))
            self._wnum += 1
            self._data = ''
        elif tag == 'context':
            # a complete instance: (senses, head position, tagged tokens)
            self._instances.append((tuple(self._senses), self._head, self._tokens))
            self.reset(False)

    def instances(self):
        return self._instances

    def reset(self, instances=True, state=True):
        if instances:
            self._instances = []
        if state:
            self._senses = []
            self._head = None
            self._data = ''
            self._wnum = 1
            self._iloc = None
            self._tokens = []
            self._pos = None
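
# A minimal sketch of using the parser directly, assuming ``xml_text`` holds
# the contents of one of the Senseval 2 files as a single string:
#
#     parser = SensevalParser()
#     for senses, position, context in parser.parse(xml_text):
#         print senses, position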

def _to_ascii(text):
    return text.encode('Latin-1')

def raw(files = items):
    """
    @param files: One or more Senseval files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """

    if type(files) is str: files = (files,)
    parser = SensevalParser()
    for file in files:
        path = os.path.join(get_basedir(), "senseval", file+".pos")
        f = open(path).read()
        for entry in parser.parse(f):
            yield entry

def demo():
    from nltk_lite.corpora import senseval
    from itertools import islice
    import string

    # Print one example of each sense

    seen = set()
    for (senses, position, context) in senseval.raw('line'):
        if senses not in seen:
            seen.add(senses)
            print "senses:", senses
            print "position:", position
            print "context:", string.join('%s/%s' % ttok for ttok in context)
            print

if __name__ == '__main__':
    demo()
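
A minimal usage sketch (not part of the module): assuming the Senseval 2 data
files are installed under get_basedir()/senseval, the reader can be used much
as in demo() above, for example:

    from nltk_lite.corpora import senseval
    from itertools import islice

    # inspect the first instance of "hard"
    for senses, position, context in islice(senseval.raw('hard'), 1):
        print "senses:", senses
        print "position:", position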