Package nltk_lite :: Package contrib :: Package toolbox :: Module lexicon
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.toolbox.lexicon

  1  # Natural Language Toolkit: Toolbox Lexicon 
  2  # 
  3  # Copyright (C) 2001-2006 University of Pennsylvania 
  4  # Author: Stuart Robinson <stuart@zapata.org> 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  This module provides functionality for parsing and manipulating the 
 10  contents of a Toolbox lexicon without reference to its metadata. 
 11  """ 
 12   
 13  import os, re, sys 
 14  from nltk_lite.corpora import get_basedir 
 15  from nltk_lite.corpora.toolbox import StandardFormat 
 16  from utilities import Field, SequentialDictionary 
 17   
 18   
class Lexicon(StandardFormat):
    """
    This class represents a Toolbox lexicon, which consists of an
    optional header and one or more Entry objects, saved in a dictionary
    whose keys are built from the fields named by the I{key_fields}
    parameter of the parse() method.
    """

    def __init__(self, file):
        """
        This method constructs a Lexicon object with an empty header and an
        empty dictionary of entries.  Nothing is read until parse() is
        called.

        @param file: the Toolbox file to parse; it is handed unchanged to
                     open() by parse()
        @type file: presumably a path string (demo() passes one)
        """
        # Fields whose values make up the dictionary key of an entry.
        self._key_fields = ['lx']
        self._header = ''
        # Maps a key string to the list of entries sharing that key.
        self._entries = {}
        self._file = file

    def __str__(self):
        """
        This method defines the string representation of a Lexicon object:
        the header followed by every entry, each terminated by a newline.

        @rtype: string
        """
        chunks = ["%s\n" % self.get_header()]
        for e in self.get_entries():
            chunks.append("%s\n" % e)
        return "".join(chunks)

    def set_header(self, header):
        """
        This method sets the raw text of the header.

        @param header: header (as raw text)
        @type header: string
        """
        self._header = header

    def get_header(self):
        """
        This method obtains the raw text of the header.

        @return: raw header
        @rtype: string
        """
        return self._header

    def get_entries(self):
        """
        This method obtains all of the entries found in a
        parsed Toolbox lexicon, ordered by their (sorted) keys.  Entries
        sharing a key are produced in the order they were added.

        @return: all of the entries in the Lexicon
        @rtype: generator of Entry objects
        """
        # sorted() takes a snapshot of the keys, like keys() + sort() did.
        for k in sorted(self._entries):
            for e in self._entries[k]:
                yield e

    def add_entry(self, entry, unique=False):
        """
        This method adds an Entry object to a Lexicon object. It adds the
        entry to the Lexicon keyed by the values of the fields named in
        the key fields of the Lexicon (see the I{key_fields} argument of
        parse()).

        @param entry: a parsed entry from a Toolbox lexicon
        @type entry: Entry object
        @param unique: raise exception if entry key already exists
        @type unique: boolean
        @raise ValueError: if I{unique} is true and an entry with the same
                           key has already been added
        """
        key_parts = []
        for field_marker in self._key_fields:
            f = entry.get_field(field_marker)
            if f:
                key_parts.append("-" + f.get_values("/"))
            # Key fields that are missing from the entry simply do not
            # contribute to the key.
            # Should this throw an error if a field with no values
            # is used in the list of key fields?
        key = "".join(key_parts)
        if key in self._entries:
            if unique:
                msg = "Non-unique entry! \nEntry: \n%s\nKey Fields: %s\nKey: '%s'\n" % (entry, self._key_fields, key)
                raise ValueError(msg)
        else:
            self._entries[key] = []
        # Now append entry to list of entries for key
        self._entries[key].append(entry)

    def _finish_entry(self, entry, subentry, unique):
        """
        Attach a pending subentry (if any) to its entry and store the
        entry in the lexicon.  Does nothing when there is no entry.
        """
        if entry is None:
            return
        if subentry is not None:
            entry.add_subentry(subentry)
        self.add_entry(entry, unique)

    def parse(self,
              head_field_marker='lx',
              subentry_field_marker=None,
              key_fields=None,
              unique_entry=True,
              unique_subentry=False):
        """
        This method parses a Toolbox file into this Lexicon object. It will
        also parse subentries provided that the field marker identifying
        subentries is passed to it.  The parsed entries are stored in the
        object itself (see get_entries()); nothing is returned.

        @param head_field_marker: field marker that identifies the start of an entry
        @type head_field_marker: string
        @param key_fields: the field(s) to which entries are keyed
        @type key_fields: list of strings
        @param subentry_field_marker: field marker that identifies subentries
        @type subentry_field_marker: string
        @param unique_entry: raise an exception if entries are non-unique
                             according to I{key_fields} parameter
        @type unique_entry: boolean
        @param unique_subentry: currently unused; accepted for
                                backward compatibility
        @type unique_subentry: boolean
        @raise ValueError: if I{unique_entry} is true and two entries share
                           the same key
        """
        if key_fields:
            self._key_fields = key_fields

        entry = None      # entry currently being built
        subentry = None   # subentry of `entry` currently being built
        # Use low-level functionality to get raw fields and walk through them
        self.open(self._file)
        try:
            for fmarker, fvalue in self.raw_fields():
                # What kind of field marker is it?
                if fmarker.startswith("_"):
                    # TODO: Add field to header
                    pass
                elif fmarker == head_field_marker:
                    # A new entry starts: the previous one, together with
                    # its last subentry, is complete.
                    self._finish_entry(entry, subentry, unique_entry)
                    entry = Entry()
                    subentry = None
                elif (subentry_field_marker
                      and fmarker == subentry_field_marker
                      and entry is not None):
                    # A subentry marker seen before any entry has no
                    # parent to attach to, so it is not treated as one.
                    if subentry is not None:
                        entry.add_subentry(subentry)
                    subentry = Entry()
                # Add field to subentry or entry; fields seen before the
                # first entry are ignored.
                if subentry is not None:
                    subentry.add_field(fmarker, fvalue)
                elif entry is not None:
                    entry.add_field(fmarker, fvalue)
            # Deal with last entry (and its last subentry)
            self._finish_entry(entry, subentry, unique_entry)
        finally:
            # Release the file even if parsing fails part-way through.
            self.close()
172
class Entry:
    """
    This class represents an entry (record) from a Toolbox lexicon. Each entry
    consists of a collection of fields, stored as a special type of dictionary
    which keeps track of the sequence in which its keys were entered.
    """

    def __init__(self):
        """
        This method constructs a new, empty Entry object.
        """
        # Maps a field marker to the list of values recorded for it.
        self._fields = SequentialDictionary()
        self._rawText = ""
        self._number = None
        # Stays None until the first subentry is added.
        self._subentries = None

    def __str__(self):
        """
        This method defines the string representation of an entry.

        @rtype: string
        @return: an entry as a string in Standard Format
        """
        lines = []
        for fm, fvs in self._fields.items():
            for fv in fvs:
                lines.append("\n\\%s %s" % (fm, fv))
        return "".join(lines)

    def set_raw_text(self, rawText):
        """
        This method sets the raw text from which the Entry object was parsed.

        @param rawText: raw Toolbox text from which entry was parsed
        @type rawText: string
        """
        self._rawText = rawText

    def get_raw_text(self):
        """
        This method provides access to the raw text from which the
        Entry object was parsed.

        @rtype: string
        """
        return self._rawText

    def get_subentries(self):
        """
        This method obtains all of the subentries for an entry.

        @rtype: list of Entry objects, or None if no subentry was added
        @returns: all of the subentries of an entry
        """
        return self._subentries

    def add_subentry(self, subentry):
        """
        This method adds to an entry a subentry, which is simply another
        Entry object.

        @param subentry: subentry
        @type subentry: Entry object
        """
        if not self._subentries:
            self._subentries = []
        self._subentries.append(subentry)

    def set_number(self, number):
        """
        This method sets the position of the entry in
        the dictionary as a cardinal number.

        @param number: number of entry
        @type number: integer
        """
        self._number = number

    def get_number(self):
        """
        This method obtains the position of the entry in the dictionary
        as a cardinal number.

        @rtype: integer
        """
        return self._number

    def get_fields(self):
        """
        This method obtains the values stored for all of the fields found
        in the Entry object (one list of values per field marker).

        @rtype: list
        """
        return self._fields.values()

    def get_field_markers(self):
        """
        This method obtains all of the field markers found in the Entry
        object.

        @return: the field markers of an entry
        @rtype: list
        """
        return self._fields.keys()

    def get_values_by_marker(self, field_marker, sep=None):
        """
        Shorthand for get_field_values_by_field_marker().

        @param field_marker: marker of desired field
        @type field_marker: string
        @param sep: separator for field values
        @type sep: string
        @rtype: string (if sep); otherwise, list of field values
        """
        return self.get_field_values_by_field_marker(field_marker, sep)

    def get_field_values_by_field_marker(self, field_marker, sep=None):
        """
        This method returns all of the field values for a given field marker.
        If I{sep} is set, it will return a string; otherwise, it will
        return the list of field values.  Returns None if the entry has no
        field with the given marker.

        @param field_marker: marker of desired field
        @type field_marker: string
        @param sep: separator for field values
        @type sep: string
        @rtype: string (if sep); otherwise, list of field values
        """
        try:
            values = self._fields[field_marker]
        except KeyError:
            return None
        if sep is None:
            return values
        return sep.join(values)

    def get_field_as_string(self,
                            field_marker,
                            join_string=""):
        """
        This method returns the values of a particular field, given a field
        marker, joined into one string.
        Returns a blank string if field is not found.

        @param field_marker: marker of desired field
        @type field_marker: string
        @param join_string: string used to join field values (default to blank string)
        @type join_string: string
        @rtype: string
        """
        try:
            return join_string.join(self._fields[field_marker])
        except KeyError:
            return ""

    def get_field(self, fieldMarker):
        """
        This method returns a particular field given a field marker, or
        None if the entry has no such field.

        @param fieldMarker: marker of desired field
        @type fieldMarker: string
        @rtype: Field object
        """
        try:
            return Field(fieldMarker, self._fields[fieldMarker])
        except KeyError:
            return None

    def set_field(self, fieldMarker, field):
        """
        This method sets a field, given a marker and its associated data.
        Any values previously stored under the marker are replaced by a
        one-element list holding I{field}.

        @param fieldMarker: field marker to set
        @type fieldMarker: string
        @param field: data to store as the single value of the field
        @type field: Field (as originally documented; stored as given)
        """
        # The original body referred to an undefined name (fieldData)
        # instead of the `field` parameter.
        self._fields[fieldMarker] = [field]

    def set_field_values(self, fieldMarker, fieldValues):
        """
        This method sets all of the values associated with a field.

        @param fieldMarker: field marker to set
        @type fieldMarker: string
        @param fieldValues: list of field values
        @type fieldValues: list
        """
        self._fields[fieldMarker] = fieldValues

    def add_field(self, marker, value):
        """
        This method adds a field to an entry if it does not already exist
        and adds a new value to the field of an entry if it does.

        @param marker: field marker
        @type marker: string
        @param value: field value
        @type value: string
        """
        if marker in self._fields:
            self._fields[marker].append(value)
        else:
            self._fields[marker] = [value]

    def remove_field(self, fieldMarker):
        """
        This method removes from an entry every field for a given
        field marker. It will not raise an error if the specified field
        does not exist.

        @param fieldMarker: field marker to be deleted
        @type fieldMarker: string
        """
        if fieldMarker in self._fields:
            del self._fields[fieldMarker]
386
def demo():
    """
    Parse the sample lexicon file toolbox/rotokas.dic found under the
    corpus base directory and print the lx, ps and sn fields of every
    entry, one entry per line.
    """
    path = os.path.join(get_basedir(), "toolbox", "rotokas.dic")
    lexicon = Lexicon(path)
    # Entries are keyed on three fields; duplicate keys are tolerated.
    lexicon.parse(key_fields=['lx', 'ps', 'sn'], unique_entry=False)
    for entry in lexicon.get_entries():
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print("<%s><%s><%s>" % (entry.get_field_as_string("lx", ""),
                                entry.get_field_as_string("ps", ""),
                                entry.get_field_as_string("sn", "")))
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo()