Package nltk_lite :: Package contrib :: Package toolbox :: Module text
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.toolbox.text

  1  # Natural Language Toolkit: Shoebox Text 
  2  # 
  3  # Author: Stuart Robinson <stuart@zapata.org> 
  4  # URL: <http://nltk.sf.net> 
  5  # For license information, see LICENSE.TXT 
  6   
  7  """ 
  8  This module provides tools for parsing and manipulating the contents 
  9  of a Shoebox text without reference to its metadata. 
 10  """ 
 11   
 12  import re 
 13  from utilities import Field, SequentialDictionary 
 14  from nltk_lite.corpora.toolbox import StandardFormat 
 15   
 16   
 17  # -------------------------------------------------------- 
 18  # CLASS: Word 
 19  # DESC:  Object that represents a word. 
 20  # -------------------------------------------------------- 
 21   
class Word:
    """
    A single word of an interlinear text.

    A word holds a surface form, a gloss, a part of speech and a list
    of morphemes.  It also keeps the raw (unparsed) gloss, morpheme and
    part-of-speech strings; these start out as None and are only
    changed through the corresponding setters.
    """

    def __init__(self, form=None, gloss=None, morphemes=None,
                 partOfSpeech=None):
        """Create a Word.

        @param form: the surface form for a word
        @type form: string
        @param gloss: the gloss for a word
        @type gloss: string
        @param morphemes: list of Morpheme objects for a word
        @type morphemes: list
        @param partOfSpeech: the part of speech for a word
        @type partOfSpeech: string
        """
        # Raw tiers are not constructor arguments; they begin unset.
        self._rawGloss = None
        self._rawMorphemes = None
        self._rawPartOfSpeech = None
        # Analysed attributes supplied by the caller.
        self._form = form
        self._gloss = gloss
        self._morphemes = morphemes
        self._partOfSpeech = partOfSpeech

    # -- surface form ------------------------------------------------

    def get_form(self):
        """Return the surface form of the word."""
        return self._form

    def set_form(self, form):
        """Replace the surface form of the word."""
        self._form = form

    # -- gloss -------------------------------------------------------

    def get_gloss(self):
        """Return the stored gloss of the word."""
        return self._gloss

    def set_gloss(self, gloss):
        """Replace the gloss of the word."""
        self._gloss = gloss

    # -- morphemes ---------------------------------------------------

    def get_morphemes(self):
        """Return the stored list of Morpheme objects of the word."""
        return self._morphemes

    def set_morphemes(self, morphemes):
        """Replace the list of Morpheme objects of the word."""
        self._morphemes = morphemes

    # -- part of speech ----------------------------------------------

    def get_part_of_speech(self):
        """Return the stored part of speech of the word."""
        return self._partOfSpeech

    def set_part_of_speech(self, partOfSpeech):
        """Replace the part of speech of the word."""
        self._partOfSpeech = partOfSpeech

    # -- raw (unparsed) tiers ----------------------------------------

    def get_raw_gloss(self):
        """Return the raw gloss string (None until set)."""
        return self._rawGloss

    def set_raw_gloss(self, rawGloss):
        """Store the raw gloss string."""
        self._rawGloss = rawGloss

    def get_raw_morphemes(self):
        """Return the raw morpheme string (None until set)."""
        return self._rawMorphemes

    def set_raw_morphemes(self, rawMorphemes):
        """Store the raw morpheme string."""
        self._rawMorphemes = rawMorphemes

    def get_raw_part_of_speech(self):
        """Return the raw part-of-speech string (None until set)."""
        return self._rawPartOfSpeech

    def set_raw_part_of_speech(self, rawPartOfSpeech):
        """Store the raw part-of-speech string."""
        self._rawPartOfSpeech = rawPartOfSpeech
103 104 105 # -------------------------------------------------------- 106 # CLASS: Morpheme 107 # DESC: Object that represents a morpheme. 108 # -------------------------------------------------------- 109
class Morpheme:
    """
    A single morpheme of an interlinear text.

    A morpheme stores three attributes: a form, a gloss and a part of
    speech.  Each defaults to None and can be read or replaced through
    the matching get_*/set_* method.
    """

    def __init__(self, form=None, gloss=None, partOfSpeech=None):
        """Create a Morpheme.

        @param form: the form of the morpheme
        @param gloss: the gloss of the morpheme
        @param partOfSpeech: the part of speech of the morpheme
        """
        self._partOfSpeech = partOfSpeech
        self._gloss = gloss
        self._form = form

    def get_form(self):
        """Return the form of the morpheme."""
        return self._form

    def set_form(self, form):
        """Replace the form of the morpheme."""
        self._form = form

    def get_gloss(self):
        """Return the gloss of the morpheme."""
        return self._gloss

    def set_gloss(self, gloss):
        """Replace the gloss of the morpheme."""
        self._gloss = gloss

    def get_part_of_speech(self):
        """Return the part of speech of the morpheme."""
        return self._partOfSpeech

    def set_part_of_speech(self, partOfSpeech):
        """Replace the part of speech of the morpheme."""
        self._partOfSpeech = partOfSpeech
150 151 152 # -------------------------------------------------------- 153 # CLASS: Line 154 # DESC: Object that represents a line from an interlinear 155 # text. 156 # -------------------------------------------------------- 157
class Line:
    r"""This class defines a line of interlinear glossing, such as::

        \ref 9
        \t Vigei              avapaviei                           atarisia.
        \m vigei              ava -pa       -vi        -ei        atari -sia
        \g 1.PL.INC           go  -PROG     -1.PL.INCL -PRES      fish  -PURP
        \p PRO.PERS           V.I -SUFF.V.3 -SUFF.VI.4 -SUFF.VI.5 V.I   -SUFF.V.4
        \fp Yumi bai go kisim pis.
        \fe We're going fishing.

    The tiers of a line are saved as a sequential dictionary that maps
    each field marker to its value(s).  Identified by the field marker
    \ref by default.

    The word and morpheme methods read the tiers with the fixed markers
    "t" (text), "m" (morphemes), "g" (glosses) and "p" (parts of speech)
    and treat their values as column-aligned strings.
    """

    def __init__(self, label=None):
        """Constructor that initializes Line object.

        @param label: identifier for the line
        """
        self._fields = SequentialDictionary()
        self._label = label
        # Initialised here so that get_raw_text() is safe to call before
        # set_raw_text(); previously that raised AttributeError.
        self._rawtext = None

    def add_field(self, field):
        """Add field to line.

        The field's values are stored under the field's marker; a field
        added later under the same marker is assigned to the same key.

        @param field: object providing get_marker() and get_values()
        """
        self._fields[field.get_marker()] = field.get_values()

    def get_field_markers(self):
        """Obtain the field markers stored for the line."""
        return self._fields.keys()

    def get_field_as_string(self, field_marker, join_string=""):
        """
        This method returns a particular field given a field marker.
        Returns a blank string if field is not found.

        @param field_marker: marker of desired field
        @type field_marker: string
        @param join_string: string used to join field values (default to blank string)
        @type join_string: string
        @rtype: string
        """
        try:
            values = self._fields[field_marker]
        except KeyError:
            return ""
        return join_string.join(values)

    def get_field_values_by_field_marker(self, field_marker, sep=None):
        """Obtain the value(s) stored for a field marker.

        @param field_marker: marker of desired field
        @type field_marker: string
        @param sep: if given, the stored values are joined with this
            string; if None the stored value is returned unchanged
        @type sep: string
        @return: the stored value(s), or None if the marker is not
            present on this line
        """
        try:
            values = self._fields[field_marker]
        except KeyError:
            return None
        if sep is None:
            return values
        return sep.join(values)

    def get_field_values(self):
        """Obtain list of field values for the line."""
        return self._fields.values()

    def get_label(self):
        """Obtain identifier for line."""
        return self._label

    def get_raw_text(self):
        """Obtain original line of text (None if it was never set)."""
        return self._rawtext

    def set_label(self, label):
        """Set identifier for line."""
        self._label = label

    def set_raw_text(self, rawtext):
        """Set original line of text."""
        self._rawtext = rawtext

    def _get_tier(self, field_marker):
        """Return the value of an aligned tier, or "" if it is missing.

        Substituting the empty string lets the slicing helpers run on
        lines that lack a tier instead of failing on None.
        """
        value = self.get_field_values_by_field_marker(field_marker)
        if value is None:
            return ""
        return value

    def get_morphemes(self):
        """Obtain a list of morpheme objects for the line.

        The column boundaries are taken from the "m" tier and applied
        to both the "m" and the "g" tier; each resulting Morpheme gets
        a form and a gloss (no part of speech is set here).

        @rtype: list of Morpheme
        """
        form_tier = self._get_tier("m")
        gloss_tier = self._get_tier("g")
        boundaries = get_indices(form_tier)
        form_slices = get_slices_by_indices(form_tier, boundaries)
        gloss_slices = get_slices_by_indices(gloss_tier, boundaries)

        morphemes = []
        for form, gloss in zip(form_slices, gloss_slices):
            m = Morpheme()
            m.set_form(form.strip(" ").strip("-"))
            m.set_gloss(gloss.strip(" ").strip("-"))
            morphemes.append(m)
        return morphemes

    def _build_morphemes(self, forms, glosses, categories):
        """Cut one word's aligned tier slices into Morpheme objects.

        Boundaries come from the morpheme-form slice and are applied to
        all three slices so that the columns stay aligned.
        """
        boundaries = get_indices(forms)
        columns = zip(get_slices_by_indices(forms, boundaries),
                      get_slices_by_indices(glosses, boundaries),
                      get_slices_by_indices(categories, boundaries))
        morphemes = []
        for form, gloss, category in columns:
            m = Morpheme()
            m.set_form(form.strip(" "))
            m.set_gloss(gloss.strip(" "))
            m.set_part_of_speech(category.strip(" "))
            morphemes.append(m)
        return morphemes

    def get_words(self, flagParseMorphemes=True):
        """Obtain a list of word objects for the line.

        Word boundaries are taken from the "t" tier and applied to the
        "t", "m", "g" and "p" tiers.  Each Word receives its form, its
        raw morpheme string, its raw gloss string and its part of
        speech.  A tier that is absent from the line is treated as an
        empty string.

        @param flagParseMorphemes: if true, each word is also given a
            list of Morpheme objects built from its tier slices
        @type flagParseMorphemes: boolean
        @rtype: list of Word
        """
        # Obtain raw field values
        text_tier = self._get_tier("t")
        morph_tier = self._get_tier("m")
        gloss_tier = self._get_tier("g")
        pos_tier = self._get_tier("p")

        # Slice every tier at the word boundaries of the text tier
        boundaries = get_indices(text_tier)
        columns = zip(get_slices_by_indices(text_tier, boundaries),
                      get_slices_by_indices(morph_tier, boundaries),
                      get_slices_by_indices(gloss_tier, boundaries),
                      get_slices_by_indices(pos_tier, boundaries))

        words = []
        for form, morphs, glosses, pos in columns:
            # Initialize word object and set raw fields
            w = Word()
            w.set_form(form.strip(" ").strip("-"))
            w.set_raw_morphemes(morphs.strip(" ").strip("-"))
            w.set_raw_gloss(glosses.strip(" ").strip("-"))
            w.set_part_of_speech(pos.strip(" ").strip("-"))

            # Optionally inflate the word with morpheme objects
            if flagParseMorphemes:
                w.set_morphemes(self._build_morphemes(morphs, glosses, pos))

            words.append(w)
        return words

    def get_field_value_by_field_marker_and_column(self, field_marker, columnIndex):
        """Get the slice of a field that sits in a given column.

        The column boundaries are computed from the requested field
        itself.

        @param field_marker: marker of desired field
        @type field_marker: string
        @param columnIndex: 1-based index of the column
        @type columnIndex: integer
        @return: the column's slice of the field value, or None if the
            field is not present on this line
        @raise IndexError: if the field has no such column
        """
        fv = self.get_field_values_by_field_marker(field_marker)
        if fv is None:
            return None
        slices = get_slices_by_indices(fv, get_indices(fv))
        return slices[columnIndex - 1]
333 334 335 # -------------------------------------------------------- 336 # CLASS: Paragraph 337 # DESC: Object that represents a paragraph (i.e., a unit 338 # larger than a line) from an interlinear text. 339 # -------------------------------------------------------- 340
class Paragraph:
    r"""
    A unit of analysis above the line and below the text.

    Every text will have at least one paragraph and some will have
    more.  A paragraph is a label plus an ordered list of line objects.
    Identified by the field marker \id by default.
    """

    def __init__(self, label=None):
        """Create a Paragraph with the given label and no lines.

        @param label: identifier for the paragraph
        """
        self._label = label
        self._lines = []

    def add_line(self, line):
        """Append a line object to the paragraph's list of lines."""
        self._lines.append(line)

    def get_lines(self):
        """Return the paragraph's list of line objects."""
        return self._lines

    def get_label(self):
        """Return the identifier of the paragraph."""
        return self._label

    def set_label(self, label):
        """Replace the identifier of the paragraph."""
        self._label = label


# --------------------------------------------------------
# CLASS: Text
# DESC:  Object that represents an interlinear text and
#        provides functionality for its querying and
#        manipulation.
# --------------------------------------------------------

class Text(StandardFormat):
    """
    This class defines an interlinearized text, which consists of a
    collection of Paragraph objects.
    """

    def __init__(self,
                 file=None,
                 fm_line="ref",
                 fm_paragraph="id",
                 fm_morpheme="m",
                 fm_morpheme_gloss="g",
                 fm_word="w"):
        """Constructor for Text object.  All arguments are optional.

        @param file: filepath
        @type file: str
        @param fm_line: field marker identifying line (default: 'ref')
        @type fm_line: str
        @param fm_paragraph: field marker identifying paragraph (default: 'id')
        @type fm_paragraph: str
        @param fm_morpheme: field marker identifying morpheme tier (default: 'm')
        @type fm_morpheme: str
        @param fm_morpheme_gloss: field marker identifying morpheme gloss tier (default: 'g')
        @type fm_morpheme_gloss: str
        @param fm_word: field marker identifying word tier (default: 'w')
        @type fm_word: str
        """
        # NOTE(review): StandardFormat.__init__ is not invoked here, as
        # in the original code; confirm whether the base class needs it.
        self._file = file
        self._fm_line = fm_line
        self._fm_paragraph = fm_paragraph
        # Honour the caller's markers; these three arguments used to be
        # ignored in favour of hard-coded "m", "g" and "w".
        self._fm_morpheme = fm_morpheme
        self._fm_morpheme_gloss = fm_morpheme_gloss
        self._fm_word = fm_word
        self._paragraphs = []

    def get_lines(self):
        """Obtain a list of line objects (ignoring paragraph structure)."""
        lines = []
        for paragraph in self.get_paragraphs():
            lines.extend(paragraph.get_lines())
        return lines

    def get_paragraphs(self):
        """Obtain a list of paragraph objects."""
        return self._paragraphs

    def add_paragraph(self, paragraph):
        """Add paragraph object to list of paragraph objects.

        @param paragraph: paragraph to be added to text
        @type paragraph: Paragraph
        """
        self._paragraphs.append(paragraph)

    def getLineFM(self):
        """Get field marker that identifies a new line."""
        return self._fm_line

    def setLineFM(self, lineHeadFieldMarker):
        """Change default field marker that identifies new line."""
        self._fm_line = lineHeadFieldMarker

    def getParagraphFM(self):
        """Get field marker that identifies a new paragraph."""
        return self._fm_paragraph

    def setParagraphFM(self, paragraphHeadFieldMarker):
        """Change default field marker that identifies new paragraph."""
        self._fm_paragraph = paragraphHeadFieldMarker

    # The word/morpheme/gloss accessors below use the same attributes
    # that __init__ assigns.  They used to read differently named
    # attributes that only the setters created, so calling a getter
    # before its setter raised AttributeError.

    def getWordFM(self):
        """Get field marker that identifies word tier."""
        return self._fm_word

    def setWordFM(self, wordFieldMarker):
        """Change default field marker that identifies word tier."""
        self._fm_word = wordFieldMarker

    def getMorphemeFM(self):
        """Get field marker that identifies morpheme tier."""
        return self._fm_morpheme

    def setMorphemeFM(self, morphemeFieldMarker):
        """Change default field marker that identifies morpheme tier."""
        self._fm_morpheme = morphemeFieldMarker

    def getMorphemeGlossFM(self):
        """Get field marker that identifies morpheme gloss tier."""
        return self._fm_morpheme_gloss

    def setMorphemeGlossFM(self, morphemeGlossFieldMarker):
        """Change default field marker that identifies morpheme gloss tier."""
        self._fm_morpheme_gloss = morphemeGlossFieldMarker

    def get_file(self):
        """Get file path as string."""
        return self._file

    def set_file(self, file):
        """Change file path set upon initialization."""
        self._file = file

    def parse(self):
        """Parse specified Shoebox file into Text object.

        The raw (marker, value) fields of the file are walked in order:
        a paragraph marker starts a new Paragraph, a line marker starts
        a new Line, and any other field is attached to the current
        Line.  Fields that occur before the first line marker are
        skipped.  A line marker that occurs before any paragraph marker
        opens a Paragraph with no label.  A file without fields adds
        nothing to the text.
        """
        # Use low-level functionality to get raw fields and walk through them
        # NOTE(review): open() and raw_fields() are not defined in this
        # class; presumably inherited from StandardFormat.
        self.open(self._file)
        paragraph_marker = self.getParagraphFM()
        line_marker = self.getLineFM()
        paragraph = None
        line = None
        for fmarker, fvalue in self.raw_fields():
            if fmarker == paragraph_marker:
                if paragraph is not None:
                    # Flush the pending line into the paragraph it
                    # belongs to before closing that paragraph;
                    # otherwise it would end up in the next one.
                    if line is not None:
                        paragraph.add_line(line)
                    self.add_paragraph(paragraph)
                paragraph = Paragraph(fvalue)
                line = None
            elif fmarker == line_marker:
                if paragraph is None:
                    # Line before any paragraph marker: open an
                    # unlabelled paragraph to hold it.
                    paragraph = Paragraph()
                if line is not None:
                    paragraph.add_line(line)
                line = Line(fvalue)
            elif line is not None:
                line.add_field(Field(fmarker, fvalue))
        # Flush whatever is still open at end of input.  A pending line
        # implies an open paragraph, see the line-marker branch above.
        if line is not None:
            paragraph.add_line(line)
        if paragraph is not None:
            self.add_paragraph(paragraph)
513 514 515 # ------------------------------------------------------------- 516 # FUNCTION: get_indices 517 # ------------------------------------------------------------
def get_indices(str):
    r"""Find the indices of the leftmost boundaries of the units in a
    line of aligned text.

    A boundary is the position of any character other than a space
    that is either the first character of the text or directly
    preceded by a space.  For the aligned tiers below, the boundaries
    of the \um tier value are marked on top::

            0    5  8   12            <- indices
            |    |  |   |
        \sf dit  is een goede         <- surface form
        \um dit  is een goed -e       <- underlying morphemes
        \mg this is a   good -ADJ     <- morpheme gloss
        \gc DEM  V  ART ADJECTIVE -SUFF  <- grammatical categories
        \ft This is a good explanation.  <- free translation

    Only the space character counts as a separator.

    @param str: aligned text
    @type str: string
    @return: positions at which a unit starts, in ascending order
    @rtype: list of integers
    """
    boundaries = []
    # Pretend the text is preceded by a space so that a unit starting
    # at position 0 is recognised as well.
    previous = ' '
    for position, char in enumerate(str):
        if previous == ' ' and char != ' ':
            boundaries.append(position)
        previous = char
    return boundaries
559 560 561 # ------------------------------------------------------------- 562 # FUNCTION: get_slices_by_indices 563 # -------------------------------------------------------------
def get_slices_by_indices(str, indices):
    """Cut a string into the substrings that start at the given indices.

    Each substring runs from one index up to (but not including) the
    next index; the last substring runs to the end of the string.
    Text before the first index is not returned.  For example, given
    the arguments::
        str='antidisestablishmentarianism', indices=[0, 4, 7, 16, 20, 25]
    this function returns the list::
        ['anti', 'dis', 'establish', 'ment', 'arian', 'ism']

    @param str: text
    @type str: string
    @param indices: indices
    @type indices: list of integers
    @return: one substring per index (empty list if there are no indices)
    @rtype: list of strings
    """
    # Pair every start with the start that follows it; the final start
    # is paired with None, which makes the slice run to the end.
    stops = list(indices[1:]) + [None]
    return [str[begin:end] for begin, end in zip(indices, stops)]
588