1
2
3
4
5
6
7 """
8 This module provides tools for parsing and manipulating the contents
9 of a Shoebox text without reference to its metadata.
10 """
11
12 import re
13 from utilities import Field, SequentialDictionary
14 from nltk_lite.corpora.toolbox import StandardFormat
15
16
17
18
19
20
21
23 """
24 This class defines a word object, which consists of fixed number
25 of attributes: a wordform, a gloss, a part of speech, and a list
26 of morphemes.
27 """
28
def __init__(self,
             form=None,
             gloss=None,
             morphemes=None,
             partOfSpeech=None):
    """Set up a Word from its (optional) analysed components.

    @param form: the surface form for a word
    @type form: string
    @param gloss: the gloss for a word
    @type gloss: string
    @param morphemes: list of Morpheme objects for a word
    @type morphemes: list
    @param partOfSpeech: the part of speech for a word
    @type partOfSpeech: string
    """
    # Analysed values supplied by the caller.
    self._form, self._gloss = form, gloss
    self._morphemes, self._partOfSpeech = morphemes, partOfSpeech
    # The raw counterparts start out unset; they are not constructor
    # arguments.
    self._rawGloss = self._rawMorphemes = self._rawPartOfSpeech = None
53
57
61
63 """Gives the gloss for a word as a string (without alignment spacing)."""
64 return self._gloss
65
67 """Change the gloss for a word."""
68 self._gloss = gloss
69
71 """Gives a list of Morpheme objects for a word."""
72 return self._morphemes
73
75 """Change a list of Morpheme objects for a word."""
76 self._morphemes = morphemes
77
79 """Gives the part of speech for a word as a string (without alignment spacing)."""
80 return self._partOfSpeech
81
83 """Change the part of speech for a word."""
84 self._partOfSpeech = partOfSpeech
85
88
90 self._rawGloss = rawGloss
91
93 return self._rawMorphemes
94
96 self._rawMorphemes = rawMorphemes
97
99 return self._rawPartOfSpeech
100
102 self._rawPartOfSpeech = rawPartOfSpeech
103
104
105
106
107
108
109
111 """
112 This class defines a morpheme object, which consists of fixed number
113 of attributes: a surface form, an underlying form, a gloss, and a
114 part of speech.
115 """
116
def __init__(self, form=None, gloss=None, partOfSpeech=None):
    """Set up a Morpheme; every attribute defaults to None.

    @param form: the form of the morpheme
    @param gloss: the gloss of the morpheme
    @param partOfSpeech: the part of speech of the morpheme
    """
    self._form, self._gloss, self._partOfSpeech = form, gloss, partOfSpeech
126
130
134
136 """Returns gloss for morpheme."""
137 return self._gloss
138
140 """Change gloss for morpheme."""
141 self._gloss = gloss
142
144 """Returns part of speech for morpheme."""
145 return self._partOfSpeech
146
148 """Change part of speech for morpheme."""
149 self._partOfSpeech = partOfSpeech
150
151
152
153
154
155
156
157
159 """This class defines a line of interlinear glossing, such as::
160
161 \\ref 9
162 \\t Vigei avapaviei atarisia.
163 \\m vigei ava -pa -vi -ei atari -sia
164 \\g 1.PL.INC go -PROG -1.PL.INCL -PRES fish -PURP
165 \\p PRO.PERS V.I -SUFF.V.3 -SUFF.VI.4 -SUFF.VI.5 V.I -SUFF.V.4
166 \\fp Yumi bai go kisim pis.
167 \\fe We're going fishing.
168
169 The tiers of a line are saved as a sequential dictionary with
170 all of its associated fields. Identified by the field marker \\ref
171 by default."""
172
175 """Constructor that initializes Line object."""
176 self._fields = SequentialDictionary()
177 self._label = label
178 return
179
181 """Add field to line."""
182 fm = field.get_marker()
183 fv = field.get_values()
184 self._fields[fm] = fv
185
187 """Obtain list of unique fields for the line."""
188 return self._fields.keys()
189
193 """
194 This method returns a particular field given a field marker.
195 Returns a blank string if field is not found.
196
197 @param field_marker: marker of desired field
198 @type field_marker: string
199 @param join_string: string used to join field values (default to blank string)
200 @type join_string: string
201 @rtype: string
202 """
203 try:
204 return join_string.join(self._fields[field_marker])
205 except KeyError:
206 return ""
207
209 """Obtain all fields for a line, given a field marker."""
210 try:
211 values = self._fields[field_marker]
212 if sep == None:
213 return values
214 else:
215 return sep.join(values)
216 except KeyError:
217 return None
218
219
220
221
222
223
224
226 """Obtain list of field values for the line."""
227 return self._fields.values()
228
230 """Obtain identifier for line."""
231 return self._label
232
def get_raw_text(self):
    """Obtain original line of text.

    @return: the value last stored with set_raw_text(), or None when no
        raw text has been stored on this object yet
    """
    # _rawtext only comes into existence in set_raw_text(), so fall back
    # to None instead of raising AttributeError on a fresh object.
    return getattr(self, "_rawtext", None)
236
238 """Set identifier for line."""
239 self._label = label
240
def set_raw_text(self, rawtext):
    """Store the original, unparsed text of the line.

    @param rawtext: the text to remember; stored as given
    """
    self._rawtext = rawtext
244
246 """Obtain a list of morpheme objects for the line."""
247 morphemes = []
248 indices = get_indices(self.getFieldValueByFieldMarker("m"))
249 print "%s" % indices
250 morphemeFormField = self.getFieldValueByFieldMarker("m")
251 morphemeGlossField = self.getFieldValueByFieldMarker("g")
252 morphemeFormSlices = get_slices_by_indices(morphemeFormField, indices)
253 morphemeGlossSlices = get_slices_by_indices(morphemeGlossField, indices)
254 for i in range(0, len(morphemeFormSlices)):
255 m = Morpheme()
256 m.set_form(morphemeFormSlices[i].strip(" ").strip("-"))
257 m.set_gloss(morphemeGlossSlices[i].strip(" ").strip("-"))
258 morphemes.append(m)
259 return morphemes
260
261 - def get_words(self, flagParseMorphemes=True):
324
326 """Get values for line, given a field and column index."""
327 fv = self.getFieldValueByFieldMarker(field_marker)
328 field_markers = self.getFieldMarkers()
329 sliceFieldMarker = field_markers[columnIndex-1]
330 indices = getIndices(self.getFieldValueByFieldMarker(field_marker))
331 slices = get_slices_by_indices(fv, indices)
332 return slices[columnIndex-1]
333
334
335
336
337
338
339
340
342 """
343 This class defines a unit of analysis above the line and below
344 the text. Every text will have at least one paragraph and some
345 will have more. Identified by the field marker \id by default.
346 """
347
350 """Constructor that initializes Paragraph object."""
351 self._lines = []
352 self._label = label
353 return
354
356 """Add line object to list of line objects for paragraph."""
357 self._lines.append(line)
358
360 """Obtain identifier for paragraph."""
361 return self._label
362
364 """Get list of line objects for paragraph."""
365 return self._lines
366
368 """Set identifier for paragraph."""
369 self._label = label
370
371
372
373
374
375
376
377
378
class Text(StandardFormat):
    """
    This class defines an interlinearized text, which consists of a
    collection of Paragraph objects.
    """

    def __init__(self,
                 file=None,
                 fm_line="ref",
                 fm_paragraph="id",
                 fm_morpheme="m",
                 fm_morpheme_gloss="g",
                 fm_word="w"):
        """Constructor for Text object.  All arguments are optional.

        @param file: filepath
        @type file: str
        @param fm_line: field marker identifying line (default: 'ref')
        @type fm_line: str
        @param fm_paragraph: field marker identifying paragraph (default: 'id')
        @type fm_paragraph: str
        @param fm_morpheme: field marker identifying morpheme tier (default: 'm')
        @type fm_morpheme: str
        @param fm_morpheme_gloss: field marker identifying morpheme gloss tier (default: 'g')
        @type fm_morpheme_gloss: str
        @param fm_word: field marker identifying word tier (default: 'w')
        @type fm_word: str
        """
        # NOTE(review): the StandardFormat constructor is not invoked here
        # (it never was); confirm the base class needs no initialisation.
        self._file = file
        self._fm_line = fm_line
        self._fm_paragraph = fm_paragraph
        # Honour the caller's markers instead of hard-coding the defaults.
        self._fm_morpheme = fm_morpheme
        self._fm_morpheme_gloss = fm_morpheme_gloss
        self._fm_word = fm_word
        self._paragraphs = []

    def get_lines(self):
        """Obtain a list of line objects (ignoring paragraph structure)."""
        return [line
                for paragraph in self.get_paragraphs()
                for line in paragraph.get_lines()]

    def get_paragraphs(self):
        """Obtain a list of paragraph objects."""
        return self._paragraphs

    def add_paragraph(self, paragraph):
        """Add paragraph object to list of paragraph objects.

        @param paragraph: paragraph to be added to text
        @type paragraph: Paragraph
        """
        self._paragraphs.append(paragraph)

    # The accessors below all read and write the attributes created in
    # __init__, so a getter is usable before its setter has been called.

    def getLineFM(self):
        """Get field marker that identifies a new line."""
        return self._fm_line

    def setLineFM(self, lineHeadFieldMarker):
        """Change default field marker that identifies new line."""
        self._fm_line = lineHeadFieldMarker

    def getParagraphFM(self):
        """Get field marker that identifies a new paragraph."""
        return self._fm_paragraph

    def setParagraphFM(self, paragraphHeadFieldMarker):
        """Change default field marker that identifies new paragraph."""
        self._fm_paragraph = paragraphHeadFieldMarker

    def getWordFM(self):
        """Get field marker that identifies word tier."""
        return self._fm_word

    def setWordFM(self, wordFieldMarker):
        """Change default field marker that identifies word tier."""
        self._fm_word = wordFieldMarker

    def getMorphemeFM(self):
        """Get field marker that identifies morpheme tier."""
        return self._fm_morpheme

    def setMorphemeFM(self, morphemeFieldMarker):
        """Change default field marker that identifies morpheme tier."""
        self._fm_morpheme = morphemeFieldMarker

    def getMorphemeGlossFM(self):
        """Get field marker that identifies morpheme gloss tier."""
        return self._fm_morpheme_gloss

    def setMorphemeGlossFM(self, morphemeGlossFieldMarker):
        """Change default field marker that identifies morpheme gloss tier."""
        self._fm_morpheme_gloss = morphemeGlossFieldMarker

    def get_file(self):
        """Get file path as string."""
        return self._file

    def set_file(self, file):
        """Change file path set upon initialization."""
        self._file = file

    def parse(self):
        """Parse specified Shoebox file into Text object.

        Walks the raw (marker, value) fields of the file in order.  A
        paragraph marker starts a new Paragraph, a line marker starts a new
        Line, and every other field is added to the Line currently being
        built.  Fields that occur before the first line marker are ignored.
        A line marker that occurs before any paragraph marker is collected
        into a paragraph whose label is None.
        """
        self.open(self._file)
        paragraph_fm = self.getParagraphFM()
        line_fm = self.getLineFM()
        paragraph, line = None, None
        for fmarker, fvalue in self.raw_fields():
            if fmarker == paragraph_fm:
                if paragraph is not None:
                    # Close the pending line first so that it stays with the
                    # paragraph it was read under.
                    if line is not None:
                        paragraph.add_line(line)
                        line = None
                    self.add_paragraph(paragraph)
                paragraph = Paragraph(fvalue)
            elif fmarker == line_fm:
                if paragraph is None:
                    # No paragraph marker seen yet: open an unlabelled one
                    # rather than failing on a missing paragraph.
                    paragraph = Paragraph(None)
                if line is not None:
                    paragraph.add_line(line)
                line = Line(fvalue)
            elif line is not None:
                line.add_field(Field(fmarker, fvalue))
        # Flush whatever is still open once the input is exhausted.
        if paragraph is not None:
            if line is not None:
                paragraph.add_line(line)
            self.add_paragraph(paragraph)
513
514
515
516
517
def get_indices(str):
    """Find the indices of the leftmost boundaries of the units in a
    line of aligned text.

    A unit is a maximal run of characters other than the space
    character; the index of the first character of every such run is
    reported.  For example, for the aligned morpheme tier::

        0   4  7   11   16     <- indices
        |   |  |   |    |
        dit is een goed -e

    the result is [0, 4, 7, 11, 16].

    The function walks through the line character by character and
    records a position whenever a non-space character directly follows
    a space (or is the first character of the line).  Only ' ' counts
    as a separator; tabs and other whitespace are treated as ordinary
    characters.

    @param str: aligned text; None or an empty string yields no indices
    @type str: string
    @return: indices of the first character of every unit, ascending
    @rtype: list of integers
    """
    if not str:
        return []
    indices = []
    in_gap = True       # True while no unit is open at the current position
    for position, char in enumerate(str):
        if char == ' ':
            in_gap = True
        elif in_gap:
            indices.append(position)
            in_gap = False
    return indices
559
560
561
562
563
def get_slices_by_indices(str, indices):
    """Given a string and a list of indices, return a list of the
    substrings defined by those indices.  For example, given the
    arguments::
        str='antidisestablishmentarianism', indices=[0, 4, 7, 16, 20, 25]
    this function returns the list::
        ['anti', 'dis', 'establish', 'ment', 'arian', 'ism']

    Every slice runs from one index up to (but not including) the next;
    the last slice runs to the end of the string.  Text in front of the
    first index is not returned, and an empty list of indices gives an
    empty result.

    @param str: text
    @type str: string
    @param indices: start offsets of the slices, in ascending order
    @type indices: list of integers
    @return: one substring per index
    @rtype: list of strings
    """
    if not indices:
        return []
    # Pair each start with the following start; None leaves the final
    # slice open-ended so it extends to the end of the string.
    ends = list(indices[1:]) + [None]
    return [str[start:end] for start, end in zip(indices, ends)]
588