1
2
3
4
5 from rules import KimmoArrowRule, KimmoFSARule
6 from pairs import KimmoPair, sort_subsets
7 from morphology import *
8 from fsa import FSA
9 import yaml
10
16
17
19 """
20 An object that represents the morphological rules for a language.
21
22 The KimmoRuleSet stores a list of rules which must all succeed when they
23 process a given string. These rules can be used for generating a surface
24 form from a lexical form, or recognizing a lexical form from a surface
25 form.
26 """
27 yaml_tag = '!KimmoRuleSet'
28 - def __init__(self, subsets, defaults, rules, morphology=None, null='0', boundary='#'):
29 """
30 Creates a KimmoRuleSet. You may not want to do this directly, but use
31 KimmoRuleSet.load to load one from a YAML file.
32
33 A KimmoRuleSet takes these parameters:
34 subsets: a dictionary mapping strings to lists of strings. The strings
35 in the map become subsets representing all of the strings in the
36 list.
37 defaults: a list of KimmoPairs that can appear without being
38 specifically mentioned by a rule.
39 rules: a list of KimmoFSARules or KimmoArrowRules that define the
40 two-level morphology rules.
41 morphology: a KimmoMorphology object that defines a lexicon of word
42 roots and affixes.
43 null: the symbol representing the empty string in rules.
44 boundary: the symbol that will always appear at the end of lexical and
45 surface forms.
46 """
47 self.debug = False
48 self._rules = list(rules)
49 self._pair_alphabet = set()
50 self._subsets = {}
51 self._null = null
52 self._boundary = boundary
53 self._subsets = subsets
54 self._morphology = morphology
55
56 for pair in defaults:
57
58 if self.is_subset(pair.input()) or self.is_subset(pair.output()):
59 raise ValueError('default ' + str(pair) + ' contains subset')
60 self._pair_alphabet.add( pair )
61
62 for r in self.rules():
63 for kp in r.pairs():
64 if (not (self.is_subset(kp.input()) or self.is_subset(kp.output()))):
65 self._pair_alphabet.add( kp )
66
68 "The list of rules in this ruleset."
69 return self._rules
71 "The dictionary defining subsets of characters of the language."
72 return self._subsets
74 "Is this string a subset representing other strings?"
75 return key[0] == '~' or key in self.subsets()
77 "The null symbol for this ruleset."
78 return self._null
80 """The morphological lexicon (as a KimmoMorphology). Could be None, if
81 the ruleset is only used for generation.
82 """
83 return self._morphology
84
85 - def _pairtext(self, char):
86 if char == self._null: return ''
87 else: return char
88
89 - def _generate(self, pairs, state_list, morphology_state=None, word='',
90 lexical=None, surface=None, features='', log=None):
91 if morphology_state:
92 morph = self._morphology
93 morphed = False
94 for state, feat in morph.next_states(morphology_state, word):
95 if feat is not None:
96 newfeat = combine_features(features, feat)
97 else: newfeat = features
98 for result in self._generate(pairs, state_list,
99 state, '', lexical, surface, newfeat, log):
100 yield result
101 morphed = True
102 if morphed: return
103 lexical_chars = list(morph.valid_lexical(morphology_state,
104 word, self._pair_alphabet)) + list(self._null)
105 else: lexical_chars = None
106
107 if lexical == '' or surface == '':
108 if morphology_state is None or morphology_state.lower() == 'end':
109
110 for r in range(len(self._rules)):
111 rule = self._rules[r]
112 state = state_list[r]
113 if state not in rule.fsa().finals():
114 return
115 if log:
116 log.succeed(pairs)
117 yield pairs, features
118 return
119
120 next_pairs = [p for p in self._pair_alphabet if
121 (lexical is None or lexical.startswith(self._pairtext(p.input()))) and
122 (surface is None or surface.startswith(self._pairtext(p.output())))]
123 for pair in next_pairs:
124 if pair.input() == self._null and pair.output() == self._null:
125 continue
126 if lexical_chars is not None and pair.input() not in lexical_chars:
127 continue
128 new_states = state_list[:]
129 for r in range(len(self._rules)):
130 rule = self._rules[r]
131 state = state_list[r]
132 next_state = self._advance_rule(rule, state, pair)
133 new_states[r] = next_state
134
135 newword = word + self._pairtext(pair.input())
136
137 if log:
138 log.step(pairs, pair, self._rules, state_list, new_states,
139 morphology_state, newword)
140 fail = False
141 for new_state in new_states:
142 if new_state is None or str(new_state) == '0'\
143 or str(new_state) == 'reject':
144 fail = True
145 break
146 if fail: continue
147 newlex, newsurf = lexical, surface
148 if lexical: newlex = lexical[len(self._pairtext(pair.input())):]
149 if surface: newsurf = surface[len(self._pairtext(pair.output())):]
150 for result in self._generate(pairs+[pair], new_states,
151 morphology_state, newword, newlex, newsurf, features, log):
152 yield result
153
155 """
156 Given a lexical form, return all possible surface forms that fit
157 these rules.
158
159 Optionally, a 'log' object such as TextTrace(1) can be provided; this
160 object will display to the user all the steps of the Kimmo algorithm.
161 """
162 if log: log.reset()
163 if not lexical.endswith(self._boundary):
164 lexical += self._boundary
165 got = self._generate([], [rule.fsa().start() for rule in
166 self._rules], lexical=lexical, log=log)
167 results = []
168 for (pairs, features) in got:
169 results.append(''.join(self._pairtext(pair.output()).strip(self._boundary) for pair in pairs))
170 return results
171
173 """
174 Given a surface form, return all possible lexical forms that fit
175 these rules. Because the components of a lexical form can include
176 features such as the grammatical part of speech, each surface form
177 is returned as a 2-tuple of (surface text, features).
178
179 Optionally, a 'log' object such as TextTrace(1) can be provided; this
180 object will display to the user all the steps of the Kimmo algorithm.
181 """
182 if log: log.reset()
183 if not surface.endswith(self._boundary):
184 surface += self._boundary
185 got = self._generate([], [rule.fsa().start() for rule in
186 self._rules], morphology_state='Begin', surface=surface, log=log)
187 results = []
188 for (pairs, features) in got:
189 results.append((''.join(self._pairtext(pair.input()).strip(self._boundary) for pair in pairs), features))
190 return results
191
193 trans = rule.fsa()._transitions[state]
194 expected_pairs = sort_subsets(trans.keys(), self._subsets)
195 for comppair in expected_pairs:
196 if comppair.includes(pair, self._subsets):
197 return rule.fsa().nextState(state, comppair)
198 return None
199
200 - def _test_case(self, input, outputs, arrow, method):
201 outputs.sort()
202 if arrow == '<=':
203 print '%s %s %s' % (', '.join(outputs), arrow, input)
204 else:
205 print '%s %s %s' % (input, arrow, ', '.join(outputs))
206 value = method(input)
207 if len(value) and isinstance(value[0], tuple):
208 results = [v[0] for v in value]
209 else: results = value
210 results.sort()
211 if outputs != results:
212 print ' Failed: got %s' % (', '.join(results) or 'no results')
213 return False
214 else: return True
215
217 """
218 Test a rule set by reading lines from a file.
219
220 Each line contains one or more lexical forms on the left, and one or
221 more surface forms on the right (separated by commas if there are more
222 than one). In between, there is an arrow (=>, <=, or <=>), indicating
223 whether recognition, generation, or both should be tested. Comments
224 can be marked with ;.
225
226 Each form should produce the exact list of forms on the other side of
227 the arrow; if one is missing, or an extra one is produced, the test
228 will fail.
229
230 Examples of test lines:
231 cat+s => cats ; test generation only
232 conoc+o <=> conozco ; test generation and recognition
233 <= conoco ; this string should fail to be recognized
234 """
235 f = open(filename)
236 try:
237 for line in f:
238 line = line[:line.find(';')].strip()
239 if not line: continue
240 arrow = None
241 for arrow_to_try in ['<=>', '=>', '<=']:
242 if line.find(arrow_to_try) >= 0:
243 lexicals, surfaces = line.split(arrow_to_try)
244 arrow = arrow_to_try
245 break
246 if arrow is None:
247 raise ValueError, "Can't find arrow in line: %s" % line
248 lexicals = lexicals.strip().split(', ')
249 surfaces = surfaces.strip().split(', ')
250 if lexicals == ['']: lexicals = []
251 if surfaces == ['']: surfaces = []
252 if arrow == '=>' or arrow == '<=>':
253 outputs = surfaces
254 for input in lexicals:
255 self._test_case(input, outputs, '=>', self.generate)
256 if arrow == '<=' or arrow == '<=>':
257 outputs = lexicals
258 for input in surfaces:
259 self._test_case(input, outputs, '<=', self.recognize)
260 finally:
261 f.close()
262
263 @classmethod
265 """
266 Loads a KimmoRuleSet from a parsed YAML node.
267 """
268 map = loader.construct_mapping(node)
269 return cls.from_yaml_dict(map)
270
271 @classmethod
272 - def load(cls, filename):
273 """
274 Loads a KimmoRuleSet from a YAML file.
275
276 The YAML file should contain a dictionary, with the following keys:
277 lexicon: the filename of the lexicon to load.
278 subsets: a dictionary mapping subset characters to space-separated
279 lists of symbols. One of these should usually be '@', mapping
280 to the entire alphabet.
281 defaults: a space-separated list of KimmoPairs that should be allowed
282 without a rule explicitly mentioning them.
283 null: the symbol that will be used to represent 'null' (usually '0').
284 boundary: the symbol that represents the end of the word
285 (usually '#').
286 rules: a dictionary mapping rule names to YAML representations of
287 those rules.
288
289 A rule can take these forms:
290 * a dictionary of states, where each state is a dictionary mapping
291 input pairs to following states. The start state is named 'start',
292 the state named 'reject' instantly rejects, and state names can be
293 prefixed with the word 'rejecting' so that they reject if the machine
294 ends in that state.
295
296 i-y-spelling:
297 start:
298 'i:y': step1
299 '@': start
300 rejecting step1:
301 'e:0': step2
302 '@': reject
303 rejecting step2:
304 '+:0': step3
305 '@': reject
306 rejecting step3:
307 'i:i': start
308 '@': reject
309
310
311 * a block of text with a DFA table in it, of the form used by
312 PC-KIMMO. The text should begin with a | so that YAML keeps your
313 line breaks, and the next line should be 'FSA'. State 0 instantly
314 rejects, and states with a period instead of a colon reject if the
315 machine ends in that state.
316 Examples:
317
318 i-y-spelling: | # this is the same rule as above
319 FSA
320 i e + i @
321 y 0 0 i @
322 1: 2 1 1 1 1
323 2. 0 3 0 0 0
324 3. 0 0 4 0 0
325 4. 0 0 0 1 0
326
327 epenthesis: |
328 FSA
329 c h s Csib y + # 0 @
330 c h s Csib i 0 # e @
331 1: 2 1 4 3 3 1 1 0 1
332 2: 2 3 3 3 3 1 1 0 1
333 3: 2 1 3 3 3 5 1 0 1
334 4: 2 3 3 3 3 5 1 0 1
335 5: 2 1 2 2 2 1 1 6 1
336 6. 0 0 7 0 0 0 0 0 0
337 7. 0 0 0 0 0 1 1 0 0
338
339 """
340 f = open(filename)
341 result = cls._from_yaml_dict(yaml.load(f))
342 f.close()
343 return result
344
345 @classmethod
347 lexicon = map.get('lexicon')
348 if lexicon:
349 lexicon = KimmoMorphology.load(lexicon)
350 subsets = map['subsets']
351 for key, value in subsets.items():
352 if isinstance(value, basestring):
353 subsets[key] = value.split()
354 defaults = map['defaults']
355 if isinstance(defaults, basestring):
356 defaults = defaults.split()
357 defaults = [KimmoPair.make(text) for text in defaults]
358 ruledic = map['rules']
359 rules = []
360 for (name, rule) in ruledic.items():
361 if isinstance(rule, dict):
362 rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets))
363 elif isinstance(rule, basestring):
364 if rule.strip().startswith('FSA'):
365 rules.append(KimmoFSARule.parse_table(name, rule, subsets))
366 else: rules.append(KimmoArrowRule(name, rule, subsets))
367 else:
368 raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % (name, rule)
369 return cls(subsets, defaults, rules, lexicon)
370
371 - def gui(self, startTk=True):
374 draw_graphs = gui
375
376 -class TextTrace(object):
377 """
378 Supply a TextTrace object as the 'log' argument to KimmoRuleSet.generate or
379 KimmoRuleSet.recognize, and it will output the steps it goes through
380 on a text terminal.
381 """
382 - def __init__(self, verbosity=1):
383 """
384 Creates a TextTrace. The 'verbosity' argument ranges from 1 to 3, and
385 specifies how much text will be output to the screen.
386 """
387 self.verbosity = verbosity
388 - def reset(self): pass
389 - def step(self, pairs, curr, rules, prev_states, states,
390 morphology_state, word):
391 lexical = ''.join(p.input() for p in pairs)
392 surface = ''.join(p.output() for p in pairs)
393 indent = ' '*len(lexical)
394 if self.verbosity > 2:
395 print '%s%s<%s>' % (indent, lexical, curr.input())
396 print '%s%s<%s>' % (indent, surface, curr.output())
397 for rule, state1, state2 in zip(rules, prev_states, states):
398 print '%s%s: %s => %s' % (indent, rule.name(), state1, state2)
399 if morphology_state:
400 print '%sMorphology: %r => %s' % (indent, word, morphology_state)
401 print
402 elif self.verbosity > 1:
403 print '%s%s<%s>' % (indent, lexical, curr.input())
404 print '%s%s<%s>' % (indent, surface, curr.output())
405 z = zip(prev_states, states)
406 if morphology_state:
407 z.append((word, morphology_state))
408 print indent + (" ".join('%s>%s' % (old, new) for old, new in z))
409 blocked = []
410 for rule, state in zip(rules, states):
411 if str(state).lower() in ['0', 'reject']:
412 blocked.append(rule.name())
413 if blocked:
414 print '%s[blocked by %s]' % (indent, ", ".join(blocked))
415 print
416 else:
417 print '%s%s<%s> | %s<%s>' % (indent, lexical, curr.input(),
418 surface, curr.output()),
419 if morphology_state:
420 print '\t%r => %s' % (word, morphology_state),
421 blocked = []
422 for rule, state in zip(rules, states):
423 if str(state).lower() in ['0', 'reject']:
424 blocked.append(rule.name())
425 if blocked:
426 print ' [blocked by %s]' % (", ".join(blocked)),
427 print
428
429 - def succeed(self, pairs):
430 lexical = ''.join(p.input() for p in pairs)
431 surface = ''.join(p.output() for p in pairs)
432 indent = ' '*len(lexical)
433
434 print '%s%s' % (indent, lexical)
435 print '%s%s' % (indent, surface)
436 print '%sSUCCESS: %s <=> %s' % (indent, lexical, surface)
437 print
438 print
439
441 """
442 Loads a ruleset from a file in YAML format.
443
444 See KimmoRuleSet.load for a more detailed description.
445 """
446 return KimmoRuleSet.load(filename)
447
449 "An example of loading rules into the GUI."
450 rules = load('turkish.yaml')
451 rules.gui()
452
454 """If a YAML file is specified on the command line, load it as a
455 KimmoRuleSet in the GUI."""
456 import sys
457 if len(sys.argv) > 1:
458 filename = sys.argv[1]
459 k = load(filename)
460 k.gui()
461
462 if __name__ == '__main__': main()
463