Package nltk_lite :: Package contrib :: Module kimmo
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.kimmo

   1  # Natural Language Toolkit: Kimmo Morphological Analyzer 
   2  # 
   3  # Copyright (C) 2001-2006 MIT 
   4  # Author: Carl de Marcken <carl@demarcken.org> 
   5  #         Beracah Yankama <beracah@mit.edu> 
   6  #         Robert Berwick <berwick@ai.mit.edu> 
   7  # 
   8  # URL: <http://nltk.sf.net> 
   9  # For license information, see LICENSE.TXT 
  10   
  11  """ 
  12  Kimmo Morphological Analyzer.  Supports proper recognizer completion, 
  13  generator ordering, kimmo control class, loader for own file format, 
  14  also .rul compatible with old pckimmo. 
  15  """ 
  16   
  17  # TODO: remove Unix dependencies 
  18   
  19  import Tkinter 
  20  import os, re, sys, types, string, glob, time, md5 
  21   
  22  from nltk_lite.contrib.fsa import * 
  23  from nltk_lite.corpora import get_basedir 
  24  from nltk_lite import tokenize 
  25   
  26  ############################# KIMMO GUI ################################## 
  27  """ 
  28  A gui for input of generative & recognition models 
  29  need 3 input boxes, one for text input, lexicon box, rules box 
  30  one output box? 
  31   
  32  need alternations rules and lexicon 
  33  plus 1 input test & recognition box. 
  34   
  35  we want to "step" through alternations 
  36  we want to "show" the rules that fire. 
  37  and we want batch mode, big file, or big input test with output. 
  38  """ 
  39  ########################################################################### 
  40  from ScrolledText import ScrolledText 
  41   
class KimmoGUI:
    """
    Tkinter front end for the Kimmo morphological analyzer.

    The window holds a lexicon pane, a rules pane and a results pane,
    an entry for a single word to generate or recognize, an entry for a
    batch file, and menus to load/save the panes, pick a ``.cfg`` preset,
    graph the lexicon or a rule FSA, open a tracing window and show help.

    Constructing an instance builds the window and enters the Tk main
    loop, so the constructor only returns once the GUI is closed.
    """

    def __init__(self, grammar, text, title='Kimmo Interface v1.78'):
        """
        Build the main window, load ``kimmo.cfg`` and run the main loop.

        @param grammar: accepted for interface compatibility; not used here.
        @param text: accepted for interface compatibility; not used here.
        @param title: title of the top-level window.
        """
        # Everything destroy() inspects must exist before anything can
        # fail, otherwise the cleanup in the except clause below would
        # raise AttributeError and hide the real error.
        self._root = None
        self.phOut = None
        self.root = None    # legacy attribute name, kept for old callers
        try:
            self.dbgTracing = None
            self.highlightIds = []
            self.tagId = 0

            # digests of the pane contents the current engine was built from
            self.lexmd5 = None
            self.rulemd5 = None
            self.instancedebug = None
            self.lexicalGraphWindow = None

            self.rulfilename = ''
            self.lexfilename = ''
            self.altfilename = ''
            self.kimmoResultFile = ''
            self.resfilename = ''   # read by saveAll(); must always exist

            self.helpFilename = 'kimmo.help'

            self._root = Tkinter.Tk()
            self._root.title(title)

            ctlbuttons = Tkinter.Frame(self._root)
            ctlbuttons.pack(side='top', fill='x')
            level1 = Tkinter.Frame(self._root)
            level1.pack(side='top', fill='none')
            Tkinter.Frame(self._root).pack(side='top', fill='none')
            level2 = Tkinter.Frame(self._root)
            level2.pack(side='top', fill='x')
            buttons = Tkinter.Frame(self._root)
            buttons.pack(side='top', fill='none')
            batchFrame = Tkinter.Frame(self._root)
            batchFrame.pack(side='top', fill='x')

            self.batchpath = Tkinter.StringVar()
            Tkinter.Label(batchFrame, text="Batch File:").pack(side='left')
            Tkinter.Entry(batchFrame, background='white', foreground='black',
                          width=30,
                          textvariable=self.batchpath).pack(side='left')
            Tkinter.Button(batchFrame, text='Go!',
                           background='#a0c0c0', foreground='black',
                           command=self.batch).pack(side='left')

            # one-line status/error field; change to a window eventually.
            self.debugWin = Tkinter.StringVar()
            Tkinter.Entry(batchFrame, background='grey', foreground='red',
                          width=30,
                          textvariable=self.debugWin).pack(side='right')

            self.wordIn = Tkinter.StringVar()
            Tkinter.Label(level2,
                          text="Generate or Recognize:").pack(side='left')
            Tkinter.Entry(level2, background='white', foreground='black',
                          width=30,
                          textvariable=self.wordIn).pack(side='left')

            lexiconFrame = Tkinter.Frame(level1)
            Tkinter.Label(lexiconFrame,
                          text="Lexicon & Alternations").pack(side='top',
                                                              fill='x')
            self.lexicon = ScrolledText(lexiconFrame, background='white',
                                        foreground='black', width=50,
                                        height=36, wrap='none')
            self._attachHScroll(lexiconFrame, self.lexicon)
            self.lexicon.pack(side='top')

            midFrame = Tkinter.Frame(level1)
            rulesFrame = Tkinter.Frame(midFrame)
            rulesFrame.pack(side='top', fill='x')
            Tkinter.Label(rulesFrame, text="Rules/Subsets").pack(side='top',
                                                                 fill='x')
            self.rules = ScrolledText(rulesFrame, background='white',
                                      foreground='black', width=60,
                                      height=19, wrap='none')
            self._attachHScroll(rulesFrame, self.rules)
            self.rules.pack(side='top')

            midbetweenFrame = Tkinter.Frame(midFrame)
            midbetweenFrame.pack(side='top', fill='x')

            # self.results is created below; the lambda only looks it up
            # when the button is pressed.
            Tkinter.Button(midbetweenFrame, text='clear',
                           background='#f0f0f0', foreground='black',
                           command=lambda start=1.0, end=Tkinter.END:
                               self.results.delete(start, end)
                           ).pack(side='right')

            Tkinter.Label(midbetweenFrame,
                          text="Results ").pack(side='right')

            self.results = ScrolledText(midFrame, background='white',
                                        foreground='black', width=60,
                                        height=13, wrap='none')
            self._attachHScroll(midFrame, self.results)
            self.results.pack(side='bottom')

            # Disabled: a separate alternations pane.
            # alternationFrame = Tkinter.Frame(level1)
            # Tkinter.Label(alternationFrame, text="Alternations").pack(
            #     side='top', fill='x')
            # self.alternation = ScrolledText(alternationFrame,
            #     background='white', foreground='black', width=1,
            #     wrap='none')
            # self.alternation.pack(side='top')

            Tkinter.Button(ctlbuttons, text='Quit',
                           background='#a0c0c0', foreground='black',
                           command=self.destroy).pack(side='left')

            self.loadMenuButton = Tkinter.Menubutton(
                ctlbuttons, text='Load', background='#a0c0c0',
                foreground='black', relief='raised')
            self.loadMenuButton.pack(side='left')
            self.loadMenu = Tkinter.Menu(self.loadMenuButton, tearoff=0)
            self.loadMenu.add_command(
                label='Load Lexicon', underline=0,
                command=lambda filetype='.lex', targetWindow=self.lexicon,
                               tf='l':
                    self.loadTypetoTarget(filetype, targetWindow, tf))
            self.loadMenu.add_command(
                label='Load Rules', underline=0,
                command=lambda filetype='.rul', targetWindow=self.rules,
                               tf='r':
                    self.loadTypetoTarget(filetype, targetWindow, tf))
            self.loadMenuButton["menu"] = self.loadMenu

            self.saveMenuButton = Tkinter.Menubutton(
                ctlbuttons, text='Save', background='#a0c0c0',
                foreground='black', relief='raised')
            self.saveMenuButton.pack(side='left')
            self.saveMenu = Tkinter.Menu(self.saveMenuButton, tearoff=0)
            # The file names are read when the menu entry is chosen.
            # Binding them as lambda defaults would freeze them at the
            # empty strings they hold right now.
            self.saveMenu.add_command(
                label='Save Lexicon', underline=0,
                command=lambda: self.writeToFilefromWindow(
                    self.lexfilename, self.lexicon, 'w', 0, 'l'))
            self.saveMenu.add_command(
                label='Save Rules', underline=0,
                command=lambda: self.writeToFilefromWindow(
                    self.rulfilename, self.rules, 'w', 0, 'r'))
            self.saveMenu.add_command(
                label='Save Results', underline=0,
                command=lambda: self.writeToFilefromWindow(
                    '.results', self.results, 'w', 0))
            self.saveMenu.add_command(label='Save All', underline=0,
                                      command=self.saveAll)
            self.saveMenuButton["menu"] = self.saveMenu

            Tkinter.Label(ctlbuttons, text=" Preset:").pack(side='left')

            self.configValue = Tkinter.StringVar()
            self.configsMenuButton = Tkinter.Menubutton(
                ctlbuttons, text='Configs', background='#a0c0c0',
                foreground='black', relief='raised')
            self.configsMenuButton.pack(side='left')
            self.configsMenu = Tkinter.Menu(self.configsMenuButton,
                                            tearoff=0)

            # One menu entry per .cfg file in the current directory.
            # !!! this does not handle other directories (no expansion of
            # ~ or of given home dirs is done here).
            currentconfigfiles = glob.glob('*.cfg')
            for cfgname in currentconfigfiles:
                self.configsMenu.add_command(
                    label=cfgname, underline=0,
                    command=lambda newname=cfgname:
                        self.configLoader(newname))

            if not currentconfigfiles:
                self.configsMenuButton.configure(text='<none>')

            self.configsMenuButton["menu"] = self.configsMenu

            # Keep the widgets themselves; pack() returns None.
            self.tracingbtn = Tkinter.Button(
                ctlbuttons, text='Tracing',
                background='#fff0f0', foreground='black',
                command=lambda: self.create_destroyDebugTracing())
            self.tracingbtn.pack(side='right')

            self.graphMenuButton = Tkinter.Menubutton(
                ctlbuttons, text='Graph', background='#d0d0e8',
                foreground='black', relief='raised')
            self.graphMenuButton.pack(side='right')
            self.graphMenu = Tkinter.Menu(self.graphMenuButton, tearoff=0)
            self.graphMenu.add_command(
                label='Graph Lexicon', underline=0,
                command=lambda which='l': self.graph(which))
            self.graphMenu.add_command(
                label='Graph FSA Rules', underline=0,
                command=lambda which='r': self.graph(which))
            self.graphMenuButton["menu"] = self.graphMenu

            self.helpbtn = Tkinter.Button(
                ctlbuttons, text='Help',
                background='#f0fff0', foreground='black',
                command=self.kimmoHelp)
            self.helpbtn.pack(side='right')

            lexiconFrame.pack(side='left')
            midFrame.pack(side='left')

            Tkinter.Button(level2, text='Generate',
                           background='#a0c0c0', foreground='black',
                           command=self.generate).pack(side='left')
            Tkinter.Button(level2, text='Recognize',
                           background='#a0c0c0', foreground='black',
                           command=self.recognize).pack(side='left')

            # The engine is created lazily by initKimmo().
            self.kimmoinstance = None

            self.kimmoResultFile = ''
            self.traceWindow = ''

            self.debug = False

            self.configLoader('kimmo.cfg')

            # capture all print messages
            self.phOut = PrintHook()
            self.phOut.Start(self.capturePrint)

            # Enter mainloop.
            Tkinter.mainloop()
        except:
            # Deliberately bare: tear the window down for any failure,
            # then let the exception propagate unchanged.
            print('Error creating Tree View')
            self.destroy()
            raise

    def _attachHScroll(self, parent, textWidget):
        """
        Pack a horizontal scrollbar at the bottom of C{parent} and wire
        it to C{textWidget}.  The caller packs the text widget itself.
        """
        scroll = Tkinter.Scrollbar(parent, orient='horizontal',
                                   command=textWidget.xview)
        scroll.pack(side='bottom', fill='x')
        textWidget.configure(xscrollcommand=scroll.set)

    def _saveRules(self, *args):
        "Ask for a file name and save the rules pane (menu/key handler)."
        self.writeToFilefromWindow(self.rulfilename, self.rules, 'w', 0, 'r')

    def init_menubar(self):
        """
        Install a 'File' menubar with Save/Load/Clear/Exit entries and
        the matching Ctrl-s/o/r/q key bindings on the root window.
        """
        menubar = Tkinter.Menu(self._root)

        filemenu = Tkinter.Menu(menubar, tearoff=0)
        # The combined save() handler this used to call is disabled (see
        # the commented-out code after saveAll), so bind the rules save.
        filemenu.add_command(label='Save Rules', underline=0,
                             command=self._saveRules, accelerator='Ctrl-s')
        self._root.bind('<Control-s>', self._saveRules)
        filemenu.add_command(label='Load Rules', underline=0,
                             command=self.load, accelerator='Ctrl-o')
        self._root.bind('<Control-o>', self.load)
        filemenu.add_command(label='Clear Rules', underline=0,
                             command=self.clear, accelerator='Ctrl-r')
        self._root.bind('<Control-r>', self.clear)
        filemenu.add_command(label='Exit', underline=1,
                             command=self.destroy, accelerator='Ctrl-q')
        self._root.bind('<Control-q>', self.destroy)
        menubar.add_cascade(label='File', underline=0,
                            menu=filemenu)
        self._root.config(menu=menubar)

    def guiError(self, *args):
        """
        Show C{args[0]}, stripped of surrounding whitespace, in the
        status field.  Called with no argument it clears the field.
        """
        message = ''
        if args and args[0] is not None:
            message = str(args[0]).strip()
        self.debugWin.set(message)

    def create_destroyDebugTracing(self, *args):
        """
        Toggle the tracing/debug window.

        If the window exists it is destroyed and C{self.debug} is turned
        off; otherwise it is created (with 'clear' and 'Save' buttons and
        a scrolled text pane) and C{self.debug} is turned on.
        """
        if self.dbgTracing:
            self.dbgTracing.destroy()
            self.dbgTracing = None
            self.debug = False
            return

        try:
            # TODO: move this into its own dialog class.
            self.dbgTracing = Tkinter.Toplevel()
            self.dbgTracing.title("Tracing/Debug")
            dbgTraceFrame2 = Tkinter.Frame(self.dbgTracing)
            dbgTraceFrame2.pack(side='top', fill='x')
            dbgTraceFrame = Tkinter.Frame(self.dbgTracing)
            dbgTraceFrame.pack(side='top', fill='x', expand='yes')
            self.traceWindow = ScrolledText(dbgTraceFrame,
                                            background='#f4f4f4',
                                            foreground='#aa0000',
                                            width=45, height=24,
                                            wrap='none')

            Tkinter.Button(dbgTraceFrame2, text='clear',
                           background='#a0c0c0', foreground='black',
                           command=lambda start=1.0, end=Tkinter.END:
                               self.traceWindow.delete(start, end)
                           ).pack(side='right')
            # Look the result file name up when Save is pressed so a
            # config loaded after this window was opened is honoured.
            Tkinter.Button(dbgTraceFrame2, text='Save',
                           background='#a0c0c0', foreground='black',
                           command=lambda: self.writeToFilefromWindow(
                               self.kimmoResultFile, self.traceWindow,
                               'w', 0)
                           ).pack(side='left')

            self._attachHScroll(dbgTraceFrame, self.traceWindow)
            self.traceWindow.pack(side='bottom')

            self.debug = True

            # closing the window through the window manager runs the
            # same toggle, so the state is cleaned up.
            self.dbgTracing.protocol("WM_DELETE_WINDOW",
                                     self.create_destroyDebugTracing)
        except:
            print('Error creating Tree View')
            # Toplevel() itself may have failed, leaving nothing to
            # destroy.
            if self.dbgTracing is not None:
                self.dbgTracing.destroy()
            self.dbgTracing = None
            self.debug = False
            raise

    def writeToFilefromWindow(self, filename, windowName, mode, auto,
                              wt=None):
        """
        Write the full contents of a text pane to a file.

        @param filename: target file name; also the initial name offered
            in the save dialog.
        @param windowName: text widget whose contents are written.
        @param mode: mode passed to C{open}; 'w' is used when empty.
        @param auto: if true and both C{windowName} and C{filename} are
            set, write without asking; otherwise a save dialog is shown.
        @param wt: 'l' or 'r' to record the chosen name as the current
            lexicon or rules file name; anything else records nothing.
        """
        if not (auto and windowName and filename):
            from tkFileDialog import asksaveasfilename
            ftypes = [('Text file', '.txt'), ('Rule file', '.rul'),
                      ('Lexicon file', '.lex'),
                      ('Alternations file', '.alt'),
                      ('All files', '*')]
            filename = asksaveasfilename(filetypes=ftypes,
                                         defaultextension='',
                                         initialfile=filename)

        if not filename:
            self.guiError('Need File Name')
            return

        f = open(filename, mode or 'w')
        try:
            f.write(windowName.get(1.0, Tkinter.END))
        finally:
            f.close()

        if wt == 'l':
            self.lexfilename = filename
        elif wt == 'r':
            self.rulfilename = filename

    # TODO: create a window update class
    # TODO: and a window resize class

    def saveAll(self, *args):
        """
        Save the lexicon, rules and results panes to their current file
        names.  A pane whose file name is empty falls back to the save
        dialog (see L{writeToFilefromWindow}).
        """
        # automatic write
        self.writeToFilefromWindow(self.lexfilename, self.lexicon, 'w', 1)
        self.writeToFilefromWindow(self.rulfilename, self.rules, 'w', 1)
        # self.writeToFilefromWindow(self.altfilename, self.alternation,
        #                            'w', 1)
        self.writeToFilefromWindow(self.resfilename, self.results, 'w', 1)

    # Disabled: combined save of rules and lexicon into one text file.
    # def save(self, *args):
    #     "Save a rule/lexicon set to a text file"
    #     from tkFileDialog import asksaveasfilename
    #     ftypes = [('Text file', '.txt'),
    #               ('All files', '*')]
    #     filename = asksaveasfilename(filetypes=ftypes,
    #                                  defaultextension='.txt')
    #     if not filename: return
    #     f = open(filename, 'w')
    #     f.write('---- Rules -----\n%s\n'
    #             % '\n'.join(self.getRules(False)))
    #     f.write('---- Lexicon -----\n%s\n'
    #             % '\n'.join(self.getLexicon(False)))
    #     f.close()

    def configLoader(self, *args):
        """
        Load a configuration file and the files it names.

        C{args[0]} is the config file name.  Each non-blank line that
        does not start with '#' or ';' has the form C{key: value}, with
        key one of C{lexicon}, C{rules}, C{results} or C{batch}; any
        other key is reported through L{guiError}.  '~' in the value is
        expanded.
        """
        filename = ''
        if args:
            filename = args[0]
        print(filename)

        if not filename:
            self.guiError('Empty Filename')
            return

        f = read_kimmo_file(filename, self)
        try:
            lines = f.readlines()
        finally:
            f.close()

        # clear all panes
        self.clear()

        # now set the menu
        self.configsMenuButton.configure(text=filename)

        # Reset the file names so that nothing belonging to a previous
        # config gets overwritten; they are set again by the lines below
        # or when the user loads a file.
        self.rulfilename = ''
        self.lexfilename = ''
        self.altfilename = ''
        self.kimmoResultFile = ''
        self.resfilename = ''
        self.batchpath.set('')

        for line in lines:
            line = line.strip()
            if not line or line[0] in '#;':
                continue    # blank line or comment

            # Split on the first ':' only, so a value that itself
            # contains a colon is kept whole.
            fields = [field.strip() for field in line.split(":", 1)]
            key = fields[0]
            value = ''
            if len(fields) > 1:
                value = os.path.expanduser(fields[1])

            if key == 'lexicon':
                self.lexfilename = self.loadIntoWindow(value, self.lexicon)
            elif key == 'rules':
                self.rulfilename = self.loadIntoWindow(value, self.rules)
            # elif key == 'alternations':
            #     self.loadIntoWindow(value, self.alternation)
            #     self.altfilename = value
            elif key == 'results':
                self.kimmoResultFile = value
                self.resfilename = value
            elif key == 'batch':
                self.batchpath.set(value)
            else:
                self.guiError('unknown line :' + line)

    def loadIntoWindow(self, filename, windowField):
        """
        Load a rule/lexicon file directly into the given window pane.

        Every line is stripped of surrounding whitespace and the pane's
        previous contents are replaced.

        @return: the ('~'-expanded) file name, or '' if none was given.
        """
        if not filename:
            return ''

        filename = os.path.expanduser(filename)
        f = read_kimmo_file(filename, self)
        try:
            lines = f.readlines()
        finally:
            f.close()

        # empty the window only now that the file has been read
        windowField.delete(1.0, Tkinter.END)
        windowField.insert(1.0, '\n'.join([line.strip() for line in lines]))
        return filename

    def loadTypetoTarget(self, fileType, targetWindow, ftype=None):
        """
        Ask for a file of the given type and load it into a pane.

        @param fileType: extension offered by the open dialog, e.g. '.lex'.
        @param targetWindow: text widget that receives the file.
        @param ftype: 'l' or 'r' to record the name as the current
            lexicon or rules file name.
        """
        if not (fileType and targetWindow):
            return

        from tkFileDialog import askopenfilename
        ftypes = [(fileType, fileType)]
        filename = askopenfilename(filetypes=ftypes,
                                   defaultextension=fileType)
        if not filename:
            return  # dialog cancelled: leave panes and preset label alone

        self.loadIntoWindow(filename, targetWindow)

        # the panes no longer correspond to a preset
        self.configsMenuButton.configure(text='<none>')

        if ftype == 'l':
            self.lexfilename = filename
        elif ftype == 'r':
            self.rulfilename = filename

    def load(self, *args):
        """
        Load the default files 'kimmo.lex' and 'kimmo.rul' (and
        'kimmo.alt' when an alternations pane exists) into their panes.
        """
        self.clear()
        self.loadIntoWindow('kimmo.lex', self.lexicon)

        # The alternations pane is currently disabled in __init__; only
        # use it if something has created it.
        alternationPane = getattr(self, 'alternation', None)
        if alternationPane is not None:
            self.loadIntoWindow('kimmo.alt', alternationPane)

        self.loadIntoWindow('kimmo.rul', self.rules)

    def clear(self, *args):
        "Clears the lexicon, rules and results panes"
        self.lexicon.delete(1.0, Tkinter.END)
        self.rules.delete(1.0, Tkinter.END)
        # self.alternation.delete(1.0, Tkinter.END)
        self.results.delete(1.0, Tkinter.END)

    def destroy(self, *args):
        """
        Stop the print hook and destroy the main window.  Safe to call
        more than once and on a partially constructed instance.
        """
        if self._root is None:
            return
        if self.phOut is not None:
            self.phOut.Stop()
        self._root.destroy()
        self._root = None

    # for single stepping through a trace.
    # need to make the kimmo class capable of being interrupted & resumed.
    def step(self, *args):
        "Placeholder for stepping through a trace."
        print('a')

    def singlestep(self, *args):
        "Placeholder for single-stepping through a trace."
        print('a')

    def batch(self, *args):
        """
        Run every line of the batch file through the engine and put the
        results, under a time-stamped separator, at the top of the
        results pane.

        Line formats: C{g word} generates, C{r word} recognizes, a bare
        word containing '+' is generated, any other bare word is
        recognized.  Lines starting with '#' or ';' are copied to the
        output unchanged; blank lines are skipped.
        """
        filename = self.batchpath.get()
        if not filename:
            return

        f = read_kimmo_file(filename, self)
        try:
            lines = f.readlines()
        finally:
            f.close()

        self.initKimmo()

        # space the results out a little
        self.results.insert(1.0, '\n')

        whitespace = re.compile(r"\s+")
        collected = []
        for line in lines:
            singleword = line.strip()
            if not singleword:
                continue    # ignore blank lines
            if singleword[0] in '#;':
                collected.append(singleword + '\n')    # commented
                continue

            linevals = whitespace.split(singleword)
            if linevals[0] == 'g' and len(linevals) == 2:
                batch_result = self.kimmoinstance.generate(linevals[1])
            elif linevals[0] == 'r' and len(linevals) == 2:
                batch_result = self.kimmoinstance.recognize(linevals[1])
            elif '+' in singleword:
                batch_result = self.kimmoinstance.generate(singleword)
            else:
                batch_result = self.kimmoinstance.recognize(singleword)

            # if a valid result
            if batch_result:
                collected.append(''.join(batch_result) + '\n')

        # place a separator between results
        stamp = time.strftime("%a, %d %b %Y %I:%M %p", time.gmtime())
        self.results.insert(1.0, '----- ' + stamp + ' -----\n')
        self.results.insert(2.0, ''.join(collected))
        self.results.see(1.0)

        # traceWindow keeps pointing at a destroyed widget once the
        # tracing window is closed; dbgTracing is the reliable flag.
        if self.dbgTracing:
            self.highlightMatches(' BLOCKED', self.traceWindow, '#ffe0e0')
            self.highlightMatches(' AT END OF WORD', self.traceWindow,
                                  '#e0ffe0')

    def generate(self, *args):
        """
        Generate from the word in the input field and insert the result
        at the top of the results pane.
        """
        if self._root is None:
            return

        tmpword = self.wordIn.get().strip()
        if not tmpword:
            return

        self.initKimmo()

        generate_result = self.kimmoinstance.generate(tmpword)
        # convert list to string
        self.results.insert(1.0, ''.join(generate_result) + '\n')

        if self.dbgTracing:
            self.highlightMatches(' BLOCKED', self.traceWindow, '#ffe0e0')
            self.highlightMatches(' AT END OF WORD', self.traceWindow,
                                  '#e0ffe0')
            self.highlightMatches('SUCCESS!', self.traceWindow, '#e0ffe0')

    def recognize(self, *args):
        """
        Recognize the word in the input field and insert the result at
        the top of the results pane.
        """
        # check for a live GUI before touching any widget
        if self._root is None:
            return
        self.lexicon.tag_delete("highlight")

        tmpword = self.wordIn.get().strip()
        if not tmpword:
            return

        self.initKimmo()

        recognize_result = self.kimmoinstance.recognize(tmpword)
        # convert list to string
        self.results.insert(1.0, ''.join(recognize_result) + '\n')

        if self.dbgTracing:
            self.highlightMatches(' BLOCKED', self.traceWindow, '#ffe0e0')
            self.highlightMatches(' AT END OF WORD', self.traceWindow,
                                  '#e0ffe0')

    def graph(self, which):
        """
        Render a graph image and show it in an image window.

        @param which: 'l' graphs the lexicon; 'r' asks the user to pick
            one rule FSA and graphs it.  Any other value does nothing.

        The image file is named after the lexicon/rules file and written
        to the current directory, or to /tmp/$USER/ when the current
        directory is not readable and writable.
        """
        self.initKimmo()
        graphtitle = ''
        leximagefile = None

        # lex/rule file names are fully qualified, but the image is saved
        # locally, so only the base name is used.
        path = ''
        if not os.access(os.curdir, os.R_OK | os.W_OK):
            path = '/tmp/' + str(os.environ.get("USER")) + '/'
            if not os.path.exists(path):
                os.mkdir(path, int('777', 8))

        if which == 'l':
            graphfname = path + os.path.basename(self.lexfilename)
            dotstring = dotformat(self.kimmoinstance.lexicalNodes)
            leximagefile = dot2image(graphfname, dotstring)
            graphtitle = 'Lexicon Graph'

        elif which == 'r':
            graphfname = path + os.path.basename(self.rulfilename)

            fsaNames = [fsa['name'] for fsa in self.kimmoinstance.fsasNodes]
            ld = ListDialog(self._root, fsaNames, "Select FSA")
            if not ld.result:
                return

            # create the dotstring & image from the (single) selection
            chosen = self.kimmoinstance.fsasNodes[int(ld.result[0])]
            dotstring = dotformat(chosen['nodes'])
            graphtitle = 'FSA ' + chosen['name']

            # make the file read something.rul.1.gif
            # (where 1 is the rule index number)
            graphfname += ('.' + str(ld.result[0]))

            leximagefile = dot2image(graphfname, dotstring)

        # if an image was produced, show it in a fresh window.
        if leximagefile:
            if self.lexicalGraphWindow:
                self.lexicalGraphWindow.destroy()
            self.lexicalGraphWindow = tkImageView(leximagefile, graphtitle)

    def validate(self, *args):
        """
        Cross-check the engine's lexicon and alternation names and
        highlight, in the lexicon pane, every name that appears in one
        list but not the other.
        """
        self.tagId = 1

        for tag in self.lexicon.tag_names():
            self.lexicon.tag_delete(tag)

        for l in self.kimmoinstance.validateLexicon:
            if l and l not in self.kimmoinstance.validateAlternations:
                self.guiError('Unused Alternation')
                self.highlightMatches(l, self.lexicon, '#ffffc0')

        for a in self.kimmoinstance.validateAlternations:
            if a and a not in self.kimmoinstance.validateLexicon:
                self.guiError('Unknown Alternation Name')
                self.highlightMatches(a, self.lexicon, '#ffffc0')

    def highlightMatches(self, word, window, color):
        """
        Give every occurrence of C{word} in the text widget C{window}
        the background C{color}.  Each occurrence gets its own tag,
        numbered from C{self.tagId}.  Assumes C{word} does not span a
        line break.
        """
        if not word:
            return

        matchRight = '1.0'
        while True:
            matchIdx = window.search(word, matchRight, count=1,
                                     stopindex=Tkinter.END)
            if not matchIdx:
                break

            # the match ends len(word) columns further along the line
            strptr = matchIdx.split(".")
            matchRight = strptr[0] + '.' + str(int(strptr[1], 10)
                                               + len(word))

            window.tag_add(self.tagId, matchIdx, matchRight)
            window.tag_configure(self.tagId, background=color,
                                 foreground='black')
            self.highlightIds.append([window, self.tagId])
            self.tagId = self.tagId + 1

    def initKimmo(self, *args):
        """
        Make sure C{self.kimmoinstance} matches the current panes.

        Called before every generate, recognize, batch or graph.  A new
        KimmoControl is built only when there is none yet, or when the
        rules text, the lexicon text or the debug flag has changed since
        the current one was built.  Problems are reported through
        L{guiError}.
        """
        rulesText = self.rules.get(1.0, Tkinter.END)
        lexiconText = self.lexicon.get(1.0, Tkinter.END)
        # Compare digests: md5 objects themselves never compare equal,
        # which would force a rebuild on every call.
        tmprmd5 = md5.new(rulesText).digest()
        tmplmd5 = md5.new(lexiconText).digest()
        if ((not self.kimmoinstance)
                or self.rulemd5 != tmprmd5
                or self.lexmd5 != tmplmd5
                or getattr(self, 'instancedebug', None) != self.debug):
            self.guiError("Creating new Kimmo instance")
            self.kimmoinstance = KimmoControl(lexiconText, rulesText,
                                              '', '', self.debug)
            self.guiError("")
            self.rulemd5 = tmprmd5
            self.lexmd5 = tmplmd5
            self.instancedebug = self.debug

        if not self.kimmoinstance.ok:
            self.guiError("Creation of Kimmo Instance Failed")
            return
        if not self.kimmoinstance.m.initial_state():
            self.guiError("Morphology Setup Failed")
        elif self.kimmoinstance.errors:
            self.guiError(self.kimmoinstance.errors)
            self.kimmoinstance.errors = ''
        # self.validate()

    def refresh(self, *args):
        "Print the current contents of the word input field."
        if self._root is None:
            return
        print(self.wordIn.get())

    def capturePrint(self, *args):
        """
        PrintHook callback: append captured output to the tracing window
        when it is open, otherwise drop it.

        @return: C{(0, 0, '')}, which tells the hook not to forward the
            text to the real stream.
        """
        if self.dbgTracing:
            self.traceWindow.insert(Tkinter.END, " ".join(args))
            self.traceWindow.see(Tkinter.END)

        # !!! if tracing is on but the window is not defined, it should
        # be created, to recover from an improper close of the window.
        return 0, 0, ''

    def kimmoHelp(self, *args):
        """
        Show the help file in its own window, highlighting every
        C{::heading::} marker.  Reports through L{guiError} if the help
        file cannot be opened.
        """
        try:
            f = open(self.helpFilename, 'r')
        except IOError:
            self.guiError("HelpFile not loaded")
            return

        # no errors to report here.  (Not ideal: this also wipes errors
        # reported by other functions.)
        self.guiError("")

        try:
            helpText = str(f.read())
        finally:
            f.close()

        # clean any carriage returns
        helpText = re.sub("\r", "", helpText)

        helpWindow = Tkinter.Toplevel()
        helpWindow.title("PyKimmo Documentation & Help")

        helpPane = ScrolledText(helpWindow, background='#f0f0f0',
                                foreground='black', width=70, height=40,
                                wrap='none', font='Times 12 bold')
        helpPane.pack(side='top')
        helpPane.insert(1.0, helpText)
        self._attachHScroll(helpWindow, helpPane)

        # highlight the markers, searching backwards from the end
        matchIdx = Tkinter.END
        matchLen = Tkinter.IntVar()
        tagId = 1
        while 1:
            matchIdx = helpPane.search(r"::[^\n]*::", matchIdx,
                                       stopindex=1.0, backwards=True,
                                       regexp=True, count=matchLen)
            if not matchIdx:
                break

            matchIdxFields = matchIdx.split(".")
            matchEnd = (matchIdxFields[0] + "."
                        + str(int(matchIdxFields[1], 10) + matchLen.get()))

            helpPane.tag_add(tagId, matchIdx, matchEnd)
            helpPane.tag_configure(tagId, background='aquamarine',
                                   foreground='blue', underline=True)
            tagId += 1
################################ PRINT HOOK ######################
# This class gets all output directed to stdout (e.g. by print statements)
# or stderr and redirects it to a user-defined function.
class PrintHook:
    """
    Route everything written to stdout or stderr through a user function.

    C{out=1} hooks stdout, C{out=0} hooks stderr.  The hook function
    receives the written text and must return three values,
    C{(proceed, lineNoMode, newText)}: if C{proceed} is false nothing is
    forwarded; otherwise C{newText} is written to the original stream,
    prefixed with the caller's file, function and line number when
    stdout is hooked and C{lineNoMode} is true.
    """

    def __init__(self, out=1):
        self.func = None        # user-defined hook function
        self.origOut = None     # the real stream, set by Start()
        self.out = out

    def TestHook(self, text):
        "Default hook: append the text to 'hook_log.txt', forward nothing."
        f = open('hook_log.txt', 'a')
        try:
            f.write(text)
        finally:
            f.close()
        return 0, 0, text

    def Start(self, func=None):
        """
        Install this object as sys.stdout (or sys.stderr) and use
        C{func} as the hook; L{TestHook} is used when none is given.
        """
        if self.out:
            sys.stdout = self
            self.origOut = sys.__stdout__
        else:
            sys.stderr = self
            self.origOut = sys.__stderr__

        if func:
            self.func = func
        else:
            self.func = self.TestHook

    def Stop(self):
        "Stop routing output through this object and restore the stream."
        # origOut is still None when Stop() is called before Start().
        if self.origOut is not None:
            self.origOut.flush()
        if self.out:
            sys.stdout = sys.__stdout__
        else:
            sys.stderr = sys.__stderr__
        self.func = None

    def write(self, text):
        "File-like write(): run the hook and forward what it allows."
        proceed = 1
        lineNo = 0
        newText = text      # forwarded unchanged when no hook is set
        if self.func is not None:
            proceed, lineNo, newText = self.func(text)

        if not proceed:
            return

        if text.split() == []:
            # whitespace only: pass through untouched
            self.origOut.write(text)
            return

        # Only stdout gets the location prefix; for stderr it is
        # already part of the text.
        if self.out and lineNo:
            caller = sys._getframe(1)
            newText = 'line(' + str(caller.f_lineno) + '):' + newText
            codeObject = caller.f_code
            fileName = codeObject.co_filename
            funcName = codeObject.co_name
            self.origOut.write('file ' + fileName + ',' + 'func '
                               + funcName + ':')
        self.origOut.write(newText)

    def __getattr__(self, name):
        "Pass all other attributes through to the original stream."
        if name == 'origOut':
            # not set yet; looking it up again would recurse forever
            raise AttributeError(name)
        return getattr(self.origOut, name)
974
class tkImageView:
    """Top-level Tk window that displays a single image file."""

    def __init__(self, imagefileName, title):
        """Open a window titled 'title (imagefileName)' showing the image."""
        self._root = Tkinter.Toplevel()
        caption = title + ' (' + imagefileName + ')'
        self._root.title(caption)

        # keep a reference on the instance for as long as the window lives
        self.image = Tkinter.PhotoImage("LGraph", file=imagefileName)

        label = Tkinter.Label(self._root, image=self.image)
        label.pack(side='top', fill='x')
        # self._root.mainloop()

    def destroy(self, *args):
        """Close the window (if still open) and drop the image reference."""
        if not self._root:
            return
        self._root.destroy()
        self._root = None
        self.image = None
989 990 991 ######################### Dialog Boxes ##############################
class ListDialog(Tkinter.Toplevel):
    """
    Modal dialog that shows a list of options with OK / Cancel buttons.

    The constructor blocks (wait_window) until the dialog is closed.
    Afterwards self.result holds the listbox selection as returned by
    Listbox.curselection() if OK was pressed, or None if cancelled.
    """

    def __init__(self, parent, listOptions, title = None):
        """
        @param parent: window the dialog is transient for
        @param listOptions: sequence of entries shown in the listbox
        @param title: optional window title
        """
        Tkinter.Toplevel.__init__(self, parent)
        self.transient(parent)

        if title:
            self.title(title)

        self.parent = parent

        self.result = None

        body = Tkinter.Frame(self)

        self.initial_focus = self.body(body)
        body.pack(padx=5, pady=5)

        box = Tkinter.Frame(self)
        Tkinter.Label(box, text="Select an FSA to graph").pack(side='top', fill='x')
        box.pack()

        self.listbox(listOptions)

        self.buttonbox()

        self.grab_set()

        if not self.initial_focus:
            self.initial_focus = self

        self.protocol("WM_DELETE_WINDOW", self.cancel)

        self.geometry("+%d+%d" % (parent.winfo_rootx()+50,
                                  parent.winfo_rooty()+50))

        self.initial_focus.focus_set()

        # block until the dialog window is destroyed
        self.wait_window(self)

    #
    # construction hooks

    def body(self, master):
        # create dialog body.  return widget that should have
        # initial focus.  this method should be overridden

        pass

    def listbox(self, listOptions):
        """Create the listbox (self.lb) with one row per option."""
        box = Tkinter.Frame(self)
        self.lb = Tkinter.Listbox(box, height=len(listOptions), width=30,
                                  background='#f0f0ff',
                                  selectbackground='#c0e0ff',
                                  selectmode='single')
        self.lb.pack()

        for x in listOptions:
            self.lb.insert(Tkinter.END, x)

        box.pack()

    def buttonbox(self):
        # add standard button box. override if you don't want the
        # standard buttons

        box = Tkinter.Frame(self)

        w = Tkinter.Button(box, text="OK", width=10, command=self.ok, default="active")
        w.pack(side="left", padx=5, pady=5)
        w = Tkinter.Button(box, text="Cancel", width=10, command=self.cancel)
        w.pack(side="left", padx=5, pady=5)

        # Tk event patterns must be written with literal angle brackets;
        # the HTML-escaped form never matched the Return / Escape keys.
        self.bind("<Return>", self.ok)
        self.bind("<Escape>", self.cancel)

        box.pack()

    #
    # standard button semantics

    def ok(self, event=None):
        """Validate, record the current selection in self.result and close."""
        if not self.validate():
            self.initial_focus.focus_set() # put focus back
            return

        self.withdraw()
        self.update_idletasks()

        self.apply()

        # the selection must be read before the window is destroyed
        self.result = self.lb.curselection()

        self.cancel()

    def cancel(self, event=None):
        """Close the dialog and give the focus back to the parent window."""
        # put focus back to the parent window
        self.parent.focus_set()
        self.destroy()

    #
    # command hooks

    def validate(self):

        return 1 # override

    def apply(self):

        pass # override
1108 1109 1110 1111 1112 1113 ################################ Dot Grapher ###################### 1114 # given a state table with names, draw graphs in dot format. 1115 1116 """ 1117 + CNsib + s # y o @ 1118 e CNsib @ s # i o @ 1119 1: 0 2 1 2 1 2 7 1 1120 2: 3 2 5 2 1 2 7 1 1121 3. 0 0 0 4 0 0 0 0 1122 4. 0 0 1 0 1 0 0 0 1123 5: 0 1 1 6 1 1 1 1 1124 6: 0 1 0 1 0 1 1 1 1125 7: 3 2 1 2 1 2 7 1 1126 """ 1127 1128 # so first we will create the states. 1129 # then we will write the edges & name them. 1130 # name 0 as fail 1131 1132 # call the dot drawer on the file & display the graph. 1133
def dotformat(nodeEdgeAry):
    """
    Render a list of node descriptions as a Graphviz 'dot' digraph string.

    @param nodeEdgeAry: list of dicts with the keys
        'node'      - node name (string),
        'features'  - dot attribute string for the node, may be empty,
        'edges'     - list of target node names,
        'edgenames' - list of edge labels; edgenames[i] labels edges[i],
                      edges without a matching entry are left unlabelled.
    @return: the complete 'digraph autograph { ... }' source text.
    """
    # choose graphsize based upon number of nodes
    node_count = len(nodeEdgeAry)
    graphWH = '4,4'
    for threshold, size in ((3, '5,5'), (5, '6,6'), (7, '7,7'), (10, '7.5,7.5')):
        if node_count > threshold:
            graphWH = size

    parts = [' size="' + graphWH + '"\n',
             ' ratio=fill\n']

    for x in nodeEdgeAry:
        # 'Begin' and 'End' get fixed styling regardless of x['features']
        if x['node'] == 'Begin':
            features = ' [shape=box,color=lightblue,style=filled] '
        elif x['node'] == 'End':
            features = ' [color="Light Coral",style=filled] '
        elif x['features']:
            features = ' [' + x['features'] + '] '
        else:
            features = ''

        parts.append(' "' + x['node'] + '" ' + features + ";\n")

        edgenames = x['edgenames']
        for e, target in enumerate(x['edges']):
            # '%s' tolerates non-string targets (e.g. None) that would make
            # plain concatenation raise a TypeError
            parts.append(' "' + x['node'] + '" -> "' + ('%s' % (target,)) + '" ')
            if e < len(edgenames):
                # "\l" is dot's left-justified line break
                parts.append('[label="\\l' + edgenames[e] + '"]')
            parts.append(";\n")

    return "digraph autograph {\n" + "".join(parts) + "\n}\n"
1165
def _classeq(instance1, instance2):
    """
    @return: true iff the given objects are instances of the same
        class.
    @rtype: C{bool}
    """
    # Compare the __class__ attributes instead of testing for
    # types.InstanceType: that test only recognises old-style instances,
    # so it is always false for new-style classes and the name does not
    # exist at all on interpreters without old-style classes.
    class1 = getattr(instance1, '__class__', type(instance1))
    class2 = getattr(instance2, '__class__', type(instance2))
    return class1 == class2
1175 1176 # given a dot string, write to a tmp file and invoke the grapher 1177 # return a filename to open. 1178 # imagetype is hardcoded for now
def dot2image(filename, dotstring):
    """
    Write dotstring to '<filename>.dot' and render it to a PPM image.

    The graph is first rendered to PostScript with Graphviz 'dot' and
    then converted with 'pstopnm'.  The image type is hardcoded for now.

    @param filename: path prefix (without extension) for the generated
        '.dot', '.ps' and '.ppm' files.
    @param dotstring: graph description in dot syntax.
    @return: the name of the '.ppm' file, or '' if no image was produced
        (for instance because 'dot' or 'pstopnm' is not installed).
    """
    # imported locally: the module's import block does not include it
    import subprocess

    dotfilename = filename + '.dot'
    psfilename = filename + '.ps'
    imgfilename = filename + '.ppm'

    # whack the file if already there... (for now)
    f = open(dotfilename, 'w')
    try:
        f.write(dotstring)
    finally:
        f.close()

    # never hand back an image left over from an earlier run
    try:
        os.remove(imgfilename)
    except OSError:
        pass

    # Argument lists (no shell) keep odd characters in filenames from
    # being interpreted by a shell.
    try:
        subprocess.call(['dot', '-Tps', '-o', psfilename, dotfilename])
    except OSError:
        return ''           # 'dot' is not available

    # cheap hack now that graphviz is not working right...
    out = open(imgfilename, 'wb')
    try:
        try:
            subprocess.call(['pstopnm', '-stdout', '-portrait', '-ppm',
                             psfilename], stdout=out)
        except OSError:
            pass            # 'pstopnm' is not available
    finally:
        out.close()

    # Opening the output file creates it even when the conversion failed,
    # so only a non-empty file counts as success.
    if os.path.isfile(imgfilename) and os.path.getsize(imgfilename) > 0:
        return imgfilename

    try:
        os.remove(imgfilename)
    except OSError:
        pass
    return ''
1204 1205 1206 1207 1208 1209 ################################ KIMMO SET ###################### 1210 1211 # ----------- KIMMOCONTROL --------------- 1212 # Master instance for creating a kimmo object 1213 # from files or strings or rules & lexical entries 1214 # -------------------------------------
# ----------- KIMMOCONTROL ---------------
# Master instance for creating a kimmo object
# from files or strings or rules & lexical entries
# -------------------------------------
class KimmoControl:
    """
    Builds a Kimmo morphology (self.m) and rule set (self.s) from lexicon
    and rule descriptions given either as strings or as file names.

    After construction self.ok is 1 on success and 0 on failure, and
    self.errors holds accumulated error text.
    """

    def __init__(self, lexicon_string, rule_string, lexicon_file, rule_file, debug):
        """
        @param lexicon_string: lexicon text (ignored if lexicon_file is given)
        @param rule_string: rule text (ignored if rule_file is given)
        @param lexicon_file: name of a lexicon file, or a false value
        @param rule_file: name of a rule file, or a false value
        @param debug: value stored as the rule set's 'debug' attribute
        """
        self.validateLexicon = []
        self.validateAlternations = []

        self.lexicalNodes = []   # transition states and edges for graphing lexicon
        self.ruleNodes = []      # transition states & edges for graphing of rules

        # a better way is just to use a destructor and check if the object exists.
        self.ok = 0
        self.errors = ''

        # load lexicon file
        if lexicon_file:
            lexicon_string = self._readFile(lexicon_file)

        # load rule file
        if rule_file:
            rule_string = self._readFile(rule_file)

        try:
            # rules first: processRules sets self.boundary_char
            self.processRules(rule_string)
            self.processLexicon(lexicon_string)
            self.m = KimmoMorphology(self.kalternations, self.klexicons)
            self.m.set_boundary(self.boundary_char)
            self.s = KimmoRuleSet(self.ksubsets, self.kdefaults, self.krules)
            self.s.debug = debug
            self.ok = 1
        except RuntimeError:
            e = sys.exc_info()[1]
            self.errors = ('Caught:' + str(e) + ' ' + self.errors)
            print('Caught: ' + str(e))
            print("Setup of the kimmoinstance failed. Most likely cause")
            print("is infinite recursion due to self-referential lexicon")
            print("For instance:")
            print("Begin: Begin Noun End")
            print("Begin is pointing to itself. Simple example, but check")
            print("to insure no directed loops")
            self.ok = 0

    def _readFile(self, filename):
        """Return the whole content of a kimmo file opened via read_kimmo_file."""
        f = read_kimmo_file(filename)
        try:
            return "".join(f.readlines())
        finally:
            f.close()

    def _reportError(self, message):
        """
        Record a parse error.  This class defines no guiError method, so
        the message is appended to self.errors unless a subclass (or the
        instance) provides guiError, in which case that is called.
        """
        handler = getattr(self, 'guiError', None)
        if handler is not None:
            handler(message)
        else:
            self.errors += message + '\n'

    def generate(self, word):
        """Run the generator on word; the boundary character is added to
        the input and removed again from the results."""
        if self.boundary_char: word += self.boundary_char
        genlist = _generate_test(self.s, word)

        # NOTE(review): the boundary character is stripped by editing the
        # repr() of the result and eval()-ing it back.  eval() on text
        # derived from user input is risky; left unchanged because the
        # structure returned by _generate_test is not visible here.
        genliststr = repr(genlist)
        if self.boundary_char: genliststr = genliststr.replace(self.boundary_char, '')

        return eval(genliststr)

    def recognize(self, word):
        """Run the recognizer on word and return its result."""
        return _recognize_test(self.s, word, self.m)

    # run a batch and print to console. This is different than the
    # batch for the gui;
    # the kimmo object should already be created when the batch is run.
    # the output is also not formatted nicely
    def batch(self, filename):
        """
        Process every line of the batch file and print the results.

        Line formats: 'g word' generates, 'r word' recognizes, lines
        starting with '#' or ';' are echoed as comments, and any other
        line is generated if it contains '+' and recognized otherwise.
        With an empty filename only the header is printed.
        """
        lines = []      # stays empty when no filename is given
        if filename:
            f = read_kimmo_file(filename)
            try:
                lines = f.readlines()
            finally:
                f.close()

        spcr = re.compile(r"\s+")
        results = []
        for line in lines:
            # a 'g word' 'r word' format
            singleword = line.strip()   # should be a single word, no spaces, etc.
            if not singleword: continue   # ignore blank lines
            if singleword[0] in '#;':     # commented; echo it
                results.append(singleword + '\n')
                continue

            linevals = spcr.split(singleword)
            if (linevals[0] == 'g') and (len(linevals) == 2):
                batch_result = self.generate(linevals[1])
            elif (linevals[0] == 'r') and (len(linevals) == 2):
                batch_result = self.recognize(linevals[1])
            elif '+' in singleword:
                batch_result = self.generate(singleword)
            else:
                batch_result = self.recognize(singleword)

            # if a valid results
            if len(batch_result) > 0:
                results.append("".join(batch_result) + '\n')

        # place a separator between results
        print('----- ' + time.strftime("%a, %d %b %Y %I:%M %p", time.gmtime()) + ' -----\n')
        print("".join(results))

    # move this out into a kimmo files & frontend class.
    # make this also process alternations, if contained.
    def processLexicon(self, text):
        """
        Takes the currently typed in lexicon and turns them from text into
        the kimmo lexicon array.

        Sets self.klexicons and self.lexicalNodes, hands all alternation
        lines to processAlternations, and returns self.lexicalNodes.
        """
        self.klexicons = []      # lexicons needs to be an object of the gui scope
        lexigroup = ''
        kimmoWords = []
        alternationText = ''

        # a node and its edges
        tmpnode = {'node': '', 'features': '', 'edges': [], 'edgenames': []}
        self.lexicalNodes = []   # list of nodes & their edges for the lexicon

        spcr = re.compile(r"\s+")

        for item in text.split("\n"):
            # '' None Genitive
            cleanLine = item.strip()

            if len(cleanLine) == 0: continue      # blank line
            elif cleanLine[0] == '#': continue    # a comment
            elif cleanLine[0] == ';': continue    # a comment

            # a trailing ':' starts a new lexicon group; the group collected
            # so far is stored first.
            # LEXICON N_ROOT1
            elif cleanLine[len(cleanLine)-1] == ':':
                if (len(lexigroup) > 0) and len(kimmoWords):
                    self.klexicons.append( KimmoLexicon(lexigroup, kimmoWords) )
                    self.lexicalNodes.append(tmpnode)
                kimmoWords = []
                lexigroup = cleanLine[0:len(cleanLine)-1]   # remove trailing ':' , new group

                # create the state transitions for the lexicon.
                tmpnode = {'node': lexigroup, 'features': '',
                           'edges': [], 'edgenames': []}

                self.validateLexicon.append(lexigroup)

            # assume that a : contained in the line that is not a last char means it is an alternation.
            elif ':' in cleanLine:
                alternationText += ( cleanLine + "\n")

            elif lexigroup:
                # Split the stripped line: splitting the raw line yields
                # empty first/last tokens when it has leading or trailing
                # whitespace, and indexing those raised an IndexError.
                moreitems = spcr.split(cleanLine)

                # re-join a double-quoted field that was split on spaces.
                # *recollect*. doesn't work on multiple spaces.
                # this code only works for the last field
                rangestart = -1
                for x in range(len(moreitems)):
                    if (moreitems[x][0] == '"') and (rangestart < 0): rangestart = x
                    elif (moreitems[x][len(moreitems[x])-1] == '"') and (rangestart > -1):
                        rangeend = x
                        moreitems[rangestart] = " ".join(moreitems[rangestart:rangeend+1])

                # unquote the first three fields and map 'None' to None
                for x in range(len(moreitems)):
                    if x > 2: continue
                    elif (moreitems[x] == '\'\'') or (moreitems[x] == '""'):
                        moreitems[x] = ''
                    elif (moreitems[x][0] == '"') and (moreitems[x][len(moreitems[x])-1] == '"'):
                        moreitems[x] = moreitems[x][1:len(moreitems[x])-1]
                    elif (moreitems[x][0] == '\'') and (moreitems[x][len(moreitems[x])-1] == '\''):
                        tmpitem = moreitems[x]
                        moreitems[x] = tmpitem[1:(len(tmpitem)-1)]
                    elif moreitems[x] == 'None': moreitems[x] = None

                # EXPECTED FORMAT IS:
                # WORD ALTERNATION DESCRIPTION
                if len(moreitems) > 2:
                    kimmoWords.append( KimmoWord(moreitems[0], moreitems[2], moreitems[1]) )
                    self.validateLexicon.append(moreitems[1])
                elif len(moreitems) > 1:
                    kimmoWords.append( KimmoWord(moreitems[0], '', moreitems[1]) )
                    self.validateLexicon.append(moreitems[1])

                if (len(moreitems) > 1) and not (moreitems[1] in tmpnode['edges']):
                    tmpnode['edges'].append(moreitems[1])

            else:
                # an undefined line.
                self.errors += "Unknown Line in Lexicon (" + cleanLine + ")"

        # if the end of file and there is a group defined, add this last group
        if (len(lexigroup) > 0) and (len(kimmoWords)):
            self.klexicons.append( KimmoLexicon(lexigroup, kimmoWords) )
            self.lexicalNodes.append(tmpnode)

        # process the alternations
        self.processAlternations(alternationText)

        # return an array of state and edge objects.
        return self.lexicalNodes

    def _storeAlternation(self, altgroup, kimmoAlts, tmpnode):
        """Store a finished alternation group together with its graph node."""
        if len(altgroup) > 0 and len(kimmoAlts) > 0:
            self.kalternations.append(KimmoAlternation(altgroup, kimmoAlts))
            self.validateAlternations.append(altgroup)
            for x in kimmoAlts: self.validateAlternations.append(x)
            self.lexicalNodes.append(tmpnode)

    # process ALTERNATIONS
    # self.kalternations = [
    #    KimmoAlternation('Begin', [ 'N_ROOT', 'ADJ_PREFIX', 'V_PREFIX', 'End' ]),
    def processAlternations(self, text):
        """
        Takes the currently typed in alternations and turns them from text into
        the kimmo alternation array.

        A token containing ':' starts a new group; the tokens that follow
        are that group's alternations.  Sets self.kalternations and appends
        to self.validateAlternations and self.lexicalNodes.
        """
        self.kalternations = []   # lexicons needs to be an object of the gui scope
        altgroup = ''
        kimmoAlts = []
        tmpnode = None            # graph node of the group being collected

        spcr = re.compile(r"\s+")

        for line in text.split("\n"):
            # '' None Genitive
            cleanLine = line.strip()

            if len(cleanLine) == 0: continue      # blank line
            elif cleanLine[0] == '#': continue    # a comment
            elif cleanLine[0] == ';': continue    # a comment

            # break the line into tokens, then keep collecting until the
            # next group (signified by a : ) is found
            for item in spcr.split(cleanLine):
                item_tmp = item.strip()

                if len(item_tmp) == 0: continue
                # ALTERNATION V_root
                elif ':' in item_tmp:
                    # add all prior alternations to prior altgroup (if defined)
                    self._storeAlternation(altgroup, kimmoAlts, tmpnode)

                    # set new altgroup; slice the header token itself so the
                    # name is right even when it is not the first token
                    altgroup = item_tmp[0:len(item_tmp)-1]
                    kimmoAlts = []

                    tmpnode = {'node': altgroup,
                               'features': 'color=\"aquamarine2\", style=filled',
                               'edges': [], 'edgenames': []}

                elif tmpnode is None:
                    # an alternation before any group header has nowhere to
                    # go (this used to fail with a NameError)
                    self.errors += "Unknown Line in Alternations (" + cleanLine + ")"

                else:
                    # remove '' surrounding alternations
                    if (item_tmp[0] == '\'') and (item_tmp[len(item_tmp)-1] == '\''):
                        item_tmp = item_tmp[1:(len(item_tmp)-1)]
                    # convert None
                    elif item_tmp == 'None': item_tmp = None

                    kimmoAlts.append(item_tmp)

                    # add alternation edges ; order independent.
                    tmpnode['edges'].append(item_tmp)

        self._storeAlternation(altgroup, kimmoAlts, tmpnode)

    # RULES
    # Rule format
    # KimmoFSARule('08:elision: e:0 <= VCC*___+:0 V',
    #              ' Cpal C e:0 e:@ +:0 Vbk V @',  # english.rul needed pairs re-ordered
    #              [ (1, True, [ 1, 1, 1, 2, 1, 2, 2, 1 ]),
    #                (2, True, [ 3, 6, 1, 2, 1, 2, 2, 1 ]),  # V...
    #                ...
    #                (8, True, [ 1, 1, 1, 0, 1, 0, 0, 1 ]) ]),  # V C e +:0...
    def processRules(self, text):
        """
        Takes the currently typed in rules and processes them into the python kimmo
        format.  expects rules to be in c version of .rul file format.  needs to
        be file compatible.

        Sets self.krules, self.ksubsets, self.kdefaults, self.boundary_char
        and self.fsasNodes (graph data for every FSA rule).
        """
        self.krules = []
        self.ksubsets = []
        self.kdefaults = []
        self.boundary_char = ''
        rulegroup = ''
        rulerowcnt = 0
        rulecolcnt = 0
        kimmoRule = []

        ruleFrom = []
        ruleTo = []
        ruleTran = []

        # [ANY symbol, ALPHABET, NULL symbol, BOUNDARY symbol]
        anyset = ['', '', '', '']

        self.fsasNodes = []   # list of nodes & their edges for the lexicon

        spcr = re.compile(r"\s+")
        # non-char test; already stripped of whitespace
        ct = re.compile(r'[^0-9:\.]')

        for line in text.split("\n"):
            # '' None Genitive
            cleanLine = line.strip()

            if len(cleanLine) == 0: continue   # blank line
            # this char can be a comment if it is not the boundary char.
            # yes, yes, it should be defined such that it is not in the alphabet at all
            # also boundary would need to be defined before ...
            elif (cleanLine[0] == '#') and (anyset[3] != '#'): continue   # a comment
            elif (cleanLine[0] == ';') and (anyset[3] != ';'): continue   # a comment

            items = spcr.split(cleanLine)

            # now handle subset keywords
            # KimmoSubset('C', 'b c d f g h j k l m n p q r s t v w x y z'),
            if items[0] == 'SUBSET':
                if items[1] == 'ALL': items[1] = '@'
                self.ksubsets.append(
                    KimmoSubset(items[1], " ".join(items[2:len(items)])))

            # load up the fsa regexp based on alphabet
            # also set up the @ subset if alphabet is defined (old rule file style)
            elif items[0] == 'ALPHABET': anyset[1] = " ".join(items[1:len(items)])

            elif items[0] == 'ANY': anyset[0] = items[1]

            elif items[0] == 'NULL': anyset[2] = items[1]

            # using the boundary char, set the final boundary & also add to the any set.
            elif items[0] == 'BOUNDARY':
                anyset[3] = items[1]
                self.boundary_char = items[1]

            elif items[0] == 'DEFAULT':
                self.kdefaults = [ KimmoDefaults(" ".join(items[1:len(items)])) ]

            elif items[0] == 'ARROWRULE':
                # ARROWRULE 03:epenthesis1 0:e ==> [Csib (c h) (s h) y:i] +:0 _ s [+:0 #]
                # KimmoArrowRule('03:epenthesis1', '0:e ==> [Csib (c h) (s h) y:i] +:0 _ s [+:0 #]'),
                self.krules.append(
                    KimmoArrowRule(items[1], " ".join(items[2:len(items)])))

            elif items[0] == 'RULE':   # this is actually FSArules
                # make compatible with rul files
                if rulegroup: self._reportError('error, fsa rule not finished')

                # header is: RULE <name ...> <rows> <cols>
                rulecolcnt = int(items[len(items)-1])
                rulerowcnt = int(items[len(items)-2])
                rulegroup = " ".join(items[1:len(items)-2])

                # create the structure (for graphing) for storing the transitions
                # of the fsas
                tmpfsanodes = {'nodes': [], 'name': rulegroup}

                # add the fail node by default
                tmpnode = {'node': '0',
                           'features': 'color="indianred1", style=filled, shape=box',
                           'edges': [], 'edgenames': []}
                tmpfsanodes['nodes'].append(tmpnode)

            elif rulegroup:
                # assume TRUE rules for now
                # a state row starts with '<number>:' or '<number>.';
                # anything else is a row of column headers.
                if ((':' in items[0]) or ('.' in items[0])) and (not ct.match(items[0])):
                    # ':' marks a final state, '.' a non-final one
                    if (items[0][len(items[0])-1] == ':'): finalstate = True
                    elif (items[0][len(items[0])-1] == '.'): finalstate = False
                    else:
                        self._reportError("FSA table failure -- 'final state defn'")
                        continue

                    items[0] = items[0][0:len(items[0])-1]   # remove the ':'

                    # convert to integers (instead of strings),
                    # including the first row number - i.e. '4:'
                    for x in range(rulecolcnt + 1): items[x] = int(items[x])

                    # add this row.
                    kimmoRule.append((items[0], finalstate, items[1:len(items)]))

                    # now make this row into graph transitions
                    tmpnode = {}   # a node and its edges
                    tmpnode['node'] = str(items[0])
                    tmpnode['features'] = 'shape=box, fillcolor="lavender blush", style=filled'
                    if finalstate and (items[0] == 1):
                        tmpnode['features'] = 'shape=circle, color="paleturquoise2", style=filled'
                    elif (items[0] == 1):
                        tmpnode['features'] = 'color="paleturquoise2", style=filled, shape=box'
                    elif (finalstate):
                        tmpnode['features'] = 'shape=circle,fillcolor="honeydew2", style=filled'
                    tmpnode['edges'] = []
                    tmpnode['edgenames'] = []

                    # add as strings
                    # add unique, but group edgenames together
                    tmpitems = items[1:len(items)]
                    for i in range(len(tmpitems)):
                        if str(tmpitems[i]) in tmpnode['edges']:
                            # find the index j of the matching target
                            for j in range(len(tmpnode['edges'])):
                                if str(tmpnode['edges'][j]) == str(tmpitems[i]):
                                    # start a new label line once the current
                                    # one reaches 15 characters.
                                    # this should really be done in dotstring
                                    m = re.match(r"(^|\\n)([^\\]*)$", tmpnode['edgenames'][j])

                                    if not m:
                                        tmpnode['edgenames'][j] += (',' + ruleTran[i])
                                    elif (len(m.group(2)) >= 15):
                                        tmpnode['edgenames'][j] += ('\\n ' + ruleTran[i])
                                    else:
                                        tmpnode['edgenames'][j] += (',' + ruleTran[i])
                        else:
                            tmpnode['edges'].append(str(tmpitems[i]))
                            tmpnode['edgenames'].append(ruleTran[i])

                    tmpfsanodes['nodes'].append(tmpnode)

                    # if number is equal to row count, then add total and reset rule group
                    if (items[0] == rulerowcnt):
                        self.krules.append(
                            KimmoFSARule(str(rulerowcnt)+':'+rulegroup, " ".join(ruleTran), kimmoRule))

                        # add to the master graph list
                        self.fsasNodes.append(tmpfsanodes)

                        rulegroup = ''
                        rulerowcnt = 0
                        rulecolcnt = 0
                        ruleTran = []    # reset the translation array
                        kimmoRule = []   # reset the kimmo rules as well

                # the char class/translations
                elif len(items) == rulecolcnt:
                    # old style has 2 rows, class from, class to
                    if len(ruleFrom) == 0: ruleFrom = items
                    elif len(ruleTo) == 0: ruleTo = items

                    # wait until both header rows have been read
                    if (len(ruleTo) != rulecolcnt) or (len(ruleFrom) != rulecolcnt): continue
                    else:
                        for x in range(rulecolcnt):
                            if ruleTo[x] == ruleFrom[x]: ruleTran.append(ruleTo[x])
                            else:
                                ruleTran.append(ruleFrom[x] + ':' + ruleTo[x])

                        ruleTo = []
                        ruleFrom = []

        # take care of the anyset, if it was defined (make into a subset)
        if (anyset[0] and anyset[1]):
            self.ksubsets.append(KimmoSubset(anyset[0], " ".join(anyset[1:len(anyset)])))
1780 1781 # print self.fsasNodes 1782 1783 1784 # ----------- KIMMOPAIR --------------- 1785 # 1786 # -------------------------------------
class KimmoPair:
    """
    Input/Output character pair

    Each side is either a literal symbol, the name of a subset, or a
    subset name prefixed with '~' (anything not in that subset).
    """
    def __init__(self, input_subset, output_subset):
        self._input = input_subset
        self._output = output_subset

    def input(self): return self._input
    def output(self): return self._output

    def __repr__(self):
        sI = self.input()
        sO = self.output()
        s = sI + ':' + sO
        return s

    def __eq__(self, other):
        return (_classeq(self, other) and
                self._input == other._input and
                self._output == other._output)

    def __ne__(self, other):
        # keep != consistent with ==; Python 2 does not derive it from __eq__
        return not self.__eq__(other)

    def __hash__(self):
        return hash( (self._input, self._output,) )

    def matches(self, input, output, subsets, negatedOutputMatch=False):
        """
        Test a concrete input/output symbol pair against this pair.

        @param subsets: mapping from subset name to its collection of symbols
        @param negatedOutputMatch: if true, the input side must match and
            the output side must NOT match.
        @rtype: C{bool}
        """
        if not(self._matches(self.input(), input, subsets)): return False
        m = self._matches(self.output(), output, subsets)
        if negatedOutputMatch: return not(m)
        return m

    def _matches(self, me, terminal, subsets):
        """Does the pattern me (literal, subset or ~subset) accept terminal?"""
        if (me == terminal): return True
        # an empty pattern only matches an identical terminal; without this
        # guard me[0] raises an IndexError
        if not me: return False

        negated = (me[0] == '~')
        if negated:
            name = me[1:]
        else:
            name = me

        # an unknown subset name matches nothing, negated or not
        if not (name in subsets):
            return False
        found = terminal in subsets[name]
        if negated:
            return not(found)
        return found
# Regular expressions used to tokenize rule and default descriptions.
# '*' is not part of the terminal class: it is already covered in the
# re tokenizer as a special token.
_kimmo_terminal_regexp = r"[a-zA-Z0-9\+'\-\#\@\$\%\!\^\`\}\{]+"
# for FSA, only invalid chars are whitespace and :
_kimmo_terminal_regexp_fsa = r'[^:\s]+'
# a terminal optionally prefixed with '~'
_kimmo_terminal_regexp_ext = '~?' + _kimmo_terminal_regexp

_kimmo_defaults = _kimmo_terminal_regexp + r'|\:'
_kimmo_defaults_fsa = _kimmo_terminal_regexp_fsa + r'|\:'
_kimmo_rule = (_kimmo_terminal_regexp_ext +
               r'|[\:\(\)\[\]\?\&\*\_]|<=>|==>|<==|/<=')

_arrows = ['==>', '<=>', '<==', '/<=']

_special_tokens = ['(', ')', '[', ']', '*', '&', '_', ':'] + _arrows
_non_list_initial_special_tokens = [')', ']', '*', '&', '_', ':'] + _arrows
def parse_pair_sequence(description,token_type):
    """
    Read the description, which should be in form [X|X:Y]+, and return
    a list of KimmoPair objects.  A bare X becomes the pair X:X.

    token_type 'FSA' selects the FSA tokenizer pattern; any other value
    selects the default one.  Raises ValueError for a misplaced colon.
    """
    if token_type == 'FSA':
        pattern = _kimmo_defaults_fsa
    else:
        pattern = _kimmo_defaults
    tokens = list(tokenize.regexp(description, pattern))

    pairs = []
    pending = None          # identifier read but not yet turned into a pair
    after_colon = False     # true right after a ':' token
    for tok in tokens:
        if tok == ':':
            if after_colon: raise ValueError('two colons in a row')
            if pending is None: raise ValueError('colon must follow identifier')
            after_colon = True
            continue

        if after_colon:
            # tok is the output side of pending:tok
            pairs.append(KimmoPair(pending, tok))
            pending = None
            after_colon = False
            continue

        # a new identifier: the previous one stood alone
        if pending:
            pairs.append(KimmoPair(pending, pending))
        pending = tok
        after_colon = False

    if after_colon: raise ValueError('colon with no following identifier')
    if pending: pairs.append(KimmoPair(pending, pending))
    return pairs
1883 1884 1885
class KimmoSubset:
    """A named subset of symbols, built from a whitespace-separated description."""

    def __init__(self, name, description):
        symbols = tokenize.regexp(description, _kimmo_terminal_regexp_fsa)
        self._name = name
        self._description = description
        # duplicates are dropped by going through a set
        self._subset = list(set(symbols))

    def name(self):
        return self._name

    def description(self):
        return self._description

    def subset(self):
        return self._subset

    def __repr__(self):
        fields = (self.name(), self.description(),)
        return '<KimmoSubset %s: %s>' % fields
1896
class KimmoDefaults:
    """The set of default pairs parsed from a description string."""

    def __init__(self, description):
        self._description = description
        self._defaults = set()
        # an empty token type selects the non-FSA tokenizer pattern
        parsed = parse_pair_sequence(description, '')
        self.defaults().update(parsed)

    def defaults(self):
        return self._defaults

    def __repr__(self):
        return '<KimmoDefaults %s>' % (self._description,)
1906
class KimmoRule:
    """Rule interface: every method raises RuntimeError('unimplemented: ...')."""

    def pairs(self):
        message = 'unimplemented: KimmoRule.pairs()'
        raise RuntimeError(message)

    def right_advance(self, current_states, input, output, subsets):
        message = 'unimplemented: KimmoRule.right_advance()'
        raise RuntimeError(message)
1911 1912
1913 -class KimmoArrowRule:
1914 """ 1915 Two level rule 1916 """ 1917
def leftFSA(self):
    """Return the left-context FSA stored in self._left_fsa."""
    fsa = self._left_fsa
    return fsa
def rightFSA(self):
    """Return the right-context FSA stored in self._right_fsa."""
    fsa = self._right_fsa
    return fsa
def pairs(self):
    """Return the set of pairs collected for this rule (self._pairs)."""
    collected = self._pairs
    return collected
def arrow(self):
    """Return the rule's arrow token (self._arrow)."""
    token = self._arrow
    return token
def lhpair(self):
    """Return the pair on the left-hand side of the arrow (self._lhpair)."""
    pair = self._lhpair
    return pair
1923
def __init__(self, name, description):
    """
    Tokenize the rule description and parse it.

    @param name: rule name, used in __repr__
    @param description: rule text, e.g. '0:e ==> [Csib (c h)] +:0 _ s'
    """
    self._pairs = set()
    self._negated = False
    self._name = name
    self._description = description

    tokens = list(tokenize.regexp(description, _kimmo_rule))
    self._parse(tokens)
1931
def __repr__(self):
    """Return '<KimmoArrowRule NAME: DESCRIPTION>'."""
    fields = (self._name, self._description)
    return '<KimmoArrowRule %s: %s>' % fields
1934
1935 - def advance(self, fsa, current_states, input, output, subsets):
1936 """Returns a tuple of (next_states, contains_halt_state)""" 1937 result = [] 1938 contains_halt_state = False 1939 for current_state in current_states: 1940 for next_state in fsa.forward_traverse(current_state): 1941 ok = False 1942 for pair in fsa._labels[(current_state, next_state)]: 1943 if pair.matches(input, output, subsets): 1944 ok = True 1945 break 1946 if (ok): 1947 if (next_state in fsa.finals()): contains_halt_state = True 1948 if not(next_state in result): result.append(next_state) 1949 return (result, contains_halt_state)
1950 1951
1952 - def right_advance(self, current_states, input, output, subsets):
1953 return self.advance(self.rightFSA(), current_states, input, output, subsets)
1954
1955 - def matches(self, input, output, subsets):
1956 """Does this rule's LHS match this input/output pair? 1957 1958 1959 If it doesn't, return None. If it does, return True if the rule must pass, False if the rule must fail.""" 1960 1961 1962 if (self.arrow() == '==>'): 1963 if self.lhpair().matches(input, output, subsets): 1964 return True 1965 else: 1966 return None 1967 elif (self.arrow() == '<=='): 1968 if self.lhpair().matches(input, output, subsets, negatedOutputMatch=True): 1969 return False 1970 else: 1971 return None 1972 elif (self.arrow() == '/<='): 1973 if self.lhpair().matches(input, output, subsets, negatedOutputMatch=False): 1974 return False 1975 else: 1976 return None 1977 elif (self.arrow() == '<=>'): 1978 if self.lhpair().matches(input, output, subsets, negatedOutputMatch=False): 1979 return True 1980 elif self.lhpair().matches(input, output, subsets, negatedOutputMatch=True): 1981 return False 1982 else: 1983 return None 1984 else: 1985 raise RuntimeError('unknown arrow: '+self.arrow())
1986
1987 - def _parse(self, tokens):
1988 1989 (end_pair, tree) = self._parse_pair(tokens, 0) 1990 lhpair = self._pair_from_tree(tree) 1991 self._lhpair = lhpair 1992 self._pairs.add(lhpair) 1993 1994 end_arrow = self._parse_arrow(tokens, end_pair) 1995 (end_left, lfsa) = self._parse_context(tokens, end_arrow, True) 1996 end_slot = self._parse_slot(tokens, end_left) 1997 (end_right, rfsa) = self._parse_context(tokens, end_slot, False) 1998 if not(end_right == len(tokens)): 1999 raise ValueError('unidentified tokens') 2000 2001 self._left_fsa = lfsa 2002 self._right_fsa = rfsa
2003
2004 - def _next_token(self, tokens, i, raise_error=False):
2005 if i >= len(tokens): 2006 if raise_error: 2007 raise ValueError('ran off end of input') 2008 else: 2009 return None 2010 return tokens[i]
2011
2012 - def _pair_from_tree(self, tree):
2013 if (tree.node != 'Pair'): raise RuntimeException('expected Pair, got ' + str(tree)) 2014 if len(tree) == 1: 2015 return KimmoPair(tree[0], tree[0]) 2016 else: 2017 return KimmoPair(tree[0], tree[2])
2018
2019 - def _parse_pair(self, tokens, i):
2020 # print 'parsing pair at ' + str(i) 2021 t1 = self._next_token(tokens, i, True) 2022 if t1 in _special_tokens: raise ValueError('expected identifier, not ' + t1) 2023 t2 = t1 2024 j = i + 1 2025 if self._next_token(tokens, j) == ':': 2026 t2 = self._next_token(tokens, j+1, True) 2027 if t2 in _special_tokens: raise ValueError('expected identifier, not ' + t2) 2028 j = j + 2 2029 tree = Tree('Pair', tokens[i:j]) 2030 else: 2031 tree = Tree('Pair', [tokens[i]]) 2032 #print str(self._pair_from_tree(tree)) + ' from ' + str(i) + ' to ' + str(j) 2033 return (j, tree)
2034 2035
2036 - def _parse_arrow(self, tokens, i):
2037 self._arrow = self._next_token(tokens, i, True) 2038 if not(self.arrow() in _arrows): 2039 raise ValueError('expected arrow, not ' + self.arrow()) 2040 #print 'arrow from ' + str(i) + ' to ' + str(i+1) 2041 return i + 1
2042 2043
2044 - def _parse_slot(self, tokens, i):
2045 slot = self._next_token(tokens, i, True) 2046 if slot != '_': 2047 raise ValueError('expected _, not ' + slot) 2048 # print 'slot from ' + str(i) + ' to ' + str(i+1) 2049 return i + 1
2050 2051
2052 - def _parse_context(self, tokens, i, reverse):
2053 (j, tree) = self._parse_list(tokens, i) 2054 if j == i: return (i, None) 2055 2056 sigma = set() 2057 self._collect_alphabet(tree, sigma) 2058 fsa = FSA(sigma) 2059 final_state = self._build_fsa(fsa, fsa.new_state(), tree, reverse) 2060 fsa.set_final([final_state]) 2061 #fsa.pp() 2062 dfa = fsa.dfa() 2063 #dfa.pp() 2064 dfa.prune() 2065 #dfa.pp() 2066 return (j, dfa)
2067 2068
2069 - def _collect_alphabet(self, tree, sigma):
2070 if tree.node == 'Pair': 2071 pair = self._pair_from_tree(tree) 2072 sigma.add(pair) 2073 self._pairs.add(pair) 2074 else: 2075 for d in tree: self._collect_alphabet(d, sigma)
2076 2077
2078 - def _parse_list(self, tokens, i, type='Cons'):
2079 # print 'parsing list at ' + str(i) 2080 t = self._next_token(tokens, i) 2081 if t == None or t in _non_list_initial_special_tokens: 2082 # print ' failing immediately ' 2083 return (i, None) 2084 (j, s) = self._parse_singleton(tokens, i) 2085 (k, r) = self._parse_list(tokens, j, type) 2086 # print (k,r) 2087 if r == None: 2088 # print ' returning (%d, %s)' % (j, s) 2089 return (j, s) 2090 tree = Tree(type, [s, r]) 2091 # print ' returning (%d, %s)' % (k, tree) 2092 return (k, tree)
2093 2094
2095 - def _parse_singleton(self, tokens, i):
2096 # print 'parsing singleton at ' + str(i) 2097 t = self._next_token(tokens, i, True) 2098 j = i 2099 result = None 2100 if t == '(': 2101 (j, result) = self._parse_list(tokens, i + 1, 'Cons') 2102 if result == None: raise ValueError('missing contents of (...)') 2103 t = self._next_token(tokens, j, True) 2104 if t != ')': raise ValueError('missing final parenthesis, instead found ' + t) 2105 j = j + 1 2106 elif t == '[': 2107 (j, result) = self._parse_list(tokens, i + 1, 'Or') 2108 if result == None: raise ValueError('missing contents of [...]') 2109 t = self._next_token(tokens, j, True) 2110 if t != ']': raise ValueError('missing final bracket, instead found ' + t) 2111 j = j + 1 2112 elif t in _special_tokens: 2113 raise ValueError('expected identifier, found ' + t) 2114 else: 2115 (j, tree) = self._parse_pair(tokens, i) 2116 result = tree 2117 t = self._next_token(tokens, j) 2118 if t in ['*', '&', '?']: 2119 j = j + 1 2120 result = Tree(t, [result]) 2121 return (j, result)
2122 2123
2124 - def _build_fsa(self, fsa, entry_node, tree, reverse):
2125 if tree.node == 'Pair': 2126 return self._build_terminal(fsa, entry_node, self._pair_from_tree(tree)) 2127 elif tree.node == 'Cons': 2128 return self._build_seq(fsa, entry_node, tree[0], tree[1], reverse) 2129 elif tree.node == 'Or': 2130 return self._build_or(fsa, entry_node, tree[0], tree[1], reverse) 2131 elif tree.node == '*': 2132 return self._build_star(fsa, entry_node, tree[0], reverse) 2133 elif tree.node == '&': 2134 return self._build_plus(fsa, entry_node, tree[0], reverse) 2135 elif tree.node == '?': 2136 return self._build_qmk(fsa, entry_node, tree[0], reverse) 2137 else: 2138 raise RuntimeError('unknown tree node'+tree.node)
2139 2140
2141 - def _build_terminal(self, fsa, entry_node, terminal):
2142 new_exit_node = fsa.new_state() 2143 fsa.insert(entry_node, terminal, new_exit_node) 2144 #print '_build_terminal(%d,%s) -> %d' % (entry_node, terminal, new_exit_node) 2145 return new_exit_node
2146 2147
2148 - def _build_plus(self, fsa, node, tree, reverse):
2149 node1 = self._build_fsa(fsa, node, tree[0], reverse) 2150 fsa.insert(node1, epsilon, node) 2151 return node1
2152 2153
2154 - def _build_qmk(self, fsa, node, tree, reverse):
2155 node1 = fsa.new_state() 2156 node2 = self._build_fsa(fsa, node1, tree, reverse) 2157 node3 = fsa.new_state() 2158 fsa.insert(node, epsilon, node1) 2159 fsa.insert(node, epsilon, node3) 2160 fsa.insert(node2, epsilon, node3) 2161 return node3
2162 2163
2164 - def _build_star(self, fsa, node, tree, reverse):
2165 node1 = fsa.new_state() 2166 node2 = self._build_fsa(fsa, node1, tree, reverse) 2167 node3 = fsa.new_state() 2168 fsa.insert(node, epsilon, node1) 2169 fsa.insert(node, epsilon, node3) 2170 fsa.insert(node2, epsilon, node1) 2171 fsa.insert(node2, epsilon, node3) 2172 return node3
2173 2174
2175 - def _build_seq(self, fsa, node, tree0, tree1, reverse):
2176 (d0, d1) = (tree0, tree1) 2177 if reverse: (d0, d1) = (d1, d0) 2178 node1 = self._build_fsa(fsa, node, d0, reverse) 2179 node2 = self._build_fsa(fsa, node1, d1, reverse) 2180 # print '_build_seq(%d,%s,%s) -> %d,%d' % (node, tree0, tree1, node1, node2) 2181 return node2
2182
2183 - def _build_or(self, fsa, node, tree0, tree1, reverse):
2184 node0 = fsa.new_state() 2185 node1 = fsa.new_state() 2186 node2 = self._build_fsa(fsa, node0, tree0, reverse) 2187 node3 = self._build_fsa(fsa, node1, tree1, reverse) 2188 node4 = fsa.new_state() 2189 fsa.insert(node, epsilon, node0) 2190 fsa.insert(node, epsilon, node1) 2191 fsa.insert(node2, epsilon, node4) 2192 fsa.insert(node3, epsilon, node4) 2193 return node4
2194 2195
class KimmoFSARule:
    """A rule given directly as a state-transition table.

    `pair_description` is parsed into the column pairs; each entry of
    `state_descriptions` is ``(index, is_final, next_state_array)`` where
    next_state_array has one target state per column.
    """

    def __init__(self, name, pair_description, state_descriptions):
        """Build the tables; raises ValueError for a malformed state description."""
        self._name = name
        self._pairs = parse_pair_sequence(pair_description, 'FSA')
        self.transitions = {}
        self.is_final = {}
        self._state_descriptions = state_descriptions
        # validate transitions
        for (index, is_final, next_state_array) in state_descriptions:
            if not(is_final == True or is_final == False):
                raise ValueError('each state description must take the form (index, True/False, [next_state_indices...])')

            if len(next_state_array) != len(self.pairs()):
                raise ValueError('transition array of wrong size '+ str(len(next_state_array)) + ' ' + str(len(self.pairs())))
            self.transitions[index] = next_state_array
            self.is_final[index] = is_final

    def name(self): return self._name
    def pairs(self): return self._pairs
    def start(self): return self._state_descriptions[0][0]
    def is_state(self, index): return index in self.transitions

    def contains_final(self, indices):
        """Return True if any state in `indices` is final."""
        for i in indices:
            if self.is_final[i]: return True
        return False

    def sorted_pairs(self, subsets):
        """Return ``[column_index, pair, size1, size2]`` entries, smallest sizes first.

        size1/size2 are the sizes of the subsets named by the pair's input
        and output (1 when the symbol is not a subset name).  The column
        index is kept because it is the position in each transition row.
        The sort is stable, so ties keep table order.
        """
        sorted_with_index = []
        for idx, pair in enumerate(self.pairs()):
            size1 = 1
            size2 = 1
            if pair.input() in subsets: size1 = len(subsets[pair.input()])
            if pair.output() in subsets: size2 = len(subsets[pair.output()])
            sorted_with_index.append([idx, pair, size1, size2])

        # Same ordering as the two-field mycompare(): size1, then size2.
        sorted_with_index.sort(key=lambda entry: (entry[2], entry[3]))
        return sorted_with_index

    # two field compare.
    def mycompare(self, x1, y1, x2=0, y2=0):
        """cmp-style comparison on (x1, x2) versus (y1, y2); kept for callers."""
        if x1 == y1: return x2-y2
        else: return x1-y1

    def right_advance(self, current_states, input, output, subsets):
        """Advance every state in `current_states` over one input/output pair.

        For each state only the first matching column (in sorted_pairs
        order) is used, even when it leads to a state that is not in the
        table.  Returns (next_states, contains_halt_state).
        """
        # The column ordering does not depend on the state: compute it once.
        ordered_columns = self.sorted_pairs(subsets)

        next_states = []
        contains_halt_state = False
        for index in current_states:
            for i, pair, size1, size2 in ordered_columns:
                if not pair.matches(input, output, subsets):
                    continue
                next_state = self.transitions[index][i]
                if next_state in self.transitions:
                    if next_state not in next_states:
                        next_states.append(next_state)
                    if self.is_final[next_state]: contains_halt_state = True
                # first match decides; less specific columns are not consulted
                break

        return (next_states, contains_halt_state)

    def __repr__(self):
        return '<KimmoFSARule %s>' % (self.name(), )
2312 2313
class KimmoWord:
    """A lexicon entry: its letters, its gloss, and the name to continue with."""

    def __init__(self, letters, gloss, next_alternation=None):
        self._next_alternation = next_alternation
        self._gloss = gloss
        self._letters = letters

    def letters(self):
        """Return the letters of the entry."""
        return self._letters

    def gloss(self):
        """Return the gloss of the entry."""
        return self._gloss

    def next_alternation(self):
        """Return the follow-on alternation name, or None if none was given."""
        return self._next_alternation

    def __repr__(self):
        shown = (self.letters(), self.gloss())
        return '<KimmoWord %s: %s>' % shown
2328 2329
class KimmoLexicon:
    """A named list of words together with a letter trie built over them."""

    def __init__(self, name, words):
        self._name = name
        self._words = words
        self._trie = self.build_trie(words)

    def __repr__(self):
        return '<KimmoLexicon ' + self.name() + '>'

    def name(self): return self._name
    def words(self): return self._words
    def trie(self): return self._trie # tree is ([KimmoWord], [ (char, sub-trie), ... ])

    def build_trie(self, words, word_position=0):
        """Return the trie node for `words`, looking at letter `word_position`.

        A node is ``(ending_here, sub_tries)``: the words whose length is
        exactly `word_position`, and one ``(char, node)`` entry per
        distinct letter found at that position in the longer words.
        """
        if not words: return ([], [])

        # Group the words that still have a letter at this position.
        first_chars = {}
        for w in words:
            if len(w.letters()) <= word_position: continue
            first_chars.setdefault(w.letters()[word_position], []).append(w)

        sub_tries = [ (c, self.build_trie(sub_words, word_position+1))
                      for c, sub_words in first_chars.items() ]
        return ( [w for w in words if len(w.letters()) == word_position], sub_tries )
2360 2361
class KimmoAlternation:
    """A name bound to a list of names (looked up later by KimmoMorphology)."""

    def __init__(self, name, lexicon_names):
        self._lexicon_names = lexicon_names
        self._name = name

    def name(self):
        """Return this alternation's name."""
        return self._name

    def lexicon_names(self):
        """Return the names this alternation refers to."""
        return self._lexicon_names

    def __repr__(self):
        parts = ['<KimmoAlternation ', self.name(), ': ', str(self.lexicon_names()), '>']
        return ''.join(parts)
2373 2374
class KimmoMorphology:
    """Alternations and lexicons indexed by name, plus traversal helpers.

    A "state" is a list whose items are either KimmoLexicon objects or
    trie nodes of the form ``(words, [(char, sub_trie), ...])``.
    """

    def __init__(self, alternations, lexicons, start='Begin'):
        self.alternations = {}
        self.lexicons = {}
        self._start = start
        for a in alternations: self.alternations[a.name()] = a
        for l in lexicons: self.lexicons[l.name()] = l

    def set_boundary(self, boundary_char):
        """Remember the boundary character (the attribute exists only after this call)."""
        self.boundary = boundary_char

    def initial_state(self):
        """Return the state reached by expanding the start name."""
        return self._collect(self._start)

    def possible_next_characters(self, state):
        """Return the set of characters that can follow from `state`."""
        chars = set()
        self._possible_next_characters(state, chars)
        return chars

    def _trie_node(self, item):
        """Return the (words, sub_tries) node for a state item."""
        if isinstance(item, KimmoLexicon):
            return item.trie()
        return item

    # from the lexicon, return the next possible character from all words that match the current state
    # for instance, if lexicon has iti, ili, and iyi, and current state is first [i], then
    # this function will return a set of (t,l,y)
    def _possible_next_characters(self, state, chars):
        for s in state:
            (words, sub_tries) = self._trie_node(s)
            # a word ending here continues into whatever its next alternation names
            for w in words:
                self._possible_next_characters(self._collect(w.next_alternation()), chars)
            for c, sub_trie in sub_tries:
                chars.add(c)

    def _collect(self, name):
        """Expand `name` into a list of lexicons.

        An alternation name is expanded recursively through the names it
        lists; a lexicon name gives a one-element list; None or an unknown
        name gives an empty list.
        """
        if name is None:
            return []
        elif name in self.alternations:
            result = []
            for ln in self.alternations[name].lexicon_names():
                result.extend(self._collect(ln))
            return result
        elif name in self.lexicons:
            return [ self.lexicons[name] ]
        else:
            # unknown names are tolerated and contribute nothing
            return []

    def advance(self, state, char):
        """Generator over ``(next_state, completed_words)`` after consuming `char`.

        Results reached through words that end in `state` are yielded
        first; the sub-tries that match `char` directly are yielded last,
        together, with an empty word list.
        """
        result = []
        for s in state:
            (words, sub_tries) = self._trie_node(s)
            for w in words:
                for v in self._advance_through_word(w, char):
                    yield v
            for c, sub_trie in sub_tries:
                if c == char: result.append(sub_trie)
        if len(result) > 0:
            yield (result, [])

    def _advance_through_word(self, word, char):
        """Advance from the end of `word`; words with a gloss are prepended to the results."""
        for s in self.advance(self._collect(word.next_alternation()), char):
            state, words = s
            if word.gloss():
                yield (state, [word] + words)
            else:
                yield s
2452
class KimmoRuleSet:
    """A set of rules plus the symbol tables needed to run them.

    Holds the rule list, a name -> subset table, the null symbol and the
    "pair alphabet" (a set of (input, output) tuples).  generate() and
    recognize() both drive the recursive generator _generate().
    """

    def __init__(self, subsets, defaults, rules, null='0'):
        """Build the subset table and the pair alphabet.

        Raises ValueError if a default pair mentions a subset.
        """
        self.debug = False
        self._rules = rules
        self._pair_alphabet = set()
        self._subsets = {}
        self._null = null
        for s in subsets:
            self._subsets[s.name()] = s.subset()

        for kd in defaults:
            for pair in kd.defaults():
                # defaults shouldn't contain subsets
                if self.is_subset(pair.input()) or self.is_subset(pair.output()):
                    raise ValueError('default ' + str(pair) + ' contains subset')
                self._pair_alphabet.add( ( pair.input() , pair.output() ) )
        # pairs mentioned by rules join the alphabet too, unless they involve a subset
        for r in self.rules():
            for kp in r.pairs():
                if (not (self.is_subset(kp.input()) or self.is_subset(kp.output()))):
                    self._pair_alphabet.add( ( kp.input(), kp.output() ) )

    def rules(self): return self._rules
    def subsets(self): return self._subsets
    def is_subset(self, key):
        # True for a key starting with '~' or a key of the subset table.
        # NOTE(review): key[0] raises IndexError for an empty key.
        return key[0] == '~' or key in self.subsets()

    def null(self): return self._null;

    def _evaluate_rule_left_context(self, rule, input, output):
        """Check a rule's left context against the pairs consumed so far.

        Walks `input`/`output` from the last position backwards through
        rule.leftFSA().  Returns True when there is no left FSA or a halt
        state is reached, False when the FSA has no next states or the
        beginning is reached without halting.
        """
        fsa = rule.leftFSA()
        if fsa == None: return True
        states = [ fsa.start() ]
        i = len(input) - 1
        while i >= 0:
            next_states = []
            (result, contains_halt_state) = rule.advance(fsa, states, input[i], output[i], self.subsets())
            if contains_halt_state: return True
            for s in result:
                if not(s in next_states): next_states.append(s)
            if (len(next_states) == 0): return False
            states = next_states
            i = i - 1
        return False

    def _debug_print_input_and_output(self, position, rule_states, morphological_state,
                                      input, output, this_input, this_output, invert):
        """Print a trace of the current position when self.debug is set.

        Output is indented by one space per position.  Only position, input,
        output, this_input and this_output are actually printed; the other
        arguments are used solely by disabled code.
        """
        if (self.debug):
            #indent str
            padstring = ''
            for x in range(position): padstring = padstring + ' '

            print '%s%d %s:%s \n' % (padstring, position, this_input, this_output),
            print '%s%d: Input: ' % (padstring, position,),
            for i in input:
                print ' ' + i + ' ',
            if this_input:
                print '[' + this_input + ']...',
            print


            print '%s%d> Output: ' % (padstring, position,),
            for o in output:
                print ' ' + o + ' ',
            if this_output:
                print '<' + this_output + '>...',
            print


            # for (start, rule, fsa_states, required_truth_value) in rule_states:
            #     print ' {%d %s %s %s}' % (start, rule, fsa_states, required_truth_value)


            if False: # morphological_state:
                print ' possible input chars = %s' % invert.possible_next_characters(morphological_state)
                # print morphological_state


    # generate works by passing in the word at each position of the word
    # _generate is responsible for testing all the valid chars in the transition alphabet to see if
    # they are appropriate surface-underlying transitions.
    # it fails entirely if no valid transitions are found
    # if one is found, that is the one that is used.
    # essentially this is a possible word tree being expanded and failed on branches.
    # should return a list of matching words.
    def _generate(self, input_tokens, position, rule_states, morphological_state, input, output, result_str, result_words,
                  invert=False):
        """Recursive generator shared by generate() and recognize().

        Arguments, as used by the code below:
          input_tokens        -- the sequence being consumed
          position            -- index of the next token to consume
          rule_states         -- list of (start, rule, states, required_truth_value)
          morphological_state -- lexicon state, or a false value when no lexicon is used
          input, output       -- lists of the pair inputs/outputs chosen so far
          result_str          -- the string built so far
          result_words        -- words collected so far (None when called from generate())
          invert              -- false for generation; recognize() passes the
                                 morphology object, which is both tested for truth
                                 and called (possible_next_characters / advance)

        Yields (result_str, result_words) tuples.
        """
        # state is [ ( start, rule, states, required_truth_value ) ]

        # when at the last token or past it.
        if ((position >= len(input_tokens)) ): # and (not morphological_state)

            if (self.debug) : print ' AT END OF WORD'
            # FOR RECOGNIZER
            # this will yield some words twice, not all
            # also, recognizer is failing to put on the added information like "+genetive"

            # we are at the end, so check to see if a boundary char is in the possible set
            # and if so, add it and the remaining morphos
            if morphological_state:

                possible_next_input_chars = invert.possible_next_characters(morphological_state)
                # TODO (from the original author): use the configured boundary char
                # instead of the hard-coded '0' / '#'.
                if ('0' in possible_next_input_chars) or ('#' in possible_next_input_chars):
                    if '0' in possible_next_input_chars: boundary = '0'
                    elif '#' in possible_next_input_chars: boundary = '#'

                    # are at the end of the word, so we need to check and return those results
                    # that contain the boundary char.

                    # there can be more than one boundary word.
                    for next_morphological_state, new_words in invert.advance(morphological_state, boundary):
                        yield result_str, result_words + new_words

                # yield result_str, result_words

            else:
                # GENERATION CASE
                self._debug_print_input_and_output(position, rule_states, morphological_state, input, output, None, None, invert)
                # Every still-active rule must have the truth value it requires;
                # the for/else yields only when no rule triggered the break.
                for (start, rule, fsa_states, required_truth_value) in rule_states:
                    if isinstance(rule, KimmoArrowRule):
                        truth_value = False # since it hasn't reached a halt state
                    elif isinstance(rule, KimmoFSARule):
                        truth_value = rule.contains_final(fsa_states)

                    if (required_truth_value != truth_value):
                        if (self.debug):
                            print ' BLOCKED by rule {%d %s %s}' % (start, rule, required_truth_value)
                            print fsa_states
                        break
                    else:
                        if 0: # (self.debug):
                            print ' passed rule {%d %s %s}' % (start, rule, required_truth_value)

                else:
                    if (self.debug):
                        print ' SUCCESS!'
                    yield result_str, result_words
        else:
            if morphological_state: # recognizer; get the next possible surface chars that can result in
                                    # the next char
                possible_next_input_chars = invert.possible_next_characters(morphological_state)

            # foreach pair in our alphabet (includes per subset)
            for pair_input, pair_output in self._pair_alphabet:

                if (pair_input != self.null() and morphological_state):
                    # if this pair does not apply, i.e. it is not in the possible
                    # chars from the lexicon
                    if not(pair_input in possible_next_input_chars):
                        continue

                if invert:
                    # check if the output of a transition is in the input string (input_tokens)
                    compare_token = pair_output
                else:
                    compare_token = pair_input

                if not(compare_token == self.null() or compare_token == input_tokens[position]): continue


                self._debug_print_input_and_output(position, rule_states, morphological_state,
                                                   input, output, pair_input, pair_output, invert)


                fail = None
                next_rule_states = []

                # first, evaluate currently activated rules
                # s is the current rule & its state
                rule_state_debug = ' '
                for s in rule_states:

                    # advance one through each rule
                    (start, rule, fsa_state_set, required_truth_value) = s

                    current_state_str = '['
                    for x in fsa_state_set: current_state_str += str(x)
                    rule_state_debug += current_state_str

                    (next_fsa_state_set, contains_halt_state) = rule.right_advance(fsa_state_set, pair_input, pair_output,
                                                                                   self.subsets())

                    current_state_str = ''
                    for x in next_fsa_state_set: current_state_str += str(x)
                    if not current_state_str: current_state_str = '0 (FAIL)'
                    rule_state_debug += ('->' + current_state_str + '] ')

                    if (contains_halt_state == True and isinstance(rule, KimmoArrowRule)):
                        # an arrow rule that reached a halt state is dropped from the
                        # active list; it blocks only if it was required to fail
                        if (required_truth_value == False):
                            fail = s
                            break
                        else:
                            if (0): # (self.debug):
                                print ' passed rule {%d %s %s}' % (start, rule, required_truth_value)
                    elif (len(next_fsa_state_set) == 0):
                        # if it isn't true, then it will have to fail, bcs we are at
                        # the end of the state set.
                        # truth is evaluated by following the states until the end.
                        if (required_truth_value == True):
                            fail = s
                            break
                        else:
                            if (0): # (self.debug):
                                print ' passed rule {%d %s %s}' % (start, rule, required_truth_value)
                    else:
                        next_rule_states.append( (start, rule, next_fsa_state_set, required_truth_value) )

                if (self.debug) : print rule_state_debug

                if (fail):
                    if (self.debug):
                        print ' BLOCKED by rule %s' % (fail,)
                    continue


                # activate new KimmoArrowRules
                for rule in self.rules():
                    if not(isinstance(rule, KimmoArrowRule)): continue

                    required_truth_value = rule.matches(pair_input, pair_output, self.subsets())
                    if required_truth_value == None: continue
                    left_value = self._evaluate_rule_left_context(rule, input, output)
                    if (left_value == False):
                        if (required_truth_value == True):
                            fail = rule
                        continue


                    if (rule.rightFSA()):
                        if (self.debug):
                            print ' adding rule {%d %s %s}' % (position, rule, required_truth_value)
                        next_rule_states.append( (position, rule, [ rule.rightFSA().start() ], required_truth_value) )
                    else:
                        if (required_truth_value == False):
                            fail = rule
                            continue
                        else:
                            if (0): # (self.debug):
                                print ' passed rule ' + str(rule)

                # if did not fail, call recursively on next chars
                if (fail == None):
                    new_position = position
                    new_input = input + [pair_input]
                    new_output = output + [pair_output]
                    new_result_str = result_str

                    # A non-null symbol either consumes a token or extends the result
                    # string; which side does which depends on `invert`.
                    if (pair_input != self.null()):
                        if invert:
                            new_result_str = result_str + pair_input
                        else:
                            new_position = position + 1
                    if (pair_output != self.null()):
                        if invert:
                            new_position = position + 1
                        else:
                            new_result_str = result_str + pair_output


                    # morph state & generation steps through a char at a time.
                    # as it is, it only yields its morph if there is a valid next morphology
                    if morphological_state and pair_input != self.null():
                        for next_morphological_state, new_words in invert.advance(morphological_state, pair_input):
                            for o in self._generate(input_tokens, new_position, next_rule_states, next_morphological_state,
                                                    new_input, new_output, new_result_str,
                                                    result_words + new_words,
                                                    invert):
                                yield o
                    else:
                        for o in self._generate(input_tokens, new_position, next_rule_states, morphological_state,
                                                new_input, new_output, new_result_str, result_words, invert):
                            yield o
                else:
                    if (self.debug):
                        print ' BLOCKED by rule ' + str(fail)

    def _initial_rule_states(self):
        # Only KimmoFSARule rules are active from the start, each at its start
        # state and required to be true; arrow rules are activated inside _generate.
        return [ (0, rule, [ rule.start() ], True) for rule in self.rules() if isinstance(rule, KimmoFSARule)]

    def generate(self, input_tokens):
        """Generator: yields output strings"""
        for o, w in self._generate(input_tokens, 0, self._initial_rule_states(), None, [], [], '', None):
            yield o

    def recognize(self, input_tokens, morphology=None):
        """Recognizer: yields (input_string, input_words)

        Without a morphology, or when its initial state is empty, a message
        is printed and nothing is yielded.
        """
        morphology_state = None
        output_words = None
        invert = True
        if morphology:
            morphology_state = morphology.initial_state()
            output_words = []
            invert = morphology


        if not morphology_state:
            print "Bad Morphological State, failing recognition"
            return
        # NOTE(review): the concatenation below presumes input_tokens is a string.
        if (self.debug) : print 'recognize: ' + input_tokens
        for o in self._generate(input_tokens, 0, self._initial_rule_states(), morphology_state, [], [], '',
                                output_words, invert):
            yield o # yielding a list of possible words.
2781 2782
2783 -def _generate_test(s, input):
2784 resultlist = '%s -> ' % (input,), 2785 padlevel = len(input) + 4 2786 padstring = '' 2787 # for x in range(padlevel): padstring = padstring + ' ' 2788 2789 tmplist = '%s' % ('***NONE***'), 2790 for o in s.generate(input): 2791 tmplist = '%s%s\n' % (padstring,o,), 2792 resultlist = resultlist + tmplist 2793 padstring = '' 2794 for x in range(padlevel): padstring = padstring + ' ' 2795 tmplist = '%s' % (''), 2796 resultlist = resultlist + tmplist 2797 2798 return resultlist
2799 2800
2801 -def _recognize_test(s, input, morphology=None):
2802 resultlist = '%s <- ' % (input,), 2803 padlevel = len(input) + 4 2804 padstring = '' 2805 # for x in range(padlevel): padstring = padstring + ' ' 2806 2807 tmplist = '%s' % ('***NONE***'), 2808 for o, w in s.recognize(input, morphology): 2809 if w: 2810 # print 2811 tmplist = '\n %s %s \n' % (o, w), 2812 resultlist = resultlist + tmplist 2813 else: 2814 tmplist = '%s%s \n' % (padstring,o,), 2815 resultlist = resultlist + tmplist 2816 2817 padstring = '' 2818 for x in range(padlevel): padstring = padstring + ' ' 2819 tmplist = '%s' % (''), 2820 # print 2821 # q = re.compile('(\{|\})') 2822 # q.sub("", resultstring[0]) 2823 resultlist = resultlist + tmplist 2824 2825 return resultlist
2826
def read_kimmo_file(filename, gui=None):
    """Open `filename` for reading and return the open file object.

    The name is tried first as given (with ``~`` expanded), then inside the
    "kimmo" directory under get_basedir().  If both attempts fail, the
    second error is reported through gui.guiError() when a gui is given
    (printed otherwise), "FAILURE" is printed, and an empty string is
    returned.  On success the path that worked is printed.  The caller is
    responsible for closing the returned file.
    """
    path = os.path.expanduser(filename)
    try:
        handle = open(path, 'r')
    except IOError:
        handle = None

    if handle is None:
        # fall back to the copy under the base directory
        path = os.path.join(get_basedir(), "kimmo", filename)
        try:
            handle = open(path, 'r')
        except IOError:
            message = str(sys.exc_info()[1])
            if gui:
                gui.guiError(message)
            else:
                print(message)
            print("FAILURE")
            return ""

    print("Loaded: " + path)
    return handle
# MAIN
# Alternative entry points, kept for reference:
#   if __name__ == '__main__': KimmoGUI(None, None)
#   if __name__ == '__main__': tkImageView("")
if __name__ == '__main__':
    filename_lex = ''
    filename_rul = ''
    filename_batch_test = ''
    recognize_string = ''
    generate_string = ''
    console_debug = 0

    # Classify each command-line argument.  argv[0] is the script itself and
    # is skipped so that its path can never be mistaken for an input file.
    # If -r/-g or a batch file is given (together with a rule file) the
    # program runs in command-line mode; otherwise the GUI is started.
    for x in sys.argv[1:]:
        if ".lex" in x: filename_lex = x
        elif ".rul" in x: filename_rul = x
        elif ".batch" in x: filename_batch_test = x
        elif x.startswith("-r:"): recognize_string = x[3:]
        elif x.startswith("-g:"): generate_string = x[3:]
        elif x == "debug": console_debug = 1

    print('Tips:')
    print('kimmo.cfg is loaded by default, so if you name your project that, ')
    print("it will be loaded at startup\n")

    print('For commandline operation:')
    print(' (for instance if you want to use a different editor)')
    print("To Recognize:")
    print(" % python kimmo.py english.lex english.rul -r:cats")
    print("To Generate:")
    print(" % python kimmo.py english.lex english.rul -g:cat+s")
    print("To Batch Test:")
    print(" % python kimmo.py english.lex english.rul english.batch_test")
    print("With Debug and Tracing:")
    print(" % python kimmo.py english.lex english.rul -r:cats debug\n")

    if (recognize_string or generate_string or filename_batch_test) and filename_rul:
        kimmoinstance = KimmoControl("","",filename_lex,filename_rul,console_debug)

        # creation failed, stop -- with a non-zero status so callers can tell
        if not kimmoinstance.ok :
            print(kimmoinstance.errors)
            sys.exit(1)

        if recognize_string:
            recognize_results = kimmoinstance.recognize(recognize_string)
            print(recognize_results)

        if generate_string:
            generate_results = kimmoinstance.generate(generate_string)
            print(generate_results) # remember to format

        if filename_batch_test: # run a batch
            kimmoinstance.batch(filename_batch_test)

    else:
        KimmoGUI(None, None)

# constructor takes arguments:
# KimmoControl(lexicon_string, rule_string, lexicon_filename, rule_filename, debug)
# the constructor requires both lexicon and rules for recognition.
# you can provide either the file contents as a string, or as a filename.
# if only used to generate, only a rule file/string is necessary.

# Example usage:
# kimmoinstance = KimmoControl("","",'','./englex/english.rul',0)
# kimmoinstance = KimmoControl("","",'kimmo.lex','kimmo.rul',0)
# generate_results = kimmoinstance.generate("cat+s")
# print generate_results

# recognize_results = kimmoinstance.recognize("cats")
# print recognize_results