Package nltk_lite :: Package contrib :: Module combined
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.combined

  1  import math 
  2  import os 
  3   
  4  # tagger importing 
  5  from nltk_lite import tag 
  6  from nltk_lite.tag import SequentialBackoff 
  7  # work-around while marshal is not moved into standard tree 
  8  from nltk_lite.contrib.marshal import MarshalDefault ; Default = MarshalDefault 
  9  from nltk_lite.contrib.marshal import MarshalUnigram ; Unigram = MarshalUnigram 
 10  from nltk_lite.contrib.marshal import MarshalAffix   ; Affix   = MarshalAffix 
 11  from nltk_lite.contrib.marshal import MarshalNgram   ; Ngram   = MarshalNgram 
 12  from nltk_lite.contrib.marshalbrill import * 
 13   
14 -class CombinedTagger (SequentialBackoff):
def __init__(self):
    """Create an empty CombinedTagger.

    The tagger chain starts out empty and no Brill tagger is attached;
    both are filled in later by the ``_append_*`` helpers or by
    ``unmarshal``.
    """
    # Brill tagger is kept apart from the backoff chain.
    self._brill = None
    # Backoff chain; each appended tagger backs off to the previous one.
    self._tagger = []
18
def _append_default(self, default_tag, verbose=False):
    """Append a default tagger for ``default_tag`` to the chain.

    ``verbose`` is accepted for signature parity with the other
    ``_append_*`` helpers but is not used here.
    """
    default_tagger = Default(default_tag)
    self._tagger.append(default_tagger)
21
def _append_affix(self, a_len, w_len, train_sents, verbose=False):
    """Append an affix tagger backed off to the current last tagger, then train it.

    ``a_len`` and ``w_len`` are forwarded unchanged to the ``Affix``
    constructor. The chain must already contain at least one tagger,
    otherwise looking up the backoff raises ``IndexError``.
    """
    affix_tagger = Affix(a_len, w_len, backoff=self._tagger[-1])
    self._tagger.append(affix_tagger)
    # The corpus is handed to train() wrapped in a one-element list.
    affix_tagger.train([train_sents], verbose)
25
def _append_unigram(self, train_sents, verbose=False):
    """Append a unigram tagger backed off to the current last tagger, then train it.

    The chain must already contain at least one tagger, otherwise
    looking up the backoff raises ``IndexError``.
    """
    unigram_tagger = Unigram(backoff=self._tagger[-1])
    self._tagger.append(unigram_tagger)
    # NOTE(review): train_sents is passed unwrapped here, whereas the affix
    # and n-gram helpers pass [train_sents] -- confirm which form is intended.
    unigram_tagger.train(train_sents, verbose)
29
def _append_ngram(self, size, train_sents, verbose=False, cutoff_value=0.001):
    """Append an n-gram tagger of order ``size``, backed off to the last tagger, and train it.

    The cutoff passed to ``Ngram`` is ``len(train_sents) * cutoff_value``
    rounded down with ``math.floor``, so ``train_sents`` must support
    ``len()``. The chain must already contain at least one tagger.
    """
    corpus_size = len(train_sents)
    cutoff = math.floor(corpus_size * cutoff_value)

    ngram_tagger = Ngram(size, cutoff=cutoff, backoff=self._tagger[-1])
    self._tagger.append(ngram_tagger)
    # The corpus is handed to train() wrapped in a one-element list.
    ngram_tagger.train([train_sents], verbose)
34
35 - def _append_brill (self, train_sents, max_rules, min_score=2, trace=0):
51
52 - def marshal (self, basepath):
53 # create the model files, one for each tagger (*.mod) plus a general one 54 handler = file(os.path.join(basepath, "model.mrs"), "w") 55 56 for index in range(len(self._tagger)): 57 filename = os.path.join(basepath, "tagger%02d.mod" % index) 58 handler.write("%s %s\n" % (self._tagger[index]._classname, filename) ) 59 self._tagger[index].marshal(filename) 60 61 filename = os.path.join(basepath, "tagger%02d.mod" % (index+1)) 62 handler.write("%s %s\n" % (self._brill._classname, filename) ) 63 self._brill.marshal(filename) 64 65 handler.close()
66
67 - def unmarshal (self, basepath):
68 # clear taggers 69 self._tagger = [] 70 self._brill = None 71 72 # read model's configuration 73 filename = os.path.join(basepath, "model.mrs") 74 handler = file(filename, "r") 75 model = handler.readlines() 76 handler.close() 77 model = [line[:-1] for line in model] # remove "\n"s 78 model = [line for line in model if len(line) > 0] # remove empty lines 79 80 # tagger by tagger 81 for tagger in model: 82 tagger_type, tagger_file = tagger.split(" ") 83 if tagger_type == "DefaultTagger": 84 self._tagger.append( Default("") ) 85 self._tagger[-1].unmarshal(tagger_file) 86 elif tagger_type == "AffixTagger": 87 self._tagger.append( Affix(1, 2, backoff=self._tagger[-1]) ) 88 self._tagger[-1].unmarshal(tagger_file) 89 elif tagger_type == "UnigramTagger": 90 self._tagger.append( Unigram(backoff=self._tagger[-1]) ) 91 self._tagger[-1].unmarshal(tagger_file) 92 elif tagger_type == "NgramTagger": 93 self._tagger.append( Ngram(2, backoff=self._tagger[-1]) ) 94 self._tagger[-1].unmarshal(tagger_file) 95 elif tagger_type == "BrillTagger": 96 self._brill = Brill(self._tagger[-1], []) 97 self._brill.unmarshal(tagger_file) 98 else: 99 print "error, tagger type not recognized."
100
101 - def exemple_train (self, train_sents, verbose=False):
102 self._append_default("N") 103 104 self._append_affix(-2, 6, train_sents, verbose) 105 self._append_affix(-3, 7, train_sents, verbose) 106 self._append_affix(-4, 8, train_sents, verbose) 107 self._append_affix(-5, 9, train_sents, verbose) 108 109 self._append_unigram(train_sents, verbose) 110 111 self._append_ngram(2, train_sents, verbose) 112 113 self._append_brill(train_sents, 1, 2, trace=3)
114
115 - def tag_one (self, token):
116 return self._tagger[-1].tag_one(token)
117
118 - def tag (self, tokens, verbose=False):
119 return self._tagger[-1].tag(tokens, verbose)
120
121 -def create_tagger (train_sents):
122 ct = CombinedTagger() 123 # ct.example_train(train_sents, True) 124 ct.unmarshal("tresoldi") 125 126 tokens = "Mauro viu o livro sobre a mesa".split() 127 print list(ct.tag(tokens)) 128 129 # tests 130 acc = tag.accuracy(ct, [train_sents]) 131 print 'Accuracy = %4.2f%%' % (100 * acc)
132