Package nltk_lite :: Package contrib :: Module marshal
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.marshal

  1  # Marshaling code, contributed by Tiago Tresoldi 
  2  # This saves/loads models to/from plain text files. 
  3  # Unlike Python's shelve and pickle utilities, 
  4  # this is useful for inspecting or tweaking the models. 
  5  # We may incorporate this as a marshal method in each model. 
  6   
  7  # TODO: describe each tagger marshal format in the epydocs? 
  8   
  9  from itertools import islice 
 10  import re 
 11   
 12  import nltk_lite.tag as tag 
 13  from nltk_lite.corpora import brown 
 14   
 15  # marshal-classes 
 16   
class MarshalDefault (tag.Default):
    """
    A C{tag.Default} tagger whose model (the single tag stored in
    C{self._tag}) can be saved to and loaded from a plain text file.
    The file contains the tag and nothing else.
    """
    _classname = "DefaultTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.

        @param filename: Name of the file to which save the model (will
            be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = open(filename, "w")
        try:
            handler.write(self._tag)
        finally:
            # close the file even if the write fails
            handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).

        @param filename: Name of the file from which the model will
            be read.
        @type filename: C{string}
        """
        handler = open(filename, "r")
        try:
            contents = handler.read()
        finally:
            handler.close()

        # marshal() writes no line terminator, but a text editor used to
        # tweak the file usually appends one; it is not part of the tag.
        self._tag = contents.rstrip("\r\n")

class MarshalUnigram (tag.Unigram):
    """
    A C{tag.Unigram} tagger whose model (the C{self._model} mapping from
    text to tag) can be saved to and loaded from a plain text file.
    The file holds one C{text:tag} entry per line.
    """
    _classname = "UnigramTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.

        @param filename: Name of the file to which save the model (will
            be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = open(filename, "w")
        try:
            for text, label in self._model.iteritems():
                handler.write("%s:%s\n" % (text, label))
        finally:
            # close the file even if a write fails
            handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model);
        entries read from the file are added to C{self._model} without
        clearing it first.

        @param filename: Name of the file from which the model will
            be read.
        @type filename: C{string}
        @raise ValueError: If a non-blank line is not of the form
            C{text:tag}.
        """
        handler = open(filename, "r")
        try:
            lines = handler.readlines()
        finally:
            handler.close()

        # The greedy first group lets the text itself contain ":"; the
        # tag is whatever follows the last usable separator.
        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
        for index, line in enumerate(lines):
            if not line.strip():
                # tolerate blank lines left behind by manual editing
                continue
            m = pattern.match(line)
            if m is None:
                raise ValueError("%s:%d: malformed entry %r (expected 'text:tag')"
                                 % (filename, index + 1, line))
            text, label = m.groups()
            self._model[text] = label

class MarshalAffix (tag.Affix):
    """
    A C{tag.Affix} tagger whose model can be saved to and loaded from a
    plain text file. The file starts with the two header lines
    C{length <int>} and C{minlength <int>} (the values of
    C{self._length} and C{self._minlength}), followed by one
    C{text:tag} entry per line taken from C{self._model}.
    """
    _classname = "AffixTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.

        @param filename: Name of the file to which save the model (will
            be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = open(filename, "w")
        try:
            handler.write("length %i\n" % self._length)
            handler.write("minlength %i\n" % self._minlength)

            for text, label in self._model.iteritems():
                handler.write("%s:%s\n" % (text, label))
        finally:
            # close the file even if a write fails
            handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model);
        entries read from the file are added to C{self._model} without
        clearing it first.

        @param filename: Name of the file from which the model will
            be read.
        @type filename: C{string}
        @raise IndexError: If the C{length } or C{minlength } header
            line is missing.
        @raise ValueError: If a header value is not an integer, or if a
            non-blank entry line is not of the form C{text:tag}.
        """
        handler = open(filename, "r")
        try:
            lines = handler.readlines()
        finally:
            handler.close()

        # will fail if "length " and "minlength " are not present
        self._length = int(lines[0].split("length ")[1])
        self._minlength = int(lines[1].split("minlength ")[1])

        # The greedy first group lets the text itself contain ":"; the
        # tag is whatever follows the last usable separator.
        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
        for index, line in enumerate(lines[2:]):
            if not line.strip():
                # tolerate blank lines left behind by manual editing
                continue
            m = pattern.match(line)
            if m is None:
                # +3: two header lines, and line numbers start at 1
                raise ValueError("%s:%d: malformed entry %r (expected 'text:tag')"
                                 % (filename, index + 3, line))
            text, label = m.groups()
            self._model[text] = label

class MarshalNgram (tag.Ngram):
    """
    A C{tag.Ngram} tagger whose model can be saved to and loaded from a
    plain text file. The file starts with the header line C{n <int>}
    (the value of C{self._n}), followed by one entry per line of the
    form C{[tag1:tag2:...]:text:tag}, where the bracketed part is the
    context and C{(context, text)} is the key into C{self._model}.
    """
    _classname = "NgramTagger"

    def marshal (self, filename):
        """
        Marshals (saves to a plain text file) the tagger model. Entries
        whose context cannot be joined into a string (e.g. because it
        contains C{None}) are not written.

        @param filename: Name of the file to which save the model (will
            be overwritten if it already exists).
        @type filename: C{string}
        """
        handler = open(filename, "w")
        try:
            handler.write("n %i\n" % self._n)

            for entry in self._model:
                context, text, label = entry[0], entry[1], self._model[entry]

                try:
                    entry_str = "[%s]:%s:%s\n" % (":".join(context), text, label)
                except TypeError:
                    # None found in 'context', pass silently
                    continue
                handler.write(entry_str)
        finally:
            # close the file even if a write fails
            handler.close()

    def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model);
        entries read from the file are added to C{self._model} without
        clearing it first.

        @param filename: Name of the file from which the model will
            be read.
        @type filename: C{string}
        @raise IndexError: If the C{n } header line is missing.
        @raise ValueError: If the header value is not an integer, or if
            a non-blank entry line does not have the form
            C{[context]:text:tag} with a context matching the size
            implied by C{n}.
        """
        handler = open(filename, "r")
        try:
            lines = handler.readlines()
        finally:
            handler.close()

        # will fail if "n " is not present
        self._n = int(lines[0].split("n ")[1])

        pattern = re.compile(r'^\[(.+)\]:(.+):(.+?)$', re.UNICODE)

        # As the separator-char ":" can be used as a tag or as a text,
        # 'context_pattern' is built based on the context's size (self._n),
        # for example:
        #   self._n = 2 -> r'^(.+?)$', like 'tag1'
        #   self._n = 3 -> r'^(.+?):(.+?)$', like 'tag1:tag2'
        #   self._n = 4 -> r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
        context_pattern_str = r'^(.+?)%s$' % ( r':(.+?)' * (self._n-2) )
        context_pattern = re.compile(context_pattern_str, re.UNICODE)

        for index, line in enumerate(lines[1:]):
            if not line.strip():
                # tolerate blank lines left behind by manual editing
                continue

            # +2: one header line, and line numbers start at 1
            m = pattern.match(line)
            if m is None:
                raise ValueError("%s:%d: malformed entry %r "
                                 "(expected '[context]:text:tag')"
                                 % (filename, index + 2, line))
            context, text, label = m.groups()

            c_m = context_pattern.match(context)
            if c_m is None:
                raise ValueError("%s:%d: context %r does not match n=%d"
                                 % (filename, index + 2, context, self._n))

            key = (c_m.groups(), text)
            self._model[key] = label

def demo ():
    """
    Demonstrates unmarshaling: builds a C{MarshalNgram(3)} tagger,
    loads its model from the file "ngram.test" and prints the model.
    """
    # first 500 tagged sentences of the Brown corpus, available as
    # training data for the disabled steps below
    sentences = list(islice(brown.tagged(), 500))

    ngram_tagger = MarshalNgram(3)

    # Disabled: train the tagger and write the file that is read below.
    #ngram_tagger.train(sentences)
    #ngram_tagger.marshal("ngram.test")

    # requires an existing "ngram.test" file
    ngram_tagger.unmarshal("ngram.test")
    print(ngram_tagger._model)