Package Martel :: Module RecordReader
[hide private]
[frames | no frames]

Source Code for Module Martel.RecordReader

  1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
  2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
  3   
  4  # The existing parsers are in-memory.  For large data files, like 
  5  # swissprot, that requires too much memory. 
  6   
  7  # On the other hand, the records aren't all that large (there's just a 
  8  # lot of them.)  This module has readers which scan the input to get a 
  9  # record as a string. 
 10   
 11  import string 
 12  from mx import TextTools as TT 
 13   
 14  SIZEHINT = 100000 
 15   
class ReaderError(TypeError):
    """Raised by the record readers when the input cannot be split into records."""
18
class RecordReader:
    """Base class for the readers in this module.

    A reader wraps an input file object; subclasses supply next() and
    remainder().
    """

    def __init__(self, infile):
        # The file-like object that records are read from.
        self.infile = infile

    def next(self):
        """Return the next record; must be overridden by subclasses."""
        raise NotImplementedError

    def remainder(self):
        """Return what has not been consumed; must be overridden by subclasses."""
        raise NotImplementedError
26
def _startswith_tagtable_rest_of_line(text):
    """Build the tag table used to find lines that start with `text`.

    StartsWith selects this table when its marker text has no trailing
    newline, so anything may follow `text` on the marker line.  Every
    line that starts with `text` is tagged "begin".

    NOTE(review): the entries look like mx.TextTools' (tagobj, command,
    argument, jump-if-no-match, jump-if-match) form with jumps relative
    to the current entry -- confirm against the mx.TextTools docs.  The
    jump offsets depend on the order of the entries, so inserting or
    removing an entry requires renumbering them.
    """
    return (
        # Ensure the text starts with the given word
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # Read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line
        (None, TT.Is, '\n', +1, +4),   # matches '\n' or
        (None, TT.Is, '\r', +2, +1),   #  '\r' followed by
        (None, TT.Is, '\n', +2, +2),   #  optional '\n'

        # Check if EOF (allow EOF if no EOL found)
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # Not EOF, so look for the next line starting with text
        ("begin", TT.Word, text, +1, -5),

        # Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line then test the next line
        (None, TT.Is, '\n', +1, -2),   # '\n'
        (None, TT.Is, '\r', +2, +1),   # '\r' followed by
        (None, TT.Is, '\n', -4, -4),   #  optional '\n'
        # Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),
    )
56
def _startswith_tagtable_newline(text):
    """Build the tag table for marker lines consisting of exactly `text`.

    Intended for a StartsWith marker that ended with a newline (the
    caller strips the newline before passing `text` here).  Every line
    matching the marker is tagged "begin".

    This table has a known defect (see the XXX comment below), and
    StartsWith currently raises NotImplementedError before using it.

    NOTE(review): the entries look like mx.TextTools' (tagobj, command,
    argument, jump-if-no-match, jump-if-match) form with jumps relative
    to the current entry -- confirm against the mx.TextTools docs.  The
    jump offsets depend on the order of the entries.
    """
    return (
        # Ensure the text starts with the given word ...
        ("begin", TT.Word, text, TT.MatchFail, +1),

        # ... followed by the end of line
        (None, TT.Is, '\n', +1, +4),   # matches '\n' or
        (None, TT.Is, '\r', +2, +1),   #  '\r' followed by
        (None, TT.Is, '\n', +2, +2),   #  optional '\n'

        # Check if EOF instead of a newline (allow EOF if found)
        # Otherwise, this means the line starts with the text but
        # doesn't have a successive newline.
        # XXX BUG!  When looking for "A\n" should not fail on "AA\n"!
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),

        # Look for the next line starting with text
        ("begin", TT.Word, text, +1, -4),

        # Not what I am looking for, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Read the end of line then test the next line
        (None, TT.Is, '\n', +1, -2),   # '\n'
        (None, TT.Is, '\r', +2, +1),   # '\r' followed by
        (None, TT.Is, '\n', -4, -4),   #  optional '\n'
        # Allow termination at EOF
        (None, TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),
    )
86 87
def _find_begin_positions(text, tagtable):
    """Return the start offset of every region tagged while scanning `text`.

    `tagtable` is run over `text` with TT.tag and element 1 of each
    resulting tag is collected, in order.

    Raises ReaderError if tagging fails or stops before the end of
    `text`.
    """
    ok, found, stopped_at = TT.tag(text, tagtable)
    if not ok:
        raise ReaderError("invalid format starting with %s" % repr(text[:50]))

    if stopped_at != len(text):
        raise ReaderError(
            "could not parse to end of text (ended at %d of %d)" %
            (stopped_at, len(text)))

    starts = []
    for entry in found:
        starts.append(entry[1])
    return starts
98 99
class StartsWith(RecordReader):
    """Read records that each begin with a line starting with `text`.

    The input is read `sizehint` characters at a time into
    `self.lookahead`; `self.positions` holds the offsets in that buffer
    where records begin and `self.index` is the next one to hand out.
    """

    def __init__(self, infile, text, sizehint=SIZEHINT, lookahead=""):
        RecordReader.__init__(self, infile)
        self.text = text
        self.sizehint = sizehint

        # A newline is only allowed as the very last character of the marker.
        newline_at = text.find("\n")
        has_newline = newline_at != -1
        if has_newline:
            if newline_at != len(text) - 1:
                raise AssertionError(
                    "'\\n' can only exist at the end of the string")
            text = text[:-1]

        assert len(text), "StartsWith text size is too short"
        assert len(text) < sizehint - 2, \
               "StartsWith text size larger than sizehint allows"

        if has_newline:
            # The table meant for this case is
            # _startswith_tagtable_newline(text), which is known to be broken.
            raise NotImplementedError("there's a bug in the '\\n' option")
        self.tagtable = _startswith_tagtable_rest_of_line(text)

        self.lookahead = lookahead

        # Parse right away; this guarantees the first line is in the
        # right format.
        if len(self.lookahead) < len(text) + 2:
            self.lookahead += infile.read(sizehint)
        if not self.lookahead:
            self.positions = [0]
        else:
            self.positions = _find_begin_positions(self.lookahead,
                                                   self.tagtable)
        self.index = 0

    def next(self):
        """Return the next record as a string, or None when no data is left."""
        starts = self.positions
        current = self.index

        # Fast path: both ends of the record are already known.
        if current + 1 < len(starts):
            record = self.lookahead[starts[current]:starts[current + 1]]
            self.index = current + 1
            return record

        # Only the start of the final buffered record is known and it may
        # be incomplete, so keep just that text.
        tail_start = starts[-1]
        self.lookahead = self.lookahead[tail_start:]

        # Keep reading until a second record start shows up or the file ends.
        found = [tail_start]
        while len(found) <= 1:
            chunk = self.infile.read(self.sizehint)
            if not chunk:
                break
            self.lookahead = self.lookahead + chunk
            found = _find_begin_positions(self.lookahead, self.tagtable)

        if len(found) > 1:
            self.positions = found
            self.index = 1
            return self.lookahead[found[0]:found[1]]

        if not self.lookahead:
            # No data (either empty file or at EOF)
            self.positions = [0]
            self.index = 0
            return None

        # Hit the end of file, so whatever is buffered is the last record.
        assert len(found) == 1
        self.positions = [0]
        self.index = 0
        last_record = self.lookahead
        self.lookahead = ""
        return last_record

    def remainder(self):
        """Return (infile, text that was buffered but not yet returned)."""
        unread_from = self.positions[self.index]
        return self.infile, self.lookahead[unread_from:]
182
def _endswith_tagtable_newline(text):
    """Build the tag table for end-of-record lines consisting of `text`.

    EndsWith selects this table when its marker text ended with a
    newline (the caller strips the newline before passing `text` here).
    The end of line that follows a marker is tagged "end".

    NOTE(review): the entries look like mx.TextTools' (tagobj, command,
    argument, jump-if-no-match, jump-if-match) form with jumps relative
    to the current entry -- confirm against the mx.TextTools docs.  The
    jump offsets depend on the order of the entries.
    """
    return (
        # Is the current line the end of record marker?
        (None, TT.Word, text, +6, +1),

        # Make sure it ends the line
        ("end", TT.Is, '\n', +1, -1),  # matches '\n'
        (None, TT.Is, '\r', +4, +1),   # or a '\r' ...
        ("end", TT.Is, '\n', +1, -3),  # ... followed by '\n'
        # Lone '\r': presumably steps back over it and forward again so
        # that the '\r' itself receives the "end" tag -- TODO confirm
        (None, TT.Skip, -1, +1, +1),
        ("end", TT.Skip, +1, -5, -5),

        # Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # Not EOF, so scarf any newlines
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -8),
    )
204
def _endswith_tagtable_rest_of_line(text):
    """Build the tag table for end-of-record lines that start with `text`.

    EndsWith selects this table when its marker text has no trailing
    newline, so anything may follow `text` on the marker line.  The end
    of the marker line is tagged "end".

    NOTE(review): the entries look like mx.TextTools' (tagobj, command,
    argument, jump-if-no-match, jump-if-match) form with jumps relative
    to the current entry -- confirm against the mx.TextTools docs.  The
    jump offsets depend on the order of the entries.
    """
    return (
        # Is the current line the end of record marker?
        (None, TT.Word, text, +8, +1),

        # Read whatever else is on that line (could be nothing)
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Get the end of line
        ("end", TT.Is, '\n', +1, -2),  # matches '\n'
        (None, TT.Is, '\r', +4, +1),   # or a '\r' ...
        ("end", TT.Is, '\n', +1, -4),  # ... followed by '\n'
        # Lone '\r': presumably steps back over it and forward again so
        # that the '\r' itself receives the "end" tag -- TODO confirm
        (None, TT.Skip, -1, +1, +1),
        ("end", TT.Skip, +1, -6, -6),

        # Check if EOF (only tests when the end of record line has no \n)
        # Only time this should fail is with a bug in TT.
        ("end", TT.EOF, TT.Here, TT.MatchFail, TT.MatchOk),

        # Not the end of record marker, so read to the end of line
        (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

        # Check if EOF
        (None, TT.EOF, TT.Here, +1, TT.MatchOk),

        # Not EOF, so scarf any newlines and try again
        (None, TT.AllInSet, TT.set('\r\n'), TT.MatchFail, -10),
    )
233 234
def _find_end_positions(text, tagtable):
    """Return the end offset of every region tagged while scanning `text`.

    `tagtable` is run over `text` with TT.tag and element 2 of each
    resulting tag is collected, in order.

    Raises ReaderError if tagging fails or stops before the end of
    `text`.
    """
    ok, found, stopped_at = TT.tag(text, tagtable)
    if not ok:
        raise ReaderError("invalid format starting with %s" % repr(text[:50]))

    if stopped_at != len(text):
        raise ReaderError(
            "could not parse to end of text (ended at %d of %d)" %
            (stopped_at, len(text)))

    ends = []
    for entry in found:
        ends.append(entry[2])
    return ends
245
class EndsWith(RecordReader):
    """Read records that are each terminated by a line starting with `text`.

    `self.lookahead` is the buffered input, `self.positions` the known
    record end offsets in it, `self.index` the next end offset to use
    and `self.pos` the offset where the next record starts.
    """

    def __init__(self, infile, text, sizehint=SIZEHINT, lookahead=""):
        RecordReader.__init__(self, infile)
        self.text = text
        self.sizehint = sizehint

        # A newline is only allowed as the very last character of the marker.
        newline_at = text.find("\n")
        has_newline = newline_at != -1
        if has_newline:
            if newline_at != len(text) - 1:
                raise AssertionError(
                    "'\\n' can only exist at the end of the string")
            text = text[:-1]

        assert len(text) < sizehint - 2, \
               "EndsWith text size larger than sizehint allows"

        # With a trailing newline the marker must be the whole line;
        # without one, other text may follow it on the line.
        if has_newline:
            build_table = _endswith_tagtable_newline
        else:
            build_table = _endswith_tagtable_rest_of_line
        self.tagtable = build_table(text)

        self.lookahead = lookahead
        self.positions = []
        self.index = 0
        self.pos = 0

    def next(self):
        """Return the next record as a string, or None when no data is left.

        Raises ReaderError if the remaining input has no terminator.
        """
        # Fast path: the end of the next record is already known.
        if self.index < len(self.positions):
            stop = self.positions[self.index]
            record = self.lookahead[self.pos:stop]
            self.pos = stop
            self.index += 1
            return record

        # Nothing precomputed; start over from the unread text.
        buffered = self.lookahead[self.pos:]
        ends = []

        # Add text until a record has been found or there is no more data.
        while 1:
            chunk = self.infile.read(self.sizehint)
            if not chunk:
                if not ends:
                    ends = _find_end_positions(buffered, self.tagtable)
                break
            buffered = buffered + chunk
            ends = _find_end_positions(buffered, self.tagtable)
            if len(ends) > 1:
                # Drop the last end found; more input follows it.
                del ends[-1]
                break

        self.lookahead = buffered
        self.positions = ends

        if ends:
            self.index = 1
            self.pos = ends[0]
            return buffered[:ends[0]]

        if not buffered:
            # No data (either empty file or at EOF)
            self.pos = 0
            self.index = 0
            return None

        # Probably an unterminated record, but the file may simply lack a
        # final end-of-line character, so retry with one appended.
        if buffered[-1:] not in "\r\n":
            padded = buffered + "\n"
            ends = _find_end_positions(padded, self.tagtable)
            if ends:
                assert len(ends) == 1, "this case should not occur"
                end = ends[0]
                assert end == len(padded), "wrong sizes: %d and %d" % \
                       (end, len(padded))
                self.lookahead = ""
                self.positions = []
                self.pos = 0
                self.index = 0
                return buffered

        # Really could not find a terminator
        self.index = 0
        self.pos = 0
        raise ReaderError("Last record not terminated: at %s ..." %
                          repr(self.lookahead[:50]))

    def remainder(self):
        """Return (infile, text that was buffered but not yet returned)."""
        unread = self.lookahead[self.pos:]
        return self.infile, unread
341 342 343
class Until(RecordReader):
    """Read a single record: everything up to a line starting with `text`.

    The first call to next() returns the text before the marker line;
    every later call returns None.  remainder() then gives back the
    input file and the unread text, which begins at the marker line.
    """

    def __init__(self, infile, text, sizehint=SIZEHINT, lookahead=""):
        RecordReader.__init__(self, infile)
        self.text = text
        self.lookahead = lookahead
        self.sizehint = sizehint
        self.found = 0

        # Reject an empty marker explicitly: indexing text[-1] below would
        # otherwise fail with an unhelpful IndexError, and StartsWith (used
        # by next()) refuses an empty marker anyway.
        if not text:
            raise AssertionError("Until text size is too short")
        if text[-1] == "\n":
            raise NotImplementedError("Until reader does not support '\\n'")
        if "\n" in text:
            raise AssertionError(
                "'\\n' can only exist at the end of the string")

    def next(self):
        """Return the text before the marker line, or None once it was read."""
        if self.found:
            return None

        # Use the StartsWith reader to get to the end of this record.
        # It requires the data to start with the marker, so prepend a
        # fake marker line and strip it from the result afterwards.
        fake = self.text + "\n"
        reader = StartsWith(self.infile, self.text, self.sizehint,
                            fake + self.lookahead)
        rec = reader.next()[len(fake):]
        self.infile, self.lookahead = reader.remainder()
        self.found = 1
        return rec

    def remainder(self):
        """Return (infile, text that was buffered but not yet returned)."""
        return self.infile, self.lookahead
# Tag the last byte of every newline
# CountLines passes this table to _find_end_positions, which collects
# the end offset of each "newline" tag.
# NOTE(review): the entries look like mx.TextTools' (tagobj, command,
# argument, jump-if-no-match, jump-if-match) form with jumps relative to
# the current entry -- confirm against the mx.TextTools docs.  The jump
# offsets depend on the order of the entries.
_tag_lines_tagtable = (
    # Skip non-newline characters
    (None, TT.AllInSet, TT.invset('\r\n'), +1, +1),

    # Check if newline
    ("newline", TT.Is, '\n', +1, -1),  # can be '\n'
    (None, TT.Is, '\r', +3, +1),       # or start a '\r' followed by ..
    ("newline", TT.Is, '\n', +1, -3),  # .. an optional '\n'
    ("newline", TT.Skip, 0, -4, -4),   # get here with just an '\r'
    (None, TT.EOF, TT.Here, -5, TT.MatchOk), # stop at end of text
    )
class CountLines(RecordReader):
    """Read a specified (fixed) number of lines per record.

    `self.lookahead` is the buffered input, `self.positions` the end
    offsets of the lines found in it, `self.index` the number of those
    lines already handed out and `self.pos` the offset where the next
    record starts.
    """

    def __init__(self, infile, count, sizehint=SIZEHINT, lookahead=""):
        assert count > 0, "CountLines reader must read at least one line"
        # An earlier `assert lookahead > 0` compared the lookahead *string*
        # with an int: always true on Python 2, a TypeError on Python 3.
        # The read size it was meant to guard is checked here.
        assert sizehint > 0, "sizehint must be positive"
        RecordReader.__init__(self, infile)
        self.sizehint = sizehint
        self.lookahead = lookahead
        self.count = count
        self.pos = 0
        self.positions = []
        self.index = 0

    def next(self):
        """Return the next `count` lines as one string, or None at the end.

        Raises ReaderError if data remains but holds fewer than `count`
        newline-terminated lines.
        """
        # Fast path: enough line ends are already known.
        if self.index + self.count < len(self.positions):
            self.index = self.index + self.count
            endpos = self.positions[self.index - 1]
            s = self.lookahead[self.pos:endpos]
            self.pos = endpos
            return s

        # Rescan from the unread text, reading more until there are
        # enough lines or the input runs out.
        lookahead = self.lookahead[self.pos:]
        while 1:
            positions = _find_end_positions(lookahead, _tag_lines_tagtable)
            if len(positions) > self.count:
                # Last line may be incomplete, as with "\r" of "\r\n"
                # Hmm, is this *really* needed?  Doesn't hurt. XXX
                del positions[-1]
                break
            data = self.infile.read(self.sizehint)
            if not data:
                break
            lookahead = lookahead + data

        self.lookahead = lookahead
        self.pos = 0
        self.positions = positions

        if not lookahead:
            return None

        if len(positions) >= self.count:
            self.index = self.count
            endpos = self.positions[self.count - 1]
            s = lookahead[0:endpos]
            self.pos = endpos
            return s

        # A final record that lacks its trailing newline is deliberately
        # rejected: it is more frequent that the line count is wrong.
        # (Could change in future releases.)
        raise ReaderError(
            "Only found %d lines, expecting %d (starting with %s ...)" %
            (len(positions), self.count, repr(lookahead[:20])))

    def remainder(self):
        """Return (infile, text that was buffered but not yet returned)."""
        return self.infile, self.lookahead[self.pos:]
454
class Nothing(RecordReader):
    """Reads nothing: next() never yields a record."""

    def __init__(self, infile, sizehint=SIZEHINT, lookahead=""):
        # `sizehint` is accepted for signature parity with the other
        # readers but is not used.
        RecordReader.__init__(self, infile)
        self.lookahead = lookahead

    def next(self):
        """Always return None."""
        return None

    def remainder(self):
        """Return (infile, lookahead) exactly as they were given."""
        leftover = self.lookahead
        return self.infile, leftover
466
class Everything(RecordReader):
    """Reads everything: the whole remaining input is one record."""

    def __init__(self, infile, sizehint=SIZEHINT, lookahead=""):
        # `sizehint` is accepted for signature parity with the other
        # readers but is not used.
        RecordReader.__init__(self, infile)
        self.lookahead = lookahead
        self.found = 0

    def next(self):
        """Return the lookahead plus the rest of the file once, then None."""
        if not self.found:
            everything = self.lookahead + self.infile.read()
            self.lookahead = ""
            self.found = 1
            return everything
        return None

    def remainder(self):
        """Return (infile, text not yet returned by next())."""
        leftover = self.lookahead
        return self.infile, leftover
484