Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  import re 
 27  from Bio import SeqIO 
 28  from Bio import Alphabet 
 29   
30 -class _IndexedSeqFileDict(dict):
31 """Read only dictionary interface to a sequential sequence file. 32 33 Keeps the keys in memory, reads the file to access entries as 34 SeqRecord objects using Bio.SeqIO for parsing them. This approach 35 is memory limited, but will work even with millions of sequences. 36 37 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys 38 (record identifiers by default) are not allowed. If this happens, 39 a ValueError exception is raised. 40 41 By default the SeqRecord's id string is used as the dictionary 42 key. This can be changed by suppling an optional key_function, 43 a callback function which will be given the record id and must 44 return the desired key. For example, this allows you to parse 45 NCBI style FASTA identifiers, and extract the GI number to use 46 as the dictionary key. 47 48 Note that this dictionary is essentially read only. You cannot 49 add or change values, pop values, nor clear the dictionary. 50 """
51 - def __init__(self, filename, format, alphabet, key_function):
52 #Use key_function=None for default value 53 dict.__init__(self) #init as empty dict! 54 if format in SeqIO._BinaryFormats: 55 mode = "rb" 56 else: 57 mode = "rU" 58 self._handle = open(filename, mode) 59 self._alphabet = alphabet 60 self._format = format 61 self._key_function = key_function 62 if key_function: 63 offset_iter = ((key_function(k),o) for (k,o) in self._build()) 64 else: 65 offset_iter = self._build() 66 for key, offset in offset_iter: 67 if key in self: 68 raise ValueError("Duplicate key '%s'" % key) 69 else: 70 dict.__setitem__(self, key, offset)
71
72 - def _build(self):
73 """Actually scan the file identifying records and offsets (PRIVATE). 74 75 Returns an iterator giving tuples of record names and their offsets. 76 """ 77 pass
78
79 - def __repr__(self):
80 return "SeqIO.index('%s', '%s', alphabet=%s, key_function=%s)" \ 81 % (self._handle.name, self._format, 82 repr(self._alphabet), self._key_function)
83
84 - def __str__(self):
85 if self: 86 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0]) 87 else: 88 return "{}"
89 90 if hasattr(dict, "iteritems"): 91 #Python 2, use iteritems but not items etc
92 - def values(self):
93 """Would be a list of the SeqRecord objects, but not implemented. 94 95 In general you can be indexing very very large files, with millions 96 of sequences. Loading all these into memory at once as SeqRecord 97 objects would (probably) use up all the RAM. Therefore we simply 98 don't support this dictionary method. 99 """ 100 raise NotImplementedError("Due to memory concerns, when indexing a " 101 "sequence file you cannot access all the " 102 "records at once.")
103
104 - def items(self):
105 """Would be a list of the (key, SeqRecord) tuples, but not implemented. 106 107 In general you can be indexing very very large files, with millions 108 of sequences. Loading all these into memory at once as SeqRecord 109 objects would (probably) use up all the RAM. Therefore we simply 110 don't support this dictionary method. 111 """ 112 raise NotImplementedError("Due to memory concerns, when indexing a " 113 "sequence file you cannot access all the " 114 "records at once.")
115
116 - def iteritems(self):
117 """Iterate over the (key, SeqRecord) items.""" 118 for key in self.__iter__(): 119 yield key, self.__getitem__(key)
120 else: 121 #Python 3 - define items and values as iterators
122 - def items(self):
123 """Iterate over the (key, SeqRecord) items.""" 124 for key in self.__iter__(): 125 yield key, self.__getitem__(key)
126
127 - def values(self):
128 """Iterate over the SeqRecord items.""" 129 for key in self.__iter__(): 130 yield self.__getitem__(key)
131 132
133 - def __getitem__(self, key):
134 """x.__getitem__(y) <==> x[y]""" 135 #Should be done by each sub-class 136 raise NotImplementedError("Not implemented for this file format (yet).")
137
138 - def get(self, k, d=None):
139 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 140 try: 141 return self.__getitem__(k) 142 except KeyError: 143 return d
144
145 - def get_raw(self, key):
146 """Similar to the get method, but returns the record as a raw string. 147 148 If the key is not found, a KeyError exception is raised. 149 150 NOTE - This functionality is not supported for every file format. 151 """ 152 #Should be done by each sub-class (if possible) 153 raise NotImplementedError("Not available for this file format.")
154
155 - def __setitem__(self, key, value):
156 """Would allow setting or replacing records, but not implemented.""" 157 raise NotImplementedError("An indexed a sequence file is read only.")
158
159 - def update(self, **kwargs):
160 """Would allow adding more values, but not implemented.""" 161 raise NotImplementedError("An indexed a sequence file is read only.")
162 163
164 - def pop(self, key, default=None):
165 """Would remove specified record, but not implemented.""" 166 raise NotImplementedError("An indexed a sequence file is read only.")
167
168 - def popitem(self):
169 """Would remove and return a SeqRecord, but not implemented.""" 170 raise NotImplementedError("An indexed a sequence file is read only.")
171 172
173 - def clear(self):
174 """Would clear dictionary, but not implemented.""" 175 raise NotImplementedError("An indexed a sequence file is read only.")
176
177 - def fromkeys(self, keys, value=None):
178 """A dictionary method which we don't implement.""" 179 raise NotImplementedError("An indexed a sequence file doesn't " 180 "support this.")
181
182 - def copy(self):
183 """A dictionary method which we don't implement.""" 184 raise NotImplementedError("An indexed a sequence file doesn't " 185 "support this.")
186 187 188 189 #################### 190 # Special indexers # 191 #################### 192 193 # Anything where the records cannot be read simply by parsing from 194 # the record start. For example, anything requiring information from 195 # a file header - e.g. SFF files where we would need to know the 196 # number of flows. 197
class SffDict(_IndexedSeqFileDict) :
    """Indexed dictionary like access to a Standard Flowgram Format (SFF) file."""
    def _build(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            #SFF files contain DNA reads, so default to a DNA alphabet
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        #Record what we'll need for parsing a record given its offset
        header_length, index_offset, index_length, number_of_reads, \
        self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            #There is an index provided, try this the fast way:
            try :
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) :
                    yield name, offset
                #Cross check the number of records indexed against the header
                assert len(self) == number_of_reads, \
                       "Indexed %i records, expected %i" \
                       % (len(self), number_of_reads)
                return
            except ValueError, err :
                import warnings
                warnings.warn("Could not parse the SFF index: %s" % err)
                #NOTE(review): assumes the Roche index parser raises before
                #yielding any entries, so nothing was stored yet - confirm
                assert len(self)==0, "Partially populated index"
                #Rewind so the slow scan below starts from the file header
                handle.seek(0)
        else :
            #TODO - Remove this debug warning?
            import warnings
            warnings.warn("No SFF index, doing it the slow way")
        #Fall back on the slow way!
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) :
            yield name, offset
        assert len(self) == number_of_reads, \
               "Indexed %i records, expected %i" % (len(self), number_of_reads)

    def __getitem__(self, key) :
        """Seek to the record's offset and parse it as a SeqRecord."""
        handle = self._handle
        handle.seek(dict.__getitem__(self, key))
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)
241 242
class SffTrimmedDict(SffDict) :
    """Indexed dictionary like access to an SFF file, giving trimmed records.

    Identical to SffDict except each record is parsed with trim=True.
    """
    def __getitem__(self, key) :
        #Look up the stored file offset, then parse with trimming enabled
        offset = dict.__getitem__(self, key)
        self._handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(self._handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)
253 254 255 ################### 256 # Simple indexers # 257 ################### 258
class SequentialSeqFileDict(_IndexedSeqFileDict):
    """Indexed dictionary like access to most sequential sequence files."""
    def __init__(self, filename, format, alphabet, key_function):
        _IndexedSeqFileDict.__init__(self, filename, format, alphabet, key_function)
        #Load the parser class/function once and avoid the dict lookup in each
        #__getitem__ call:
        i = SeqIO._FormatToIterator[format]
        #The following alphabet code is a bit nasty... duplicates logic in
        #Bio.SeqIO.parse()
        if alphabet is None:
            def _parse():
                """Dynamically generated parser function (PRIVATE)."""
                #Parse a single record starting at the handle's current offset
                return i(self._handle).next()
        else:
            #TODO - Detect alphabet support ONCE at __init__
            def _parse():
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return i(self._handle, alphabet=alphabet).next()
                except TypeError:
                    #This iterator does not accept an alphabet argument;
                    #coerce the record's alphabet after parsing instead
                    return SeqIO._force_alphabet(i(self._handle),
                                                 alphabet).next()
        self._parse = _parse

    def _build(self):
        """Scan the file noting the offset of each record marker (PRIVATE)."""
        handle = self._handle
        #The text which marks the start of a new record for each format
        marker = {"ace" : "CO ",
                  "fasta": ">",
                  "phd" : "BEGIN_SEQUENCE",
                  "pir" : ">..;",
                  "qual": ">",
                  }[self._format]
        marker_re = re.compile("^%s" % marker)
        marker_offset = len(marker)
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line):
                #Here we can assume the record.id is the first word after the
                #marker. This is generally fine... but not for GenBank, EMBL, Swiss
                yield line[marker_offset:].strip().split(None, 1)[0], offset
            elif not line:
                #End of file
                break

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]"""
        handle = self._handle
        handle.seek(dict.__getitem__(self, key))
        record = self._parse()
        #Sanity check that the record parsed at this offset really does
        #correspond to the requested key
        if self._key_function:
            assert self._key_function(record.id) == key, \
                   "Requested key %s, found record.id %s which has key %s" \
                   % (repr(key), repr(record.id),
                      repr(self._key_function(record.id)))
        else:
            assert record.id == key, \
                   "Requested key %s, found record.id %s" \
                   % (repr(key), repr(record.id))
        return record

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string."""
        #For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(dict.__getitem__(self, key))
        data = handle.readline()
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                #End of file, or start of next record => end of this record
                break
            data += line
        return data
335 336 337 ####################################### 338 # Fiddly indexers: GenBank, EMBL, ... # 339 ####################################### 340
class GenBankDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a GenBank file."""
    def _build(self):
        """Scan the file recording each LOCUS line's offset (PRIVATE).

        We cannot simply take the first word of the LOCUS line as the key;
        to mimic the GenBank parser the versioned accession from the
        VERSION line is preferred, falling back on the first entry of the
        ACCESSION line.
        """
        handle = self._handle
        marker_re = re.compile("^LOCUS ")
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            start = handle.tell()
            line = handle.readline()
            if not marker_re.match(line):
                if not line:
                    #End of file
                    break
                continue
            #Found a new record - now hunt through its header for the key
            key = None
            while True:
                line = handle.readline()
                if line.startswith("ACCESSION "):
                    #Provisional key - may be replaced by the VERSION entry
                    key = line.rstrip().split()[1]
                elif line.startswith("VERSION "):
                    version_id = line.rstrip().split()[1]
                    #Only accept it if it looks like accession.version
                    if version_id.count(".") == 1 \
                    and version_id.split(".")[1].isdigit():
                        #This should mimic the GenBank parser...
                        key = version_id
                        break
                elif line.startswith("FEATURES ") \
                or line.startswith("ORIGIN ") \
                or line.startswith("//") \
                or marker_re.match(line) \
                or not line:
                    #Gone past the header (or hit end of file), stop looking
                    break
            if not key:
                raise ValueError("Did not find ACCESSION/VERSION lines")
            yield key, start
379 380
class EmblDict(SequentialSeqFileDict):
    """Indexed dictionary like access to an EMBL file."""
    def _build(self):
        """Scan the file recording each ID line's offset (PRIVATE)."""
        handle = self._handle
        marker_re = re.compile("^ID ")
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line):
                #We cannot assume the record.id is the first word after ID,
                #normally the SV line is used.
                if line[2:].count(";") == 6:
                    #Looks like the semi colon separated style introduced in 2006
                    parts = line[3:].rstrip().split(";")
                    if parts[1].strip().startswith("SV "):
                        #The SV bit gives the version
                        key = "%s.%s" \
                              % (parts[0].strip(), parts[1].strip().split()[1])
                    else:
                        key = parts[0].strip()
                elif line[2:].count(";") == 3:
                    #Looks like the pre 2006 style, take first word only
                    key = line[3:].strip().split(None,1)[0]
                else:
                    raise ValueError('Did not recognise the ID line layout:\n' + line)
                #Old style records can carry the version on a separate SV
                #line further down, which takes precedence over the ID line
                while True:
                    line = handle.readline()
                    if line.startswith("SV "):
                        key = line.rstrip().split()[1]
                        break
                    elif line.startswith("FH ") \
                    or line.startswith("FT ") \
                    or line.startswith("SQ ") \
                    or line.startswith("//") \
                    or marker_re.match(line) \
                    or not line:
                        #Gone past the header (or hit end of file) - no SV line
                        break
                yield key, offset
            elif not line:
                #End of file
                break
423 424
class SwissDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a SwissProt file."""
    def _build(self):
        """Scan the file recording each record's offset (PRIVATE).

        The key is the primary accession taken from the AC line which is
        expected immediately after each ID line (not the ID line itself).
        """
        handle = self._handle
        marker_re = re.compile("^ID ")
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            start = handle.tell()
            line = handle.readline()
            if not line:
                #End of file
                break
            if not marker_re.match(line):
                continue
            #The next line should hold the semi-colon separated accessions;
            #take the first one as the key
            line = handle.readline()
            assert line.startswith("AC ")
            yield line[3:].strip().split(";")[0].strip(), start
444 445
class UniprotDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a UniProt XML file."""
    def _build(self):
        """Scan the XML recording the offset of each <entry> element (PRIVATE)."""
        handle = self._handle
        marker_re = re.compile("^<entry ") #Assumes start of line!
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line):
                #We expect the next line to be <accession>xxx</accession>
                #but allow it to be later on within the <entry>
                key = None
                done = False #NOTE(review): flag appears unused, loop exits via break
                while True:
                    line = handle.readline()
                    if line.startswith("<accession>"):
                        assert "</accession>" in line, line
                        #Take the text between the accession tags as the key
                        key = line[11:].split("<")[0]
                        break
                    elif "</entry>" in line:
                        break
                    elif marker_re.match(line) or not line:
                        #Start of next record or end of file
                        raise ValueError("Didn't find end of record")
                if not key:
                    raise ValueError("Did not find <accession> line")
                yield key, offset
            elif not line:
                #End of file
                break

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string."""
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(dict.__getitem__(self, key))
        data = handle.readline()
        while True:
            line = handle.readline()
            i = line.find("</entry>")
            if i != -1:
                #Keep up to and including the closing tag (8 characters)
                data += line[:i+8]
                break
            if marker_re.match(line) or not line:
                #End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data += line
        return data

    def __getitem__(self, key) :
        #TODO - Can we handle this directly in the parser?
        #This is a hack - use get_raw for <entry>...</entry> and wrap it with
        #the apparently required XML header and footer.
        data = """<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        %s
        </uniprot>
        """ % self.get_raw(key)
        #TODO - For consistency, this function should not accept a string:
        return SeqIO.UniprotIO.UniprotIterator(data).next()
510 511
class IntelliGeneticsDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a IntelliGenetics file."""
    def _build(self):
        """Scan the file recording each record's offset (PRIVATE).

        Records start with one or more comment lines beginning ";"; the
        key is the first word of the first non-comment, non-blank line.

        Raises a ValueError if the file ends inside a comment block (i.e.
        a record header was never followed by an identifier line).
        """
        handle = self._handle
        marker_re = re.compile("^;")
        self._marker_re = marker_re #saved for the get_raw method
        while True:
            offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line):
                #Now look for the first line which doesn't start ";"
                while True:
                    line = handle.readline()
                    #Must test for end of file BEFORE indexing into the line,
                    #otherwise an empty string gives an IndexError instead of
                    #the intended ValueError
                    if not line:
                        raise ValueError("Premature end of file?")
                    if line[0] != ";" and line.strip():
                        key = line.split()[0]
                        yield key, offset
                        break
            elif not line:
                #End of file
                break
534 535
class TabDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a simple tabbed file."""
    def _build(self):
        """Scan the file recording each line's offset (PRIVATE).

        Each non-blank line is one record, keyed on the text before the
        first tab character.
        """
        handle = self._handle
        while True:
            offset = handle.tell()
            line = handle.readline()
            if not line : break #End of file
            if not line.strip():
                #Ignore blank lines. Note the old try/except ValueError
                #around the split was unreachable (split never raises it),
                #so blank lines used to be indexed under a whitespace key.
                continue
            yield line.split("\t")[0], offset

    def get_raw(self, key):
        """Like the get method, but returns the record as a raw string."""
        #Each record is exactly one line, so just read it back
        handle = self._handle
        handle.seek(dict.__getitem__(self, key))
        return handle.readline()
560 561 562 ########################## 563 # Now the FASTQ indexers # 564 ########################## 565
class FastqDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """
    def _build(self):
        """Scan the file recording each record's offset (PRIVATE).

        Tracks the sequence length so a quality line starting "@" can be
        told apart from the title line of the next record.
        """
        handle = self._handle
        pos = handle.tell()
        line = handle.readline()
        if not line:
            #Empty file!
            return
        if line[0] != "@":
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        while line:
            #assert line[0]=="@"
            #This record seems OK (so far)
            yield line[1:].rstrip().split(None, 1)[0], pos
            #Find the seq line(s)
            seq_len = 0
            while line:
                line = handle.readline()
                if line.startswith("+") : break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            #assert line[0]=="+"
            #Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    #Should be end of record... next line is EOF or a new "@"
                    pos = handle.tell()
                    line = handle.readline()
                    if line and line[0] != "@":
                        #Bug fix: the ValueError was previously constructed
                        #but never raised, silently ignoring corrupt files
                        raise ValueError("Problem with line %s" % repr(line))
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string."""
        #TODO - Refactor this and the __init__ method to reduce code duplication?
        handle = self._handle
        handle.seek(dict.__getitem__(self, key))
        line = handle.readline()
        data = line
        if line[0] != "@":
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        identifier = line[1:].rstrip().split(None, 1)[0]
        if self._key_function:
            identifier = self._key_function(identifier)
        if key != identifier:
            raise ValueError("Key did not match")
        #Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith("+") : break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0]=="+"
        #Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                #Should be end of record...
                pos = handle.tell()
                line = handle.readline()
                if line and line[0] != "@":
                    #Bug fix: the ValueError was previously constructed
                    #but never raised, silently ignoring corrupt files
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data
###############################################################################

#Lookup table mapping each supported Bio.SeqIO format name to the indexer
#class used for it. Several text formats share the generic sequential
#indexer; the fiddly formats each have their own subclass above.
_FormatToIndexedDict = {"ace" : SequentialSeqFileDict,
                        "embl" : EmblDict,
                        "fasta" : SequentialSeqFileDict,
                        "fastq" : FastqDict, #Class handles all three variants
                        "fastq-sanger" : FastqDict, #alias of the above
                        "fastq-solexa" : FastqDict,
                        "fastq-illumina" : FastqDict,
                        "genbank" : GenBankDict,
                        "gb" : GenBankDict, #alias of the above
                        "ig" : IntelliGeneticsDict,
                        "imgt" : EmblDict,
                        "phd" : SequentialSeqFileDict,
                        "pir" : SequentialSeqFileDict,
                        "sff" : SffDict,
                        "sff-trim" : SffTrimmedDict,
                        "swiss" : SwissDict,
                        "tab" : TabDict,
                        "qual" : SequentialSeqFileDict,
                        "uniprot-xml" : UniprotDict,
                        }