Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # This code is NOT intended for direct use.  It provides a basic scanner 
   7  # (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
   8  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
   9  # 
  10  # It is used by Bio.GenBank to parse GenBank files 
  11  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  12  # 
  13  # Feature Table Documentation: 
  14  # http://www.insdc.org/files/feature_table.html 
  15  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  16  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  17   
  18  import sys 
  19  from Bio.Seq import Seq 
  20  from Bio.SeqRecord import SeqRecord 
  21  from Bio.Alphabet import generic_alphabet, generic_protein 
  22   
class InsdcScanner:
    """Shared scanning logic for GenBank/EMBL style flat files.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    partners - DDBJ, EMBL and GenBank - all use the same "Feature Table"
    layout in their plain text flat file formats.

    The header and sequence sections of an EMBL file are, however, laid
    out very differently from those produced by GenBank/DDBJ.
    """

    # Placeholder values only: the sub classes redefine every one of these
    # constants with sensible values for their own file format.
    RECORD_START = "XXX"  # start of a record's first line (LOCUS or ID)
    HEADER_WIDTH = 3  # 12 or 5 in the sub classes
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
def __init__(self, debug=0):
    """Check the class constants are self consistent and reset the state.

    debug - stored as self.debug; the scanning methods print progress
            messages when it is non-zero.
    """
    # The record start marker must fill the header column exactly.
    assert len(self.RECORD_START) == self.HEADER_WIDTH
    # Sequence header markers are stored without trailing spaces.
    assert all(marker == marker.rstrip() for marker in self.SEQUENCE_HEADERS)
    # The qualifier spacer must fill the qualifier indent exactly.
    assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT
    self.debug = debug
    self.line = None
49
def set_handle(self, handle):
    """Attach the handle to read from and clear any buffered line."""
    self.handle, self.line = handle, ""
53
def find_start(self):
    """Read lines until the ID/LOCUS line is found, and return it.

    A line left over in self.line is examined before anything is read
    from the handle.  Any preamble (such as the header used by the NCBI
    on *.seq.gz archives), blank lines and "//" record terminators are
    ignored.  The matching line is also stored in self.line.

    Returns None if the end of the file is reached first.
    """
    width = self.HEADER_WIDTH
    while True:
        candidate = self.line
        if candidate:
            # Consume the buffered line so that it is only examined once.
            self.line = ""
        else:
            candidate = self.handle.readline()

        if not candidate:
            if self.debug:
                print("End of file")
            return None

        if candidate[:width] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + candidate)
            self.line = candidate
            return candidate

        # Anything else is skipped; all that remains is debug reporting.
        if self.debug > 1:
            stripped = candidate.rstrip()
            if stripped == "//":
                print("Skipping // marking end of last record")
            elif stripped == "":
                print("Skipping blank line before record")
            else:
                # Ignore any header before the first ID/LOCUS line.
                print("Skipping header line before record:\n" + stripped)
82
def parse_header(self):
    """Return the header as a list of strings (trailing white space removed).

    Assumes the ID/LOCUS line has just been read in (it must be in
    self.line).  Reading stops at the first line which is a feature table
    start marker or which begins with a sequence header; that line is
    stored (right stripped) in self.line and is not part of the result.

    Raises ValueError if the file ends, or a "//" line is found, first.
    """
    width = self.HEADER_WIDTH
    assert self.line[:width] == self.RECORD_START, \
           "Not at start of record"

    collected = []
    while True:
        raw = self.handle.readline()
        if not raw:
            raise ValueError("Premature end of line during sequence data")
        text = raw.rstrip()
        if text in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found header table")
            break
        if text[:width].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if text == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        collected.append(text)
    self.line = text
    return collected
113
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).
    The tuples are built by self.parse_feature().

    Assumes you have already read to the start of the features table,
    i.e. self.line holds one of the FEATURE_START_MARKERS.  If it does
    not, an empty list is returned and nothing is read.

    skip - if true the feature lines are read but discarded, and an
           empty list is returned.

    On return self.line holds the first line after the feature table.

    Raises ValueError if the file ends, or a "//" line is found, inside
    the feature table, or if a line has no feature key where one is
    expected.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER

    def continues_feature(text):
        # readline() returns "" at the end of the file.  That must end the
        # feature: an empty string also looks like a "blank line", and
        # treating it as one would loop forever on a truncated file.
        if not text:
            return False
        # Qualifier/continuation line, or a blank line in the midst of a
        # feature (tolerated whether or not the features are skipped).
        return text[:indent] == spacer or text.rstrip() == ""

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        feature_key = line[2:indent].strip()
        if feature_key == "":
            raise ValueError("Expected a feature qualifier in line '%s'" % line)

        if skip:
            line = self.handle.readline()
            while continues_feature(line):
                line = self.handle.readline()
        else:
            # Build up a list of the lines making up this feature:
            feature_lines = [line[indent:]]
            line = self.handle.readline()
            while continues_feature(line):
                feature_lines.append(line[indent:].rstrip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
165
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR"
                         /codon_start=1
                         /pseudo

    the input should be feature_key="CDS" and the rest of the data as a
    list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "/pseudo"]
    where the leading spaces and trailing newlines have been removed.
    Empty strings in lines are ignored.

    Returns a tuple (key as string, location string, qualifiers as list),
    which for this example is:

    key = "CDS"
    location = "complement(join(490883..490885,1..879))"
    qualifiers = [('locus_tag', '"NEQ001"'),
                  ('note', '"conserved hypothetical [...];\nCOG1583:..."'),
                  ('codon_start', '1'),
                  ('pseudo', None)]

    A location ending with a comma is continued on the following line(s).
    A quoted value runs on, with new line characters between the lines,
    until a line ending with a double quote; the quotes are NOT removed.
    A qualifier with no "=" has the value None, and one with nothing
    after the "=" has the value "".  A line not starting with "/" is
    appended (after a new line) to the previous unquoted value.

    Raises ValueError if the lines run out part way through the location
    or a quoted value.
    """
    # Skip any blank lines
    iterator = iter([entry for entry in lines if entry])
    try:
        feature_location = next(iterator).strip()
        while feature_location[-1:] == ",":
            # Multiline location, still more to come!
            feature_location += next(iterator).strip()

        qualifiers = []
        key = None
        for line in iterator:
            if line[0] != "/":
                # Unquoted continuation
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
                continue

            # New qualifier
            i = line.find("=")
            if i == -1:
                # Qualifier with no value, e.g. /pseudo
                key = line[1:]
                qualifiers.append((key, None))
                continue
            key = line[1:i]
            value = line[i + 1:]
            if not value:
                # Nothing after the "=", e.g. "/note=".  Indexing value[0]
                # here would raise IndexError, so record it as empty.
                qualifiers.append((key, value))
            elif value == '"':
                # A single double quote on its own is taken as the
                # complete value.
                if self.debug:
                    print("Quoted line %s:%s" % (key, value))
                # DO NOT remove the quotes...
                qualifiers.append((key, value))
            elif value[0] == '"':
                # Quoted; keep reading until a line ends with the closing
                # quote.  The pieces are collected in a list and joined
                # once, so a long value is not re-copied for every line.
                parts = [value]
                while parts[-1][-1] != '"':
                    parts.append(next(iterator))
                # DO NOT remove the quotes...
                qualifiers.append((key, "\n".join(parts)))
            else:
                # Unquoted
                qualifiers.append((key, value))
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines in the middle of the location or a quoted value
        raise ValueError("Problem with '%s' feature:\n%s" \
                          % (feature_key, "\n".join(lines)))
268 289
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer.

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
298
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer.

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
307 308
def _feed_feature_table(self, consumer, feature_tuples):
    """Pass the feature table (list of tuples) on to the consumer.

    feature_tuples holds (key, location, qualifiers) tuples, where
    qualifiers is a list of (name, value) pairs.  New line characters in
    a qualifier value are replaced with spaces; a value of None means
    only the qualifier name is passed on.

    Used by the parse_records() and parse() methods.
    """
    consumer.start_feature_table()
    for key, location, qualifier_pairs in feature_tuples:
        consumer.feature_key(key)
        consumer.location(location)
        for name, value in qualifier_pairs:
            # The qualifier name is handed over as a one element list.
            consumer.feature_qualifier_name([name])
            if value is None:
                continue
            consumer.feature_qualifier_description(" ".join(value.split("\n")))
322
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings).

    Does nothing here; this should be implemented by the EMBL / GenBank
    specific subclass, passing data to the consumer.

    Used by the parse_records() and parse() methods.
    """
331
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is then set to None)
    """
    # Should work with both EMBL and GenBank files provided the
    # equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # The scanning methods break the file into a simplified format.  The
    # first line, the header lines and any misc lines after the features
    # are dealt with by the GenBank / EMBL specific derived classes.

    # First line and header:
    self._feed_first_line(consumer, self.line)
    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    # Features (common to both EMBL and GenBank); when they are not
    # wanted they are still read, but the data is ignored.
    feature_tuples = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_tuples)

    # Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    # Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    # And we are done
    return True
381
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    handle - A handle with the information to parse.
    do_features - Boolean, passed on to feed(); when false the feature
                  table is read but not turned into SeqFeatures.

    Returns the consumer's data for the record, or None if feed() did
    not find a record.

    See also the method parse_records() for use on multi-record files.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    consumer = _FeatureConsumer(use_fuzziness = 1,
                                feature_cleaner = FeatureValueCleaner())

    # do_features has to be handed on explicitly; without it feed() falls
    # back to its default and always parses the features.
    if self.feed(handle, consumer, do_features):
        return consumer.data
    return None
397 398
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    handle - A handle with the information to parse.
    do_features - Boolean, passed on to parse() for every record.

    This method is intended for use in Bio.SeqIO
    """
    # This is a generator function
    while True:
        # do_features has to be handed on explicitly; without it parse()
        # falls back to its default and always parses the features.
        record = self.parse(handle, do_features)
        if record is None:
            break
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
416
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id','locus_tag','product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    handle   - A handle with the information to parse.
    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    The feature's location string (with spaces removed) is stored in the
    annotations as 'raw_location'.  A translation qualifier becomes the
    record's sequence, db_xref values are appended to record.dbxrefs and
    every other qualifier goes into the annotations (repeated qualifiers
    are joined with a space).  Surrounding double quotes are removed
    from the values.

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # Got an EMBL or GenBank record...
        self.parse_header()  # ignore header lines!
        feature_tuples = self.parse_features()
        # Ignore the footer and sequence: read up to the "//" line (or the
        # end of the file).  readline() is used throughout rather than
        # iterating over the handle, since iterating a Python 2 file
        # object reads ahead and the later readline() calls made by
        # find_start() could then miss lines.  It also means "line" is
        # always defined, even when there is nothing left to read.
        while True:
            line = self.handle.readline()
            if not line or line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue

            # Create SeqRecord
            # ================
            # SeqRecord objects cannot be created with annotations, they
            # must be added afterwards.  So create an empty record and
            # then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            # Should we add a location object to the annotations?
            # I *think* that only makes sense for SeqFeatures with their
            # sub features...
            annotations['raw_location'] = location_string.replace(' ','')

            for (qualifier_name, qualifier_data) in qualifiers:
                if qualifier_data is not None \
                and qualifier_data[:1] == '"' and qualifier_data[-1:] == '"':
                    # Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    if qualifier_data is not None:
                        record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
                elif qualifier_name == "db_xref":
                    # its a list, possibly empty.  Its safe to extend
                    if qualifier_data is not None:
                        record.dbxrefs.append(qualifier_data)
                elif qualifier_data is None:
                    # Qualifier without a value (e.g. /pseudo): note that
                    # it is present, without disturbing any existing text.
                    annotations.setdefault(qualifier_name, None)
                else:
                    # Join up multi-line values, squeezing the double
                    # spaces this leaves behind.
                    text = qualifier_data.replace("\n"," ").replace("  "," ")
                    previous = annotations.get(qualifier_name)
                    if previous is None:
                        # Not an addition to existing data, its the first bit
                        annotations[qualifier_name] = text
                    else:
                        annotations[qualifier_name] = previous + " " + text

            # Fill in the ID, Name, Description
            # =================================
            try:
                record.id = annotations[tags2id[0]]
            except KeyError:
                pass
            try:
                record.name = annotations[tags2id[1]]
            except KeyError:
                pass
            try:
                record.description = annotations[tags2id[2]]
            except KeyError:
                pass

            yield record
493
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files"""

    # The padded markers are built with ljust() so that their widths
    # always agree with the column constants: InsdcScanner.__init__
    # asserts that len(RECORD_START) == HEADER_WIDTH.
    HEADER_WIDTH = 5
    RECORD_START = "ID".ljust(HEADER_WIDTH)
    FEATURE_QUALIFIER_INDENT = 21
    # "Location/Qualifiers" sits in the column where the feature locations
    # and qualifiers start; a bare "FH" line is also accepted.
    FEATURE_START_MARKERS = ["FH   Key".ljust(FEATURE_QUALIFIER_INDENT)
                             + "Location/Qualifiers",
                             "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
    SEQUENCE_HEADERS = ["SQ"]  # Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Pass an EMBL ID line on to the handler for its layout.

    The layout is picked from the number of semi colons after the line
    type column: six for the semi colon separated style introduced in
    2006, three for the pre 2006 style.

    Raises ValueError for any other number of semi colons.
    """
    width = self.HEADER_WIDTH
    assert line[:width].rstrip() == "ID"
    semi_colons = line[width:].count(";")
    if semi_colons == 6:
        # Looks like the semi colon separated style introduced in 2006
        handler = self._feed_first_line_new
    elif semi_colons == 3:
        # Looks like the pre 2006 style
        handler = self._feed_first_line_old
    else:
        raise ValueError('Did not recognise the ID line layout:\n' + line)
    handler(consumer, line)
549
def _feed_first_line_old(self, consumer, line):
    """Handle an ID line in the style used before 2006.

    For example:
    ID   SC10H5 standard; DNA; PRO; 4870 BP.
    ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.

    The locus, residue type and data file division are passed to the
    consumer, and the length text is handed to self._feed_seq_length().
    """
    width = self.HEADER_WIDTH
    assert line[:width].rstrip() == "ID"
    # The tokens represent:
    #    0. Primary accession number
    #    (space sep)
    #    1. ??? (e.g. standard)
    #    (semi-colon)
    #    2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    #    3. Taxonomic division (e.g. 'PRO')
    #    4. Sequence length (e.g. '4639675 BP.')
    name_and_rest = line[width:].split(None, 1)
    tokens = [name_and_rest[0]]
    tokens += name_and_rest[1].split(";")
    tokens = [token.strip() for token in tokens]
    consumer.locus(tokens[0])  # Should we also call the accession consumer?
    consumer.residue_type(tokens[2])
    consumer.data_file_division(tokens[3])
    self._feed_seq_length(consumer, tokens[4])
572
def _feed_first_line_new(self, consumer, line):
    """Handle an ID line in the semi colon separated style introduced in 2006.

    For example:
    ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.

    The accession is passed to the consumer as both the locus and the
    accession, followed by the version suffix (only when the second field
    is "SV" and a number), the residue type (topology and molecule type
    joined by a space) and the data file division.  The length text is
    handed to self._feed_seq_length().
    """
    width = self.HEADER_WIDTH
    assert line[:width].rstrip() == "ID"
    # The tokens represent:
    #    0. Primary accession number
    #    1. Sequence version number
    #    2. Topology: 'circular' or 'linear'
    #    3. Molecule type (e.g. 'genomic DNA')
    #    4. Data class (e.g. 'STD')
    #    5. Taxonomic division (e.g. 'PRO')
    #    6. Sequence length (e.g. '4639675 BP.')
    tokens = [token.strip() for token in line[width:].strip().split(";")]
    assert len(tokens) == 7

    consumer.locus(tokens[0])

    # Call the accession consumer now, to make sure we record
    # something as the record.id, in case there is no AC line
    consumer.accession(tokens[0])

    # TODO - How to deal with the version field?  At the moment the consumer
    # will try and use this for the ID which isn't ideal for EMBL files.
    version = tokens[1].split()
    if len(version) == 2 and version[0] == "SV":
        if version[1].isdigit():
            consumer.version_suffix(version[1])

    # Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join(tokens[2:4]))  # TODO - Store as two fields?

    # TODO - What should we do with the data class (tokens[4])?

    consumer.data_file_division(tokens[5])

    self._feed_seq_length(consumer, tokens[6])
613
def _feed_seq_length(self, consumer, text):
    """Pass the number from length text such as "4870 BP." to consumer.size().

    The text must be two white space separated words, the second being
    "BP" or "BP." in any case; this is checked with assertions.
    """
    words = text.split()
    assert len(words) == 2
    assert words[1].upper() in ("BP", "BP.")
    consumer.size(words[0])
619
def _feed_header_lines(self, consumer, lines):
    """Handle the EMBL header lines (list of strings), passing data to the consumer.

    Each non-empty line is split into its line type (the first
    HEADER_WIDTH characters) and its data, both stripped of white space:

    XX, RX and DR lines are ignored.
    RN - passed to consumer.reference_num() without any surrounding
         square brackets, e.g. '[1]' becomes '1'.
    RP - passed to consumer.reference_bases() reformatted in the GenBank
         style, e.g. '1-4639675' becomes '(bases 1 to 4639675)' and
         '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'.
    CC - passed to consumer.comment() as a one element list.
    AC, SV, DE, RA, RT, RL, OS and OC - passed to the consumer method
         named in consumer_dict below.

    Any other line is ignored (and reported if self.debug is set).
    """
    EMBL_INDENT = self.HEADER_WIDTH
    consumer_dict = {
        'AC' : 'accession',
        'SV' : 'version', # SV line removed in June 2006, now part of ID line
        'DE' : 'definition',
        #'RN' : 'reference_num',
        #'RP' : 'reference_bases',
        #'RX' : reference cross reference... DOI or Pubmed
        'RA' : 'authors',
        'RT' : 'title',
        'RL' : 'journal',
        'OS' : 'organism',
        'OC' : 'taxonomy',
        #'DR' : data reference?
        'CC' : 'comment',
        #'XX' : splitter
    }
    for line in lines:
        if not line:
            continue  # empty entries carry no information
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()

        if line_type == 'XX':
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'.  startswith/endswith are used so
            # that an RN line with no data cannot raise IndexError.
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference positions for the GenBank based consumer.
            # Several comma separated ranges are joined with semi colons.
            ranges = [bases.replace("-", " to ").strip()
                      for bases in data.split(",") if bases.strip()]
            consumer.reference_bases("(bases " + "; ".join(ranges) + ")")
        elif line_type == 'RX':
            # TODO - I have seen both DOI and PubMed reference cross references
            # The GenBank based consumer and Reference class may need extending here.
            pass
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # TODO - Data reference...
            pass
        elif line_type in consumer_dict:
            # Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
683
def _feed_misc_lines(self, consumer, lines):
    """Handle the lines between the features and the sequence (does nothing).

    TODO - Should we do something with the information on the SQ line(s)?
    """
687
class GenBankScanner(InsdcScanner):
    """For extracting chunks of information in GenBank files"""

    # The padded markers are built with ljust() so that their widths
    # always agree with the column constants: InsdcScanner.__init__
    # asserts that len(RECORD_START) == HEADER_WIDTH.
    HEADER_WIDTH = 12
    RECORD_START = "LOCUS".ljust(HEADER_WIDTH)
    FEATURE_QUALIFIER_INDENT = 21
    # "Location/Qualifiers" sits in the column where the feature locations
    # and qualifiers start; a bare "FEATURES" line is also accepted.
    FEATURE_START_MARKERS = ["FEATURES".ljust(FEATURE_QUALIFIER_INDENT)
                             + "Location/Qualifiers",
                             "FEATURES"]
    FEATURE_END_MARKERS = []
    FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
    SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT"]  # trailing spaces removed
def _feed_first_line(self, consumer, line):
    """Handle the GenBank LOCUS line, passing its fields to the consumer.

    Four layouts are recognised, tried in this order:

    1. Old fixed column layout, with ' bp ' or ' aa ' at columns 29:33.
    2. New fixed column layout, with ' bp ' or ' aa ' at columns 40:44.
    3. Truncated line holding at most a locus name, as produced by some
       EMBOSS tools (see bug 1762).  Only the locus is passed on; a
       warning is written to stderr if even that is missing.
    4. White space separated line whose fourth word is "aa" or "bp", as
       output by EMBOSS seqret when the locus id overflows its column.
       Only the locus and size are passed on, with a warning on stderr.

    For the fixed column layouts the locus, size, residue type, data
    file division and date are passed to the consumer, after the column
    positions have been checked with assertions.

    Raises ValueError if the line matches none of these layouts.
    """
    #####################################
    # LOCUS line                        #
    #####################################
    GENBANK_INDENT = self.HEADER_WIDTH
    # Padded literals are built from their widths rather than typed as
    # runs of spaces, so each one is exactly as wide as the slice it is
    # compared with.
    assert line[0:GENBANK_INDENT] == 'LOCUS'.ljust(GENBANK_INDENT), \
           'LOCUS line does not start correctly:\n' + line

    def feed_name_and_length(text):
        # The name is left justified and the length right justified in the
        # same region, so splitting on white space separates them.
        name_and_length = text.split()
        assert len(name_and_length) <= 2, \
               'Cannot parse the name and length in the LOCUS line:\n' + line
        assert len(name_and_length) == 2, \
               'Name and length collide in the LOCUS line:\n' + line
        # Should be possible to split them based on position, if
        # a clear definition of the standard exists THAT AGREES with
        # existing files.
        consumer.locus(name_and_length[0])
        consumer.size(name_and_length[1])

    def warn(message):
        sys.stderr.write(message + line + "\n")

    # Have to break up the locus line, and handle the different bits of it.
    # There are at least two different versions of the locus line...
    if line[29:33] in [' bp ', ' aa ']:
        # Old...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:29      Length of sequence, right-justified
        #    29:33      space, bp, space
        #    33:41      strand type
        #    41:42      space
        #    42:51      Blank (implies linear), linear or circular
        #    51:52      space
        #    52:55      The division code (e.g. BCT, VRL, INV)
        #    55:62      space
        #    62:73      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[41:42] == ' ', \
               'LOCUS line does not contain space at position 42:\n' + line
        assert line[42:51].strip() in ['', 'linear', 'circular'], \
               'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[51:52] == ' ', \
               'LOCUS line does not contain space at position 52:\n' + line
        assert line[55:62] == ' ' * 7, \
               'LOCUS line does not contain spaces from position 56 to 62:\n' + line
        assert line[64:65] == '-', \
               'LOCUS line does not contain - at position 65 in date:\n' + line
        assert line[68:69] == '-', \
               'LOCUS line does not contain - at position 69 in date:\n' + line

        feed_name_and_length(line[GENBANK_INDENT:29])

        if line[33:51].strip() == "" and line[29:33] == ' aa ':
            # Amino acids -> protein (even if there is no residue type given)
            # We want to use a protein alphabet in this case, rather than a
            # generic one.
            consumer.residue_type("PROTEIN")
        else:
            consumer.residue_type(line[33:51].strip())

        consumer.data_file_division(line[52:55])
        consumer.date(line[62:73])
    elif line[40:44] in [' bp ', ' aa ']:
        # New...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:40      Length of sequence, right-justified
        #    40:44      space, bp, space
        #    44:47      Blank, ss-, ds-, ms-
        #    47:54      Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA
        #    54:55      space
        #    55:63      Blank (implies linear), linear or circular
        #    63:64      space
        #    64:67      The division code (e.g. BCT, VRL, INV)
        #    67:68      space
        #    68:79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[44:47] in [' ' * 3, 'ss-', 'ds-', 'ms-'], \
               'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
               or line[47:54].strip().find('DNA') != -1 \
               or line[47:54].strip().find('RNA') != -1, \
               'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
               'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
               'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
               'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
               'LOCUS line does not contain space at position 68:\n' + line
        assert line[70:71] == '-', \
               'LOCUS line does not contain - at position 71 in date:\n' + line
        assert line[74:75] == '-', \
               'LOCUS line does not contain - at position 75 in date:\n' + line

        feed_name_and_length(line[GENBANK_INDENT:40])

        if line[44:54].strip() == "" and line[40:44] == ' aa ':
            # Amino acids -> protein (even if there is no residue type given)
            # We want to use a protein alphabet in this case, rather than a
            # generic one.
            consumer.residue_type(("PROTEIN " + line[54:63]).strip())
        else:
            consumer.residue_type(line[44:63].strip())

        consumer.data_file_division(line[64:67])
        consumer.date(line[68:79])
    elif line[GENBANK_INDENT:].strip().count(" ") == 0:
        # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
        #
        # i.e. just "LOCUS" followed by a locus name such as U00096,
        # rather than the full line giving the length, type, topology
        # and division as well.
        if line[GENBANK_INDENT:].strip() != "":
            consumer.locus(line[GENBANK_INDENT:].strip())
        else:
            # Must just have just "LOCUS       ", is this even legitimate?
            # We should be able to continue parsing... we need real world testcases!
            warn("Warning: Minimal LOCUS line found - is this correct?\n")
    elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
        # Cope with EMBOSS seqret output where it seems the locus id can cause
        # the other fields to overflow.  We just IGNORE the other fields!
        consumer.locus(line.split()[1])
        consumer.size(line.split()[2])
        warn("Warning: Malformed LOCUS line found - is this correct?\n")
    else:
        raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
908 909
def _feed_header_lines(self, consumer, lines):
    """Feed GenBank header lines to the consumer, one call per entry.

    Arguments:
     - consumer - event consumer object; the method named after each
       header entry is looked up on it and called with the entry text.
     - lines - list of header line strings.  Empty strings are dropped
       and the list itself is left unmodified.

    A line is split at self.HEADER_WIDTH into a keyword and its data.
    Lines starting with HEADER_WIDTH spaces are continuation lines of
    the preceding entry.  For the keywords listed in consumer_dict the
    continuation text is joined on with single spaces and the mapped
    consumer method is called once.  These keywords are special cases:

     - VERSION   -> consumer.version(), plus consumer.gi() when the
                    data contains a " GI:" field
     - REFERENCE -> consumer.reference_num(), plus
                    consumer.reference_bases() when text follows the
                    number (may span several lines, see Bug 1968)
     - ORGANISM  -> consumer.organism() for the first line, then
                    consumer.taxonomy() for the continuation lines
     - COMMENT   -> consumer.comment(), called once with a list
                    holding one string per line

    Any other keyword (LOCUS included - it is not dealt with here) is
    skipped, with a message on stdout when self.debug is set.

    Raises ValueError("Problem in header") if the lines run out while
    more input is still expected.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    #Maps GenBank keywords to the consumer method taking the entry text
    consumer_dict = {
        'DEFINITION': 'definition',
        'ACCESSION': 'accession',
        'NID': 'nid',
        'PID': 'pid',
        'DBSOURCE': 'db_source',
        'KEYWORDS': 'keywords',
        'SEGMENT': 'segment',
        'SOURCE': 'source',
        'AUTHORS': 'authors',
        'CONSRTM': 'consrtm',
        'PROJECT': 'project',
        'DBLINK': 'dblink',
        'TITLE': 'title',
        'JOURNAL': 'journal',
        'MEDLINE': 'medline_id',
        'PUBMED': 'pubmed_id',
        'REMARK': 'remark',
    }

    def say(message):
        #Writes what the print statement would, but is also valid Python 3
        sys.stdout.write(message + "\n")

    def collapse_spaces(text):
        #Squeeze every run of spaces down to a single space
        while '  ' in text:
            text = text.replace('  ', ' ')
        return text

    #Build a real list (filter() is lazy on Python 3) so that the
    #sentinel can be appended without touching the caller's list.
    pending = [entry for entry in lines if entry]
    pending.append("")  #sentinel - marks the end of the header
    line_iter = iter(pending)

    def read_continuation():
        #Returns (list of continuation texts, first line after them)
        extra = []
        while True:
            following = next(line_iter)
            if following[:GENBANK_INDENT] != GENBANK_SPACER:
                return extra, following
            extra.append(following[GENBANK_INDENT:])

    try:
        line = next(line_iter)
        while line:
            line_type = line[:GENBANK_INDENT].strip()
            data = line[GENBANK_INDENT:].strip()

            if line_type == 'VERSION':
                #e.g.
                # VERSION     AC007323.5  GI:6587720
                data = collapse_spaces(data)
                if ' GI:' not in data:
                    consumer.version(data)
                else:
                    fields = data.split(' GI:')
                    if self.debug:
                        say("Version [" + fields[0] + "], gi ["
                            + fields[1] + "]")
                    consumer.version(fields[0])
                    consumer.gi(fields[1])
                line = next(line_iter)
            elif line_type == 'REFERENCE':
                #e.g.
                # REFERENCE   1  (bases 1 to 86436)
                #The consumer is called once even for a multiline entry.
                if self.debug > 1:
                    say("Found reference [" + data + "]")
                extra, line = read_continuation()
                for text in extra:
                    data += " " + text
                    if self.debug > 1:
                        say("Extended reference text [" + data + "]")
                data = collapse_spaces(data)
                fields = data.split(' ', 1)
                if len(fields) == 1:
                    if self.debug > 2:
                        say('Reference number "' + data + '"')
                    consumer.reference_num(data)
                else:
                    if self.debug > 2:
                        say('Reference number "' + fields[0] + '", "'
                            + fields[1] + '"')
                    consumer.reference_num(fields[0])
                    consumer.reference_bases(fields[1])
            elif line_type == 'ORGANISM':
                #First line is the organism, the rest is the taxonomy
                consumer.organism(data)
                extra, line = read_continuation()
                taxonomy = "".join([' ' + text for text in extra]).strip()
                if not taxonomy and self.debug > 1:
                    say("Taxonomy line(s) missing or blank")
                consumer.taxonomy(taxonomy)
            elif line_type == 'COMMENT':
                if self.debug > 1:
                    say("Found comment")
                extra, line = read_continuation()
                if self.debug > 2:
                    for text in extra:
                        say("Comment continuation [" + text + "]")
                consumer.comment([data] + extra)
            elif line_type in consumer_dict:
                #Semi-automatic entry, possibly spanning several lines
                extra, line = read_continuation()
                data = ' '.join([data] + extra)
                getattr(consumer, consumer_dict[line_type])(data)
            else:
                if self.debug:
                    #Plain concatenation: the text has no %s placeholder,
                    #so applying the % operator to it raised a TypeError.
                    say("Ignoring GenBank header line:\n" + line)
                line = next(line_iter)
    except StopIteration:
        raise ValueError("Problem in header")
1052
def _feed_misc_lines(self, consumer, lines):
    """Feed the lines between the features and the sequence to the consumer.

    Arguments:
     - consumer - event consumer object.
     - lines - sequence of line strings; it is left unmodified.

    Three kinds of line are acted on, all others are ignored:

     - "BASE COUNT..." -> consumer.base_count() with the rest of the line
     - "ORIGIN..."     -> consumer.origin_name() with the rest of the line
     - "CONTIG..."     -> consumer.contig_location(), called once with
                          the rest of the line, a newline, and then any
                          continuation lines

    For BASE COUNT and ORIGIN the consumer is only called when there is
    some text after the keyword.

    Raises ValueError if a CONTIG entry is followed by a line which is
    neither empty nor indented by self.HEADER_WIDTH spaces, or if the
    lines run out unexpectedly.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    #Iterate over a copy with an empty sentinel line on the end, rather
    #than appending the sentinel to the list owned by the caller.
    line_iter = iter(list(lines) + [""])
    try:
        for line in line_iter:
            #The payload is kept in its own variable and the tests are
            #exclusive, so e.g. an ORIGIN name which happens to start
            #with "CONTIG" is not mistaken for a CONTIG entry.
            if line.startswith('BASE COUNT'):
                payload = line[10:].strip()
                if payload:
                    if self.debug:
                        sys.stdout.write("base_count = " + payload + "\n")
                    consumer.base_count(payload)
            elif line.startswith("ORIGIN"):
                payload = line[6:].strip()
                if payload:
                    if self.debug:
                        sys.stdout.write("origin_name = " + payload + "\n")
                    consumer.origin_name(payload)
            elif line.startswith("CONTIG"):
                contig_location = line[6:].strip() + '\n'
                while True:
                    line = next(line_iter)
                    if not line:
                        #Empty line (or the sentinel) ends the entry
                        break
                    if line[:GENBANK_INDENT] != GENBANK_SPACER:
                        raise ValueError('Expected CONTIG continuation line, got:\n' + line)
                    #The leading indent is kept, only trailing
                    #whitespace is removed.
                    contig_location += line.rstrip()
                consumer.contig_location(contig_location)
    except StopIteration:
        raise ValueError("Problem in misc lines before sequence")
1086 1087 if __name__ == "__main__" : 1088 from StringIO import StringIO 1089 1090 gbk_example = \ 1091 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1092 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1093 (AXL2) and Rev7p (REV7) genes, complete cds. 1094 ACCESSION U49845 1095 VERSION U49845.1 GI:1293613 1096 KEYWORDS . 1097 SOURCE Saccharomyces cerevisiae (baker's yeast) 1098 ORGANISM Saccharomyces cerevisiae 1099 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1100 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1101 REFERENCE 1 (bases 1 to 5028) 1102 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1103 TITLE Cloning and sequence of REV7, a gene whose function is required for 1104 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1105 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1106 PUBMED 7871890 1107 REFERENCE 2 (bases 1 to 5028) 1108 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1109 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1110 plasma membrane glycoprotein 1111 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1112 PUBMED 8846915 1113 REFERENCE 3 (bases 1 to 5028) 1114 AUTHORS Roemer,T. 1115 TITLE Direct Submission 1116 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1117 Haven, CT, USA 1118 FEATURES Location/Qualifiers 1119 source 1..5028 1120 /organism="Saccharomyces cerevisiae" 1121 /db_xref="taxon:4932" 1122 /chromosome="IX" 1123 /map="9" 1124 CDS <1..206 1125 /codon_start=3 1126 /product="TCP1-beta" 1127 /protein_id="AAA98665.1" 1128 /db_xref="GI:1293614" 1129 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1130 AEVLLRVDNIIRARPRTANRQHM" 1131 gene 687..3158 1132 /gene="AXL2" 1133 CDS 687..3158 1134 /gene="AXL2" 1135 /note="plasma membrane glycoprotein" 1136 /codon_start=1 1137 /function="required for axial budding pattern of S. 
1138 cerevisiae" 1139 /product="Axl2p" 1140 /protein_id="AAA98666.1" 1141 /db_xref="GI:1293615" 1142 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1143 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1144 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1145 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1146 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1147 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1148 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1149 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1150 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1151 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1152 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1153 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1154 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1155 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1156 VDFSNKSNVNVGQVKDIHGRIPEML" 1157 gene complement(3300..4037) 1158 /gene="REV7" 1159 CDS complement(3300..4037) 1160 /gene="REV7" 1161 /codon_start=1 1162 /product="Rev7p" 1163 /protein_id="AAA98667.1" 1164 /db_xref="GI:1293616" 1165 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1166 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1167 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1168 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1169 LISGDDKILNGVYSQYEEGESIFGSLF" 1170 ORIGIN 1171 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1172 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1173 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1174 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1175 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1176 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1177 361 attttggcaa 
cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1178 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1179 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1180 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1181 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1182 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1183 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1184 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1185 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1186 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1187 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1188 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1189 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1190 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1191 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1192 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1193 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1194 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1195 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1196 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1197 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1198 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1199 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1200 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1201 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1202 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1203 1921 agaatttcga caagctttca ttaggtttga aagcgaacca 
aggttcacaa tctcaagagc 1204 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1205 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1206 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1207 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1208 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1209 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1210 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1211 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1212 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1213 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1214 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1215 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1216 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1217 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1218 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1219 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1220 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1221 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1222 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1223 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1224 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1225 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1226 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1227 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1228 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1229 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 
1230 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1231 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1232 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1233 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1234 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1235 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1236 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1237 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1238 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1239 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1240 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1241 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1242 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1243 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1244 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1245 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1246 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1247 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1248 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1249 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1250 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1251 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1252 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1253 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1254 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1255 //""" 1256 1257 # GenBank format protein (aka GenPept) file from: 1258 # 
http://www.molecularevolution.org/resources/fileformats/ 1259 gbk_example2 = \ 1260 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1261 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1262 ACCESSION AAD51968 1263 VERSION AAD51968.1 GI:5805369 1264 DBSOURCE locus AF171097 accession AF171097.1 1265 KEYWORDS . 1266 SOURCE Yersinia enterocolitica 1267 ORGANISM Yersinia enterocolitica 1268 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1269 Enterobacteriaceae; Yersinia. 1270 REFERENCE 1 (residues 1 to 143) 1271 AUTHORS Revell,P.A. and Miller,V.L. 1272 TITLE A chromosomally encoded regulator is required for expression of the 1273 Yersinia enterocolitica inv gene and for virulence 1274 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1275 MEDLINE 20138369 1276 PUBMED 10672189 1277 REFERENCE 2 (residues 1 to 143) 1278 AUTHORS Revell,P.A. and Miller,V.L. 1279 TITLE Direct Submission 1280 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1281 University School of Medicine, Campus Box 8230, 660 South Euclid, 1282 St. Louis, MO 63110, USA 1283 COMMENT Method: conceptual translation. 1284 FEATURES Location/Qualifiers 1285 source 1..143 1286 /organism="Yersinia enterocolitica" 1287 /mol_type="unassigned DNA" 1288 /strain="JB580v" 1289 /serotype="O:8" 1290 /db_xref="taxon:630" 1291 Protein 1..143 1292 /product="transcriptional regulator RovA" 1293 /name="regulates inv expression" 1294 CDS 1..143 1295 /gene="rovA" 1296 /coded_by="AF171097.1:380..811" 1297 /note="regulator of virulence" 1298 /transl_table=11 1299 ORIGIN 1300 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1301 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1302 121 deiellsgli dklerniiql qsk 1303 // 1304 """ 1305 1306 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1307 XX 1308 AC X56734; S46826; 1309 XX 1310 DT 12-SEP-1991 (Rel. 29, Created) 1311 DT 25-NOV-2005 (Rel. 
85, Last updated, Version 11) 1312 XX 1313 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1314 XX 1315 KW beta-glucosidase. 1316 XX 1317 OS Trifolium repens (white clover) 1318 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1319 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1320 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1321 XX 1322 RN [5] 1323 RP 1-1859 1324 RX PUBMED; 1907511. 1325 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1326 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1327 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1328 RL Plant Mol. Biol. 17(2):209-219(1991). 1329 XX 1330 RN [6] 1331 RP 1-1859 1332 RA Hughes M.A.; 1333 RT ; 1334 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1335 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1336 RL Upon Tyne, NE2 4HH, UK 1337 XX 1338 FH Key Location/Qualifiers 1339 FH 1340 FT source 1..1859 1341 FT /organism="Trifolium repens" 1342 FT /mol_type="mRNA" 1343 FT /clone_lib="lambda gt10" 1344 FT /clone="TRE361" 1345 FT /tissue_type="leaves" 1346 FT /db_xref="taxon:3899" 1347 FT CDS 14..1495 1348 FT /product="beta-glucosidase" 1349 FT /EC_number="3.2.1.21" 1350 FT /note="non-cyanogenic" 1351 FT /db_xref="GOA:P26204" 1352 FT /db_xref="InterPro:IPR001360" 1353 FT /db_xref="InterPro:IPR013781" 1354 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1355 FT /protein_id="CAA40058.1" 1356 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1357 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1358 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1359 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1360 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1361 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1362 FT 
IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1363 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1364 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1365 FT mRNA 1..1859 1366 FT /experiment="experimental evidence, no additional details 1367 FT recorded" 1368 XX 1369 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1370 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1371 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1372 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1373 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1374 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1375 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1376 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1377 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1378 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1379 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1380 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1381 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1382 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1383 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1384 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1385 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1386 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1387 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1388 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1389 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1390 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1391 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 
1320 1392 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1393 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1394 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1395 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1396 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1397 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1398 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1399 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1400 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1401 // 1402 """ 1403 1404 print "GenBank CDS Iteration" 1405 print "=====================" 1406 1407 g = GenBankScanner() 1408 for record in g.parse_cds_features(StringIO(gbk_example)) : 1409 print record 1410 1411 g = GenBankScanner() 1412 for record in g.parse_cds_features(StringIO(gbk_example2), 1413 tags2id=('gene','locus_tag','product')) : 1414 print record 1415 1416 g = GenBankScanner() 1417 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1418 tags2id=('gene','locus_tag','product')) : 1419 print record 1420 1421 print 1422 print "GenBank Iteration" 1423 print "=================" 1424 g = GenBankScanner() 1425 for record in g.parse_records(StringIO(gbk_example),do_features=False) : 1426 print record.id, record.name, record.description 1427 print record.seq 1428 1429 g = GenBankScanner() 1430 for record in g.parse_records(StringIO(gbk_example),do_features=True) : 1431 print record.id, record.name, record.description 1432 print record.seq 1433 1434 g = GenBankScanner() 1435 for record in g.parse_records(StringIO(gbk_example2),do_features=False) : 1436 print record.id, record.name, record.description 1437 print record.seq 1438 1439 g = GenBankScanner() 1440 for record in g.parse_records(StringIO(gbk_example2),do_features=True) : 1441 print record.id, 
record.name, record.description 1442 print record.seq 1443 1444 print 1445 print "EMBL CDS Iteration" 1446 print "==================" 1447 1448 e = EmblScanner() 1449 for record in e.parse_cds_features(StringIO(embl_example)) : 1450 print record 1451 1452 print 1453 print "EMBL Iteration" 1454 print "==============" 1455 e = EmblScanner() 1456 for record in e.parse_records(StringIO(embl_example),do_features=True) : 1457 print record.id, record.name, record.description 1458 print record.seq 1459