Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  # 
   7  # This code is NOT intended for direct use.  It provides a basic scanner 
   8  # (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
   9  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  10  # 
  11  # It is used by Bio.GenBank to parse GenBank files 
  12  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  13  # 
  14  # Feature Table Documentation: 
  15  # http://www.insdc.org/files/feature_table.html 
  16  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  17  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  18  # 
  19  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  20  # These are GenBank files that summarize the content of a project, and provide lists of 
  21  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  22  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  23  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  24  # http://is.gd/nNgk 
  25  # for more details of this format, and an example. 
  26  # Added by Ying Huang & Iddo Friedberg 
  27   
  28  import warnings 
  29  import os 
  30  import re 
  31  from Bio.Seq import Seq 
  32  from Bio.SeqRecord import SeqRecord 
  33  from Bio.Alphabet import generic_alphabet, generic_protein 
  34   
35 -class InsdcScanner:
36 """Basic functions for breaking up a GenBank/EMBL file into sub sections. 37 38 The International Nucleotide Sequence Database Collaboration (INSDC) 39 between the DDBJ, EMBL, and GenBank. These organisations all use the 40 same "Feature Table" layout in their plain text flat file formats. 41 42 However, the header and sequence sections of an EMBL file are very 43 different in layout to those produced by GenBank/DDBJ.""" 44 45 #These constants get redefined with sensible values in the sub classes: 46 RECORD_START = "XXX" # "LOCUS " or "ID " 47 HEADER_WIDTH = 3 # 12 or 5 48 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] 49 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] 50 FEATURE_QUALIFIER_INDENT = 0 51 FEATURE_QUALIFIER_SPACER = "" 52 SEQUENCE_HEADERS=["XXX"] #with right hand side spaces removed 53
def __init__(self, debug=0):
    """Check the class level layout constants and reset the scanner state.

    debug - Verbosity level kept on the instance; the parsing methods
            print progress messages when it is non-zero.
    """
    #The record start marker must fill the header column exactly:
    assert self.HEADER_WIDTH == len(self.RECORD_START)
    #Sequence headers are compared against right-stripped text:
    for header in self.SEQUENCE_HEADERS:
        assert header.rstrip() == header
    assert self.FEATURE_QUALIFIER_INDENT == len(self.FEATURE_QUALIFIER_SPACER)
    self.line = None
    self.debug = debug
61
def set_handle(self, handle):
    """Attach the handle to read from and clear any pending line."""
    self.handle, self.line = handle, ""
65
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as are blank lines and the // terminator of a previous
    record.  A line already held in self.line is examined before reading
    from the handle.

    Returns the record start line (with its line ending, also stored in
    self.line), or None at the end of the file.
    """
    while True:
        if self.line:
            #Use up the pending line left by the previous parsing step
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            #The single argument print(...) form is valid on Python 2 and 3
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        line = line.rstrip()
        if self.debug > 1:
            if line == "//":
                print("Skipping // marking end of last record")
            elif line == "":
                print("Skipping blank line before record")
            else:
                #Ignore any header before the first ID/LOCUS line.
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
94
def parse_header(self):
    """Return list of strings making up the header.

    New line characters (and other trailing white space) are removed.

    Assumes you have just read in the ID/LOCUS line.  Reading stops at
    the first feature table marker or sequence header line, which is
    left (right-stripped) in self.line and is not part of the result.

    Raises ValueError if the file ends, or a // record terminator is
    found, before either of those markers.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
           "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            #The single argument print(...) form is valid on Python 2 and 3
            if self.debug:
                print("Found header table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    skip - If true the feature lines are read but not parsed, and an
           empty list is returned.

    On return self.line holds the first line after the feature table.
    Raises ValueError if the file ends, or a // record terminator is
    found, before the end of the feature table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        #The single argument print(...) form is valid on Python 2 and 3
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            #This is an empty feature line between qualifiers. Empty
            #feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            #Testing for the end of file first stops an endless loop when
            #the spacer is the empty string (as in the base class).
            while line and line[:indent] == spacer:
                line = self.handle.readline()
        else:
            #Build up a list of the lines making up this feature.
            #Slicing (rather than indexing) copes with a line too short to
            #hold a location; parse_feature then reports the bad feature.
            if line[indent:indent + 1] != " " and " " in line[indent:]:
                #The feature table design enforces a length limit on the
                #feature keys.  Some third party files (e.g. IGMT's EMBL
                #like files) solve this by over indenting the location and
                #qualifiers.
                feature_key, line = line[2:].strip().split(None, 1)
                feature_lines = [line]
                warnings.warn("Overindented %s feature?" % feature_key)
            else:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
            line = self.handle.readline()
            #Blank lines in the midst of a feature are tolerated, but the
            #empty string returned at the end of the file must end the
            #loop, otherwise a truncated file would never finish parsing.
            while line and (line[:indent] == spacer or line.rstrip() == ""):
                #Use strip to remove any harmless trailing white space AND
                #leading white space (e.g. out of spec files with too much
                #indentation)
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
191
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    feature_key - The feature type as a string, e.g. "CDS".
    lines - The remaining text of the feature as a list of strings with
            the leading indentation and trailing newlines removed: the
            location first (possibly over several lines, each unfinished
            line ending with a comma) followed by the qualifier lines,
            e.g.

            ["complement(join(490883..490885,1..879))",
             '/locus_tag="NEQ001"',
             '/note="conserved hypothetical [Methanococcus jannaschii];',
             'COG1583:Uncharacterized ACR"',
             "/codon_start=1",
             "/pseudo"]

    Blank entries in lines are ignored.

    Returns a tuple (key as string, location string, qualifiers as list),
    which for the example above is:

        key = "CDS"
        location = "complement(join(490883..490885,1..879))"
        qualifiers = [('locus_tag', '"NEQ001"'),
                      ('note', '"conserved hypothetical [...];\nCOG1583:..."'),
                      ('codon_start', '1'),
                      ('pseudo', None)]

    Multi-line values are joined with new line characters.  If a
    qualifier is quoted then the quotes are NOT removed.  A qualifier
    with no equals sign gets the value None, and one with nothing after
    the equals sign gets an empty string.

    Raises ValueError if the feature is malformed: no location, an
    unterminated quoted value, or a continuation line with no qualifier
    value to extend.
    """
    def problem():
        #Build the error lazily, most features are fine
        return ValueError("Problem with '%s' feature:\n%s" \
                          % (feature_key, "\n".join(lines)))

    #Skip any blank lines
    iterator = iter([text for text in lines if text])
    try:
        #next(...) rather than .next() works on Python 2.6+ and Python 3
        line = next(iterator)

        feature_location = line.strip()
        while feature_location[-1:] == ",":
            #Multiline location, still more to come!
            line = next(iterator)
            feature_location += line.strip()

        qualifiers = []

        for line in iterator:
            if line[0] == "/":
                #New qualifier
                i = line.find("=")
                if i == -1:
                    #Qualifier with no value, e.g. /pseudo
                    qualifiers.append((line[1:], None))
                    continue
                key = line[1:i]
                value = line[i+1:]
                if not value:
                    #e.g. "/note=" - previously this was an IndexError
                    qualifiers.append((key, ""))
                elif value == '"':
                    #One single quote on the line
                    if self.debug:
                        print("Quoted line %s:%s" % (key, value))
                    #DO NOT remove the quote...
                    qualifiers.append((key, value))
                elif value[0] == '"':
                    #Quoted, gather lines until the closing quote.  Blank
                    #lines were filtered out, so every part has a last
                    #character to test.
                    parts = [value]
                    while parts[-1][-1] != '"':
                        parts.append(next(iterator))
                    #DO NOT remove the quotes...
                    qualifiers.append((key, "\n".join(parts)))
                else:
                    #Unquoted
                    qualifiers.append((key, value))
            else:
                #Unquoted continuation of the previous qualifier
                if not qualifiers:
                    raise problem()
                key, previous = qualifiers[-1]
                if previous is None:
                    #Nothing to continue, e.g. text following /pseudo
                    raise problem()
                qualifiers[-1] = (key, previous + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        #Ran out of lines part way through the location or a quoted value
        raise problem()
295 316
def _feed_first_line(self, consumer, line):
    """Pass the data on the LOCUS/ID line to the consumer.

    This is a hook which does nothing here, the EMBL / GenBank specific
    subclasses should implement it.

    Used by the parse_records() and parse() methods.
    """
    return None
325
def _feed_header_lines(self, consumer, lines):
    """Pass the header lines (list of strings) to the consumer.

    This is a hook which does nothing here, the EMBL / GenBank specific
    subclasses should implement it.

    Used by the parse_records() and parse() methods.
    """
    return None
334 335
def _feed_feature_table(self, consumer, feature_tuples):
    """Pass the feature table (list of tuples) to the consumer.

    Each tuple is (key, location string, qualifiers) as returned by the
    parse_features() method.  New lines within a qualifier value are
    replaced with spaces, and no description call is made for a
    qualifier whose value is None.

    Used by the parse_records() and parse() methods.
    """
    consumer.start_feature_table()
    for key, location, qualifier_pairs in feature_tuples:
        consumer.feature_key(key)
        consumer.location(location)
        for name, value in qualifier_pairs:
            #The qualifier name is passed inside a list
            consumer.feature_qualifier_name([name])
            if value is None:
                continue
            consumer.feature_qualifier_description(value.replace("\n", " "))
349
def _feed_misc_lines(self, consumer, lines):
    """Pass any lines between the features and sequence to the consumer.

    The lines are given as a list of strings.  This is a hook which does
    nothing here, the EMBL / GenBank specific subclasses should
    implement it.

    Used by the parse_records() and parse() methods.
    """
    return None
358
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    True  - Passed a record
    False - Did not find a record (consumer.data is set to None)
    """
    #Should work with both EMBL and GenBank files provided the
    #equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        #Could not find (another) record
        consumer.data = None
        return False

    #The generic methods break the record into a simplified form, while
    #the first line, header lines and any misc lines after the features
    #are interpreted by the GenBank / EMBL specific derived classes.
    self._feed_first_line(consumer, self.line)
    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    #Features (common to both EMBL and GenBank), when skipping them the
    #lines are still read but the data is ignored:
    feature_tuples = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_tuples)

    #Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)
    consumer.sequence(sequence_string)
    #Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"
    return True
408
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    None is returned if no record was found in the handle.

    See also the method parse_records() for use on multi-record files.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    cleaner = FeatureValueCleaner()
    consumer = _FeatureConsumer(use_fuzziness = 1,
                                feature_cleaner = cleaner)
    if not self.feed(handle, consumer, do_features):
        return None
    return consumer.data
424 425
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    #This is a generator function, it stops once parse() finds no
    #further record and returns None.
    record = self.parse(handle, do_features)
    while record is not None:
        #Every record must have been given real identifiers
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
        record = self.parse(handle, do_features)
443
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id','locus_tag','product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    The location string (with spaces removed) is stored in the record's
    annotations as 'raw_location', db_xref values go into the record's
    dbxrefs, and all other qualifiers into the annotations with repeated
    values joined by a space.

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        #Got an EMBL or GenBank record...
        self.parse_header() # ignore header lines!
        feature_tuples = self.parse_features()
        #Ignore the footer and sequence lines, stopping at the //
        #terminator (or the end of the file).
        while True:
            line = self.handle.readline()
            if not line or line[:2] == "//":
                break
        self.line = line.rstrip()

        #Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue
            #SeqRecord objects cannot be created with annotations, they
            #must be added afterwards. So create an empty record and
            #then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            #Should we add a location object to the annotations?
            #I *think* that only makes sense for SeqFeatures with their
            #sub features...
            annotations['raw_location'] = location_string.replace(' ','')

            for (qualifier_name, qualifier_data) in qualifiers:
                #startswith/endswith (not indexing) so that an empty
                #value cannot raise an IndexError
                if qualifier_data is not None \
                and qualifier_data.startswith('"') \
                and qualifier_data.endswith('"'):
                    #Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
                elif qualifier_name == "db_xref":
                    #Its a list, possibly empty, so append is safe
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        #Join up wrapped lines, and tidy the double space
                        #which that leaves when a line ended with a space
                        qualifier_data = qualifier_data.replace("\n"," ").replace("  "," ")
                    previous = annotations.get(qualifier_name)
                    if previous is None:
                        #Not an addition to existing data, its the first bit
                        annotations[qualifier_name] = qualifier_data
                    elif qualifier_data is not None:
                        #A repeated qualifier without a value (None) adds
                        #nothing, so only real text is appended
                        annotations[qualifier_name] = previous + " " + qualifier_data

            #Fill in the ID, Name, Description
            #=================================
            try:
                record.id = annotations[tags2id[0]]
            except KeyError:
                pass
            try:
                record.name = annotations[tags2id[1]]
            except KeyError:
                pass
            try:
                record.description = annotations[tags2id[2]]
            except KeyError:
                pass

            yield record
522 523
524 -class EmblScanner(InsdcScanner):
525 """For extracting chunks of information in EMBL files""" 526 527 RECORD_START = "ID " 528 HEADER_WIDTH = 5 529 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"] 530 FEATURE_END_MARKERS = ["XX"] #XX can also mark the end of many things! 531 FEATURE_QUALIFIER_INDENT = 21 532 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2) 533 SEQUENCE_HEADERS=["SQ", "CO"] #Remove trailing spaces 534 569
def _feed_first_line(self, consumer, line):
    """Pass the ID line to the handler for its layout.

    The layout is identified from the number of semi-colons after the
    line type: six for the style introduced in 2006, three for the
    older style.  Raises ValueError for any other count.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semicolons = line[self.HEADER_WIDTH:].count(";")
    if semicolons == 6:
        #Looks like the semi colon separated style introduced in 2006
        handler = self._feed_first_line_new
    elif semicolons == 3:
        #Looks like the pre 2006 style
        handler = self._feed_first_line_old
    else:
        raise ValueError('Did not recognise the ID line layout:\n' + line)
    handler(consumer, line)
580
def _feed_first_line_old(self, consumer, line):
    """Pass the data on a pre 2006 style ID line to the consumer.

    Such a line holds the entry name followed by white space, and then
    semi-colon separated fields, for example an entry name of SC10H5
    followed by "standard; DNA; PRO; 4870 BP."
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    #Split the first word off, then break the rest at each semi-colon
    name_and_rest = line[self.HEADER_WIDTH:].split(None,1)
    raw_fields = [name_and_rest[0]] + name_and_rest[1].split(";")
    fields = [entry.strip() for entry in raw_fields]
    #The tokens represent:
    #   0. Primary accession number
    #   (space sep)
    #   1. ??? (e.g. standard)
    #   (semi-colon)
    #   2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    #   3. Taxonomic division (e.g. 'PRO')
    #   4. Sequence length (e.g. '4639675 BP.')
    consumer.locus(fields[0]) #Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
603
def _feed_first_line_new(self, consumer, line):
    """Pass the data on a 2006 style ID line to the consumer.

    Such a line has seven semi-colon separated fields, for example
    "X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP."
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    content = line[self.HEADER_WIDTH:].strip()
    fields = [data.strip() for data in content.split(";")]
    assert len(fields) == 7
    #The tokens represent:
    #   0. Primary accession number
    #   1. Sequence version number
    #   2. Topology: 'circular' or 'linear'
    #   3. Molecule type (e.g. 'genomic DNA')
    #   4. Data class (e.g. 'STD')
    #   5. Taxonomic division (e.g. 'PRO')
    #   6. Sequence length (e.g. '4639675 BP.')

    consumer.locus(fields[0])

    #Call the accession consumer now, to make sure we record
    #something as the record.id, in case there is no AC line
    consumer.accession(fields[0])

    #TODO - How to deal with the version field?  At the moment the consumer
    #will try and use this for the ID which isn't ideal for EMBL files.
    version_parts = fields[1].split()
    if len(version_parts) == 2:
        tag, number = version_parts
        if tag == "SV" and number.isdigit():
            consumer.version_suffix(number)

    #Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join(fields[2:4])) #TODO - Store as two fields?

    #TODO - What should we do with the data class, fields[4]?

    consumer.data_file_division(fields[5])

    self._feed_seq_length(consumer, fields[6])
644
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length field such as '1859 BP.' to the consumer.

    The text must be two words, the second being the units (BP, BP. or
    AA. in any case).
    """
    words = text.split()
    assert len(words) == 2
    units = words[1].upper()
    assert units in ["BP", "BP.", "AA."]
    consumer.size(words[0])
650
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) to the consumer.

    Each line is split into its line type (the first HEADER_WIDTH
    characters) and its data, both stripped of white space.  Reference,
    cross reference, comment and project lines are reformatted for the
    GenBank based consumer, the simple line types are passed straight to
    the consumer method named in consumer_dict, XX spacer lines are
    skipped, and anything else is ignored (and reported when debugging).
    """
    EMBL_INDENT = self.HEADER_WIDTH
    consumer_dict = {
        'AC' : 'accession',
        'SV' : 'version', # SV line removed in June 2006, now part of ID line
        'DE' : 'definition',
        #'RN' : 'reference_num',
        #'RC' : reference comment... TODO
        #'RP' : 'reference_bases',
        #'RX' : reference cross reference... DOI or Pubmed
        'RG' : 'consrtm', #optional consortium
        #'RA' : 'authors',
        #'RT' : 'title',
        'RL' : 'journal',
        'OS' : 'organism',
        'OC' : 'taxonomy',
        #'DR' : data reference
        'CC' : 'comment',
        #'XX' : splitter
    }
    #We have to handle the following specially:
    #RX (depending on reference type...)
    for line in lines:
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()
        if line_type == 'XX':
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            # (startswith/endswith cope with an RN line holding no data)
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
            parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
            consumer.reference_bases("(bases %s)" % "; ".join(parts))
        elif line_type == 'RT':
            #Remove the enclosing quotes and trailing semi colon.
            #Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif line_type == 'RX':
            # EMBL support three reference types at the moment:
            # - PUBMED    PUBMED bibliographic database (NLM)
            # - DOI       Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
            #             of Agriculture (USDA)
            #
            # Format:
            # RX  resource_identifier; identifier.
            #
            # e.g.
            # RX   DOI; 10.1016/0024-3205(83)90010-3.
            # RX   PUBMED; 264242.
            #
            # Currently our reference object only supports PUBMED and MEDLINE
            # (as these were in GenBank files?).
            key, value = data.split(";",1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            #TODO - Handle other reference types (here and in BioSQL bindings)
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # Database Cross-reference, format:
            # DR   database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR   MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            #Turn it into "database_identifier:primary_identifier" to
            #mimic the GenBank parser. e.g. "MGI:98599"
            consumer.dblink("%s:%s" % (parts[0].strip(),
                                       parts[1].strip()))
        elif line_type == 'RA':
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif line_type == 'PR':
            # Remove trailing ; at end of the project reference
            # In GenBank files this corresponds to the old PROJECT
            # line which is being replaced with the DBLINK line.
            consumer.project(data.rstrip(";"))
        elif line_type in consumer_dict:
            #Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            #The single argument print(...) form is valid on Python 2 and 3
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
750
def _feed_misc_lines(self, consumer, lines):
    """Pass any CO (contig) lines found before the sequence to the consumer.

    lines - List of strings, the lines between the features and the
            sequence.  This list is not modified.

    The text of consecutive CO lines is joined (white space between the
    lines is not preserved) and given to consumer.contig_location().
    Raises ValueError if a CO line is followed by some other line.
    """
    #TODO - Should we do something with the information on the SQ line(s)?
    #A trailing empty string acts as the sentinel ending a block of CO
    #lines.  It is added to a copy so the caller's list is left alone.
    line_iter = iter(list(lines) + [""])
    try:
        for line in line_iter:
            if line.startswith("CO "):
                contig_location = line[5:].strip()
                while True:
                    #next(...) works on Python 2.6+ and Python 3
                    line = next(line_iter)
                    if not line:
                        break
                    elif line.startswith("CO "):
                        #Don't need to preserve the whitespace here.
                        contig_location += line[5:].strip()
                    else:
                        raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
                consumer.contig_location(contig_location)
    except StopIteration:
        raise ValueError("Problem in misc lines before sequence")
773 774
775 -class _ImgtScanner(EmblScanner):
776 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE). 777 778 IMGT files are like EMBL files but in order to allow longer feature types 779 the features should be indented by 25 characters not 21 characters. In 780 practice the IMGT flat files tend to use either 21 or 25 characters, so we 781 must cope with both. 782 783 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank. 784 """ 785 786 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", 787 "FH Key Location/Qualifiers (from EMBL)", 788 "FH Key Location/Qualifiers", 789 "FH"] 790
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    skip - If true the feature lines are read but not parsed, and an
           empty list is returned.

    A location with the greater than sign after the position (such as
    123> rather than >123) is corrected.  On return self.line holds the
    first line after the feature table.  Raises ValueError if the file
    ends, or a // record terminator is found, before the end of the
    feature table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        #The single argument print(...) form is valid on Python 2 and 3
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    bad_position_re = re.compile(r'([0-9]+)>{1}')

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            #This is an empty feature line between qualifiers. Empty
            #feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            assert line[:2] == "FT"
            try:
                feature_key, location_start = line[2:].strip().split()
            except ValueError:
                #e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
                #Assume indent of 25 as per IMGT spec, with the location
                #start in column 26 (one-based).
                feature_key = line[2:25].strip()
                location_start = line[25:].strip()
            feature_lines = [location_start]
            line = self.handle.readline()
            #The empty string returned at the end of the file must end
            #this loop, so that a truncated file is reported with the
            #"Premature end" ValueError above rather than failing the FT
            #assertion below (or looping forever when asserts are off).
            while line and (line[:indent] == spacer or line.rstrip() == ""):
                #Use strip to remove any harmless trailing white space AND
                #leading white space (copes with 21 or 26 indents and
                #other variants)
                assert line[:2] == "FT"
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            feature_key, location, qualifiers = \
                         self.parse_feature(feature_key, feature_lines)
            #Try to handle known problems with IMGT locations here:
            if ">" in location:
                #Nasty hack for common IMGT bug, should be >123 not 123>
                #in a location string. At least here the meaning is clear,
                #and since it is so common no warning is issued.
                location = bad_position_re.sub(r'>\1',location)
            features.append((feature_key, location, qualifiers))
    self.line = line
    return features
868
869 -class GenBankScanner(InsdcScanner):
870 """For extracting chunks of information in GenBank files""" 871 872 RECORD_START = "LOCUS " 873 HEADER_WIDTH = 12 874 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"] 875 FEATURE_END_MARKERS = [] 876 FEATURE_QUALIFIER_INDENT = 21 877 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 878 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 879 923
def _feed_first_line(self, consumer, line):
    """Parse the LOCUS line and report its fields to the consumer.

    Four layouts are recognised, tried in this order:

     1. The old fixed-column layout (size units at columns 29:33).
     2. The new fixed-column layout (size units at columns 40:44).
     3. A truncated line holding at most a locus name, as produced by
        some EMBOSS tools (see bug 1762).
     4. An overflowing line whose fourth whitespace separated field is
        "aa" or "bp" (EMBOSS seqret output).  Only the name and size
        are used, and a warning is issued.

    Arguments:
     - consumer - event consumer; its locus, size, residue_type,
       data_file_division and date methods are called for the fields
       that are present.
     - line - the LOCUS line, as a string.

    Raises AssertionError if a fixed-column layout is detected but one
    of its column checks fails, and ValueError if no layout matches.
    """
    #####################################
    # LOCUS line                        #
    #####################################
    GENBANK_INDENT = self.HEADER_WIDTH
    SIZE_UNITS = [' bp ', ' aa ', ' rc ']
    #Fixed-width runs of spaces are spelled as products so that their
    #width is explicit and always agrees with the slice being tested.
    DOUBLE_SPACE = " " * 2

    def split_name_and_length(text):
        """Split the 'name ... length' columns into [name, length]."""
        #Collapse runs of spaces, so a single-space split gives one
        #entry per field (and an extra entry for stray edge spaces).
        while DOUBLE_SPACE in text:
            text = text.replace(DOUBLE_SPACE, " ")
        fields = text.split(" ")
        assert len(fields) <= 2, \
            'Cannot parse the name and length in the LOCUS line:\n' + line
        assert len(fields) != 1, \
            'Name and length collide in the LOCUS line:\n' + line
        #Should be possible to split them based on position, if
        #a clear definition of the standard exists THAT AGREES with
        #existing files.
        return fields

    #NOTE: The checks below are deliberately left as assert statements
    #so callers see the same exception type as before.  They are
    #skipped when Python runs with -O.
    assert line[0:GENBANK_INDENT] == "LOCUS".ljust(GENBANK_INDENT), \
        'LOCUS line does not start correctly:\n' + line

    #Have to break up the locus line, and handle the different bits of it.
    #There are at least two different versions of the locus line...
    if line[29:33] in SIZE_UNITS:
        #Old...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:29      Length of sequence, right-justified
        #    29:33      space, bp, space
        #    33:41      strand type
        #    41:42      space
        #    42:51      Blank (implies linear), linear or circular
        #    51:52      space
        #    52:55      The division code (e.g. BCT, VRL, INV)
        #    55:62      space
        #    62:73      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[29:33] in SIZE_UNITS, \
            'LOCUS line does not contain size units at expected position:\n' + line
        assert line[41:42] == ' ', \
            'LOCUS line does not contain space at position 42:\n' + line
        assert line[42:51].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[51:52] == ' ', \
            'LOCUS line does not contain space at position 52:\n' + line
        assert line[55:62] == " " * 7, \
            'LOCUS line does not contain spaces from position 56 to 62:\n' + line
        date_text = line[62:73]
        if date_text.strip():
            assert line[64:65] == '-', \
                'LOCUS line does not contain - at position 65 in date:\n' + line
            assert line[68:69] == '-', \
                'LOCUS line does not contain - at position 69 in date:\n' + line

        name, length = split_name_and_length(line[GENBANK_INDENT:29])
        consumer.locus(name)
        consumer.size(length)

        if line[33:51].strip() == "" and line[29:33] == ' aa ':
            #Amino acids -> protein (even if there is no residue type given)
            #We want to use a protein alphabet in this case, rather than a
            #generic one. Not sure if this is the best way to achieve this,
            #but it works because the scanner checks for this:
            consumer.residue_type("PROTEIN")
        else:
            consumer.residue_type(line[33:51].strip())

        consumer.data_file_division(line[52:55])
        if date_text.strip():
            consumer.date(date_text)
    elif line[40:44] in SIZE_UNITS:
        #New...
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        #    ??:??      space
        #    ??:40      Length of sequence, right-justified
        #    40:44      space, bp, space
        #    44:47      Blank, ss-, ds-, ms-
        #    47:54      Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA
        #    54:55      space
        #    55:63      Blank (implies linear), linear or circular
        #    63:64      space
        #    64:67      The division code (e.g. BCT, VRL, INV)
        #    67:68      space
        #    68:79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
        #
        assert line[40:44] in SIZE_UNITS, \
            'LOCUS line does not contain size units at expected position:\n' + line
        assert line[44:47] in [" " * 3, 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        molecule = line[47:54].strip()
        assert molecule == "" \
            or molecule.find('DNA') != -1 \
            or molecule.find('RNA') != -1, \
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        date_text = line[68:79]
        if date_text.strip():
            assert line[70:71] == '-', \
                'LOCUS line does not contain - at position 71 in date:\n' + line
            assert line[74:75] == '-', \
                'LOCUS line does not contain - at position 75 in date:\n' + line

        name, length = split_name_and_length(line[GENBANK_INDENT:40])
        consumer.locus(name)
        consumer.size(length)

        if line[44:54].strip() == "" and line[40:44] == ' aa ':
            #Amino acids -> protein (even if there is no residue type given)
            #We want to use a protein alphabet in this case, rather than a
            #generic one. Not sure if this is the best way to achieve this,
            #but it works because the scanner checks for this:
            consumer.residue_type(("PROTEIN " + line[54:63]).strip())
        else:
            consumer.residue_type(line[44:63].strip())

        consumer.data_file_division(line[64:67])
        if date_text.strip():
            consumer.date(date_text)
    elif line[GENBANK_INDENT:].strip().count(" ") == 0:
        #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762
        #
        #e.g.
        #
        #    "LOCUS       U00096"
        #
        #rather than:
        #
        #    "LOCUS       U00096               4639675 bp    DNA     circular BCT"
        #
        #    Positions  Contents
        #    ---------  --------
        #    00:06      LOCUS
        #    06:12      spaces
        #    12:??      Locus name
        locus_name = line[GENBANK_INDENT:].strip()
        if locus_name != "":
            consumer.locus(locus_name)
        else:
            #Must just have just "LOCUS       ", is this even legitimate?
            #We should be able to continue parsing... we need real world testcases!
            warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
    elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
        #Cope with EMBOSS seqret output where it seems the locus id can cause
        #the other fields to overflow.  We just IGNORE the other fields!
        fields = line.split()
        consumer.locus(fields[1])
        consumer.size(fields[2])
        warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
    else:
        raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1095 1096
def _feed_header_lines(self, consumer, lines):
    """Feed the GenBank header lines (after LOCUS) to the consumer.

    Each header keyword is read from the first HEADER_WIDTH columns of
    a line.  Lines which start with HEADER_WIDTH spaces are treated as
    continuations of the preceding entry.

    Arguments:
     - consumer - event consumer; the method called depends on the
       keyword (see consumer_dict and the special cases below).
     - lines - list of header lines; empty entries are ignored and the
       list itself is not modified.

    Header lines with an unrecognised keyword are skipped (and reported
    when self.debug is set).
    """
    #Following dictionary maps GenBank lines to the associated
    #consumer methods - the special cases like LOCUS where one
    #genbank line triggers several consumer calls have to be
    #handled individually.
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    #Spelled as a product so that the width is explicit:
    DOUBLE_SPACE = " " * 2
    consumer_dict = {
        'DEFINITION': 'definition',
        'ACCESSION': 'accession',
        'NID': 'nid',
        'PID': 'pid',
        'DBSOURCE': 'db_source',
        'KEYWORDS': 'keywords',
        'SEGMENT': 'segment',
        'SOURCE': 'source',
        'AUTHORS': 'authors',
        'CONSRTM': 'consrtm',
        'PROJECT': 'project',
        'DBLINK': 'dblink',
        'TITLE': 'title',
        'JOURNAL': 'journal',
        'MEDLINE': 'medline_id',
        'PUBMED': 'pubmed_id',
        'REMARK': 'remark'}
    #We have to handle the following specially:
    #LOCUS (locus, size, residue_type, data_file_division and date),
    #      which is dealt with by _feed_first_line
    #COMMENT (comment)
    #VERSION (version and gi)
    #REFERENCE (reference_num and reference_bases)
    #ORGANISM (organism and taxonomy)

    #Work on a reversed copy so that pop() hands out the lines in
    #order.  The trailing "" is a sentinel which ends both the main
    #loop and any continuation loop still running at the last line.
    pending = [entry for entry in lines if entry]
    pending.append("")
    pending.reverse()

    def read_line():
        """Return the next header line (the sentinel is the last one)."""
        if not pending:
            raise ValueError("Problem in header")
        return pending.pop()

    def collapse_spaces(text):
        """Reduce every run of spaces in text to a single space."""
        while DOUBLE_SPACE in text:
            text = text.replace(DOUBLE_SPACE, " ")
        return text

    line = read_line()
    while line:
        line_type = line[:GENBANK_INDENT].strip()
        data = line[GENBANK_INDENT:].strip()

        if line_type == 'VERSION':
            #Need to call consumer.version(), and maybe also consumer.gi() as well.
            #e.g.
            # VERSION     AC007323.5  GI:6587720
            data = collapse_spaces(data)
            if data.find(' GI:') == -1:
                consumer.version(data)
            else:
                pieces = data.split(' GI:')
                if self.debug:
                    print("Version [" + pieces[0] + "], gi [" + pieces[1] + "]")
                consumer.version(pieces[0])
                consumer.gi(pieces[1])
            #Read in the next line!
            line = read_line()
        elif line_type == 'REFERENCE':
            if self.debug > 1:
                print("Found reference [" + data + "]")
            #Need to call consumer.reference_num() and consumer.reference_bases()
            #e.g.
            # REFERENCE   1  (bases 1 to 86436)
            #
            #Note that this can be multiline, see Bug 1968, e.g.
            #
            # REFERENCE   42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to
            #             28259)
            #
            #For such cases we will call the consumer once only.

            #Read in the next line, and see if its more of the reference:
            while True:
                line = read_line()
                if line[:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of the reference, leave this text in the variable "line"
                    break
                #Add this continuation to the data string
                data += " " + line[GENBANK_INDENT:]
                if self.debug > 1:
                    print("Extended reference text [" + data + "]")

            #We now have all the reference line(s) stored in a string, data,
            #which we pass to the consumer
            data = collapse_spaces(data)
            if data.find(' ') == -1:
                if self.debug > 2:
                    print('Reference number "' + data + '"')
                consumer.reference_num(data)
            else:
                #Everything before the first space is the number, the
                #rest describes the bases (or residues) covered.
                number, bases = data.split(' ', 1)
                if self.debug > 2:
                    print('Reference number "' + number + '", "' + bases + '"')
                consumer.reference_num(number)
                consumer.reference_bases(bases)
        elif line_type == 'ORGANISM':
            #Typically the first line is the organism, and subsequent lines
            #are the taxonomy lineage.  However, given longer and longer
            #species names (as more and more strains and sub strains get
            #sequenced) the organism name can now get wrapped onto multiple
            #lines.  The NCBI say we have to recognise the lineage line by
            #the presence of semi-colon delimited entries.  In the long term,
            #they are considering adding a new keyword (e.g. LINEAGE).
            #See Bug 2591 for details.
            organism_data = data
            lineage_data = ""
            while True:
                line = read_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of organism and taxonomy
                    break
                #Once the lineage has started, all further continuation
                #lines belong to it.
                if lineage_data or ";" in line:
                    lineage_data += " " + line[GENBANK_INDENT:]
                else:
                    organism_data += " " + line[GENBANK_INDENT:].strip()
            consumer.organism(organism_data)
            if lineage_data.strip() == "" and self.debug > 1:
                print("Taxonomy line(s) missing or blank")
            consumer.taxonomy(lineage_data.strip())
        elif line_type == 'COMMENT':
            if self.debug > 1:
                print("Found comment")
            #This can be multiline, and should call consumer.comment() once
            #with a list where each entry is a line.
            comment_list = [data]
            while True:
                line = read_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    #End of the comment
                    break
                data = line[GENBANK_INDENT:]
                comment_list.append(data)
                if self.debug > 2:
                    print("Comment continuation [" + data + "]")
            consumer.comment(comment_list)
        elif line_type in consumer_dict:
            #Its a semi-automatic entry!
            #Now, this may be a multi line entry...
            while True:
                line = read_line()
                if line[0:GENBANK_INDENT] != GENBANK_SPACER:
                    break
                data += ' ' + line[GENBANK_INDENT:]
            #We now have all the data for this entry:
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring GenBank header line:\n%s" % line)
            #Read in next line
            line = read_line()
1250
def _feed_misc_lines(self, consumer, lines):
    """Feed the lines between the features and the sequence to the consumer.

    Recognises BASE COUNT, ORIGIN, WGS, WGS_SCAFLD and CONTIG lines;
    anything else is ignored.  Each line is matched against at most one
    of these keywords.

    Arguments:
     - consumer - event consumer; its base_count, origin_name, wgs,
       add_wgs_scafld and contig_location methods are called for the
       matching lines (base_count and origin_name only when there is
       text after the keyword).
     - lines - list of lines; the list itself is not modified.

    Raises ValueError if a CONTIG entry is followed by a non-empty line
    which is not a continuation line.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    #Iterate over a copy with a trailing "" sentinel, which guarantees
    #that a CONTIG entry on the final lines is terminated.
    line_iter = iter(list(lines) + [""])
    for line in line_iter:
        if line.startswith('BASE COUNT'):
            base_count = line[10:].strip()
            if base_count:
                if self.debug:
                    print("base_count = " + base_count)
                consumer.base_count(base_count)
        elif line.startswith("ORIGIN"):
            origin_name = line[6:].strip()
            if origin_name:
                if self.debug:
                    print("origin_name = " + origin_name)
                consumer.origin_name(origin_name)
        elif line.startswith("WGS "):
            consumer.wgs(line[3:].strip())
        elif line.startswith("WGS_SCAFLD"):
            consumer.add_wgs_scafld(line[10:].strip())
        elif line.startswith("CONTIG"):
            contig_location = line[6:].strip()
            #The continuation lines are taken from the same iterator,
            #so the outer loop resumes after the end of this entry.
            for line in line_iter:
                if not line:
                    break
                elif line[:GENBANK_INDENT] == GENBANK_SPACER:
                    #Don't need to preserve the whitespace here.
                    contig_location += line[GENBANK_INDENT:].rstrip()
                else:
                    raise ValueError('Expected CONTIG continuation line, got:\n' + line)
            else:
                #Defensive only - the sentinel means the loop above
                #always ends via the break.
                raise ValueError("Problem in misc lines before sequence")
            consumer.contig_location(contig_location)
if __name__ == "__main__":
    #Quick self test / demonstration, run when this file is executed
    #directly.  It parses three small embedded records and prints what
    #the scanners return.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    #NOTE: GenBank and EMBL flat files are column sensitive (the LOCUS
    #line, the 12 or 5 column header indent, the 21 column feature
    #qualifier indent), so the spacing inside these strings matters.

    #GenBank format nucleotide record (old style LOCUS line):
    gbk_example = \
"""LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
            (AXL2) and Rev7p (REV7) genes, complete cds.
ACCESSION   U49845
VERSION     U49845.1  GI:1293613
KEYWORDS    .
SOURCE      Saccharomyces cerevisiae (baker's yeast)
  ORGANISM  Saccharomyces cerevisiae
            Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
            Saccharomycetales; Saccharomycetaceae; Saccharomyces.
REFERENCE   1  (bases 1 to 5028)
  AUTHORS   Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
  TITLE     Cloning and sequence of REV7, a gene whose function is required for
            DNA damage-induced mutagenesis in Saccharomyces cerevisiae
  JOURNAL   Yeast 10 (11), 1503-1509 (1994)
  PUBMED    7871890
REFERENCE   2  (bases 1 to 5028)
  AUTHORS   Roemer,T., Madden,K., Chang,J. and Snyder,M.
  TITLE     Selection of axial growth sites in yeast requires Axl2p, a novel
            plasma membrane glycoprotein
  JOURNAL   Genes Dev. 10 (7), 777-793 (1996)
  PUBMED    8846915
REFERENCE   3  (bases 1 to 5028)
  AUTHORS   Roemer,T.
  TITLE     Direct Submission
  JOURNAL   Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
            Haven, CT, USA
FEATURES             Location/Qualifiers
     source          1..5028
                     /organism="Saccharomyces cerevisiae"
                     /db_xref="taxon:4932"
                     /chromosome="IX"
                     /map="9"
     CDS             <1..206
                     /codon_start=3
                     /product="TCP1-beta"
                     /protein_id="AAA98665.1"
                     /db_xref="GI:1293614"
                     /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
                     AEVLLRVDNIIRARPRTANRQHM"
     gene            687..3158
                     /gene="AXL2"
     CDS             687..3158
                     /gene="AXL2"
                     /note="plasma membrane glycoprotein"
                     /codon_start=1
                     /function="required for axial budding pattern of S.
                     cerevisiae"
                     /product="Axl2p"
                     /protein_id="AAA98666.1"
                     /db_xref="GI:1293615"
                     /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
                     TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
                     VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
                     VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
                     TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
                     YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
                     DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
                     DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
                     NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
                     CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
                     NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
                     SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
                     YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
                     HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
                     VDFSNKSNVNVGQVKDIHGRIPEML"
     gene            complement(3300..4037)
                     /gene="REV7"
     CDS             complement(3300..4037)
                     /gene="REV7"
                     /codon_start=1
                     /product="Rev7p"
                     /protein_id="AAA98667.1"
                     /db_xref="GI:1293616"
                     /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
                     FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
                     KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
                     RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
                     LISGDDKILNGVYSQYEEGESIFGSLF"
ORIGIN
        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
       61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
      121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
      181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
      241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
      301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
      361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
      421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
      481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
      541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
      601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
      661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
      721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
      781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
      841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
      901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
      961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
     1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
     1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
     1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
     1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
     1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
     1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
     1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
     1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
     1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
     1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
     1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
     1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
     1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
     1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
     1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
     1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
     1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
     2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
     2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
     2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
     2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
     2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
     2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
     2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
     2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
     2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
     2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
     2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
     2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
     2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
     2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
     2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
     2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
     3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
     3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
     3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
     3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
     3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
     3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
     3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
     3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
     3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
     3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
     3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
     3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
     3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
     3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
     3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
     3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
     3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
     4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
     4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
     4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
     4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
     4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
     4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
     4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
     4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
     4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
     4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
     4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
     4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
     4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
     4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
     4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
     4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
     4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
//"""

    # GenBank format protein (aka GenPept) file from:
    # http://www.molecularevolution.org/resources/fileformats/
    gbk_example2 = \
"""LOCUS       AAD51968                 143 aa            linear   BCT 21-AUG-2001
DEFINITION  transcriptional regulator RovA [Yersinia enterocolitica].
ACCESSION   AAD51968
VERSION     AAD51968.1  GI:5805369
DBSOURCE    locus AF171097 accession AF171097.1
KEYWORDS    .
SOURCE      Yersinia enterocolitica
  ORGANISM  Yersinia enterocolitica
            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
            Enterobacteriaceae; Yersinia.
REFERENCE   1  (residues 1 to 143)
  AUTHORS   Revell,P.A. and Miller,V.L.
  TITLE     A chromosomally encoded regulator is required for expression of the
            Yersinia enterocolitica inv gene and for virulence
  JOURNAL   Mol. Microbiol. 35 (3), 677-685 (2000)
  MEDLINE   20138369
  PUBMED    10672189
REFERENCE   2  (residues 1 to 143)
  AUTHORS   Revell,P.A. and Miller,V.L.
  TITLE     Direct Submission
  JOURNAL   Submitted (22-JUL-1999) Molecular Microbiology, Washington
            University School of Medicine, Campus Box 8230, 660 South Euclid,
            St. Louis, MO 63110, USA
COMMENT     Method: conceptual translation.
FEATURES             Location/Qualifiers
     source          1..143
                     /organism="Yersinia enterocolitica"
                     /mol_type="unassigned DNA"
                     /strain="JB580v"
                     /serotype="O:8"
                     /db_xref="taxon:630"
     Protein         1..143
                     /product="transcriptional regulator RovA"
                     /name="regulates inv expression"
     CDS             1..143
                     /gene="rovA"
                     /coded_by="AF171097.1:380..811"
                     /note="regulator of virulence"
                     /transl_table=11
ORIGIN
        1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
       61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
      121 deiellsgli dklerniiql qsk
//
"""

    #EMBL format nucleotide record:
    embl_example = """ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
XX
AC   X56734; S46826;
XX
DT   12-SEP-1991 (Rel. 29, Created)
DT   25-NOV-2005 (Rel. 85, Last updated, Version 11)
XX
DE   Trifolium repens mRNA for non-cyanogenic beta-glucosidase
XX
KW   beta-glucosidase.
XX
OS   Trifolium repens (white clover)
OC   Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
OC   Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
OC   eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
XX
RN   [5]
RP   1-1859
RX   PUBMED; 1907511.
RA   Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
RT   "Nucleotide and derived amino acid sequence of the cyanogenic
RT   beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
RL   Plant Mol. Biol. 17(2):209-219(1991).
XX
RN   [6]
RP   1-1859
RA   Hughes M.A.;
RT   ;
RL   Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
RL   Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
RL   Upon Tyne, NE2 4HH, UK
XX
FH   Key             Location/Qualifiers
FH
FT   source          1..1859
FT                   /organism="Trifolium repens"
FT                   /mol_type="mRNA"
FT                   /clone_lib="lambda gt10"
FT                   /clone="TRE361"
FT                   /tissue_type="leaves"
FT                   /db_xref="taxon:3899"
FT   CDS             14..1495
FT                   /product="beta-glucosidase"
FT                   /EC_number="3.2.1.21"
FT                   /note="non-cyanogenic"
FT                   /db_xref="GOA:P26204"
FT                   /db_xref="InterPro:IPR001360"
FT                   /db_xref="InterPro:IPR013781"
FT                   /db_xref="UniProtKB/Swiss-Prot:P26204"
FT                   /protein_id="CAA40058.1"
FT                   /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
FT                   FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
FT                   DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
FT                   VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
FT                   CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
FT                   DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
FT                   IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
FT                   EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
FT                   IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
FT   mRNA            1..1859
FT                   /experiment="experimental evidence, no additional details
FT                   recorded"
XX
SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
     aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt        60
     cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag       120
     tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga       180
     aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata       240
     tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta       300
     caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc       360
     ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa       420
     atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct       480
     ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg       540
     tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt       600
     gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg       660
     aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac       720
     aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta       780
     taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg       840
     gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga       900
     cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg       960
     gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg      1020
     ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc      1080
     acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa      1140
     acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat      1200
     gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct      1260
     gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga      1320
     agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg      1380
     ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg      1440
     taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga      1500
     tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa      1560
     ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt      1620
     tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg      1680
     aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc      1740
     agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac      1800
     tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa       1859
//
"""

    #Single-argument print calls behave the same as print statements
    #under Python 2, and also work under Python 3.
    print("GenBank CDS Iteration")
    print("=====================")

    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example)):
        print(record)

    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example2),
                                       tags2id=('gene', 'locus_tag', 'product')):
        print(record)

    #Two records in one handle:
    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
                                       tags2id=('gene', 'locus_tag', 'product')):
        print(record)

    print("")
    print("GenBank Iteration")
    print("=================")
    #Each example is parsed first without and then with its features:
    for example in (gbk_example, gbk_example2):
        for do_features in (False, True):
            g = GenBankScanner()
            for record in g.parse_records(StringIO(example),
                                          do_features=do_features):
                print("%s %s %s" % (record.id, record.name, record.description))
                print(record.seq)

    print("")
    print("EMBL CDS Iteration")
    print("==================")

    e = EmblScanner()
    for record in e.parse_cds_features(StringIO(embl_example)):
        print(record)

    print("")
    print("EMBL Iteration")
    print("==============")
    e = EmblScanner()
    for record in e.parse_records(StringIO(embl_example), do_features=True):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)