1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 import sys
19 from Bio.Seq import Seq
20 from Bio.SeqRecord import SeqRecord
21 from Bio.Alphabet import generic_alphabet, generic_protein
22
"""Basic functions for breaking up a GenBank/EMBL file into sub sections.

The International Nucleotide Sequence Database Collaboration (INSDC) is a
collaboration between the DDBJ, EMBL, and GenBank. These organisations all
use the same "Feature Table" layout in their plain text flat file formats.

However, the header and sequence sections of an EMBL file are very
different in layout to those produced by GenBank/DDBJ."""

# Placeholder layout constants.  The EMBL and GenBank sections of this file
# assign format specific values to the same names.

# A record starts on a line whose first HEADER_WIDTH characters equal
# RECORD_START, so the two must agree in length.
RECORD_START = "XXX"
HEADER_WIDTH = 3
# Header parsing stops at a (right stripped) line found in this list.
FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
FEATURE_QUALIFIER_INDENT = 0
FEATURE_QUALIFIER_SPACER = ""
# Header parsing also stops when the first HEADER_WIDTH characters of a
# line (right stripped) appear in this list.
SEQUENCE_HEADERS = ["XXX"]
41
49
53
def find_start(self):
    """Read in lines until find the ID/LOCUS line, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored.

    A line already buffered in self.line is examined before anything is
    read from self.handle.  On success the record start line is stored in
    self.line (with its line ending intact) and returned.  Returns None
    when the end of the file is reached first.
    """
    while True:
        if self.line:
            # Use up the buffered line first.
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        line = line.rstrip()
        if line == "//":
            if self.debug > 1:
                print("Skipping // marking end of last record")
        elif line == "":
            if self.debug > 1:
                print("Skipping blank line before record")
        else:
            # Anything else before the first record is ignored.
            if self.debug > 1:
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
82
def parse_header(self):
    """Return list of strings making up the header

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line.

    Reading stops at the first line that is a feature table marker
    (FEATURE_START_MARKERS) or whose first HEADER_WIDTH characters name a
    sequence section (SEQUENCE_HEADERS).  That line is left in self.line
    and is not part of the returned list.

    Raises ValueError if the file ends, or a "//" line is met, first.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found header table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
113
165
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                         IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                         EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    Note that no whitespace is removed.

    A qualifier with no "=" (e.g. /pseudo) is returned with the value None,
    and one with nothing after the "=" is returned with the value "".

    Raises ValueError if the lines run out part way through the location or
    a quoted value.
    """
    # Empty strings carry no information; drop them before parsing.
    iterator = iter([entry for entry in lines if entry])
    try:
        feature_location = next(iterator).strip()
        # A location ending in a comma continues on the following line.
        while feature_location[-1:] == ",":
            feature_location += next(iterator).strip()

        qualifiers = []
        for line in iterator:
            if line[0] == "/":
                i = line.find("=")
                if i == -1:
                    # Qualifier with no value at all
                    key = line[1:]
                    qualifiers.append((key, None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if not value:
                    # "/key=" with nothing after it; previously this
                    # raised IndexError when testing value[0].
                    qualifiers.append((key, ""))
                elif value == '"':
                    # Only the opening quote is on this line; any text that
                    # follows is appended by the continuation branch below.
                    if self.debug:
                        print("Quoted line %s:%s" % (key, value))
                    qualifiers.append((key, value))
                elif value[0] == '"':
                    # Quoted: keep reading until a line ends with the
                    # closing quote.  The quotes are NOT removed.
                    pieces = [value]
                    while pieces[-1][-1] != '"':
                        pieces.append(next(iterator))
                    qualifiers.append((key, "\n".join(pieces)))
                else:
                    # Unquoted value
                    qualifiers.append((key, value))
            else:
                # Continuation of the previous qualifier's value
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines part way through the location or a quoted value
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
268
289
291 """Handle the LOCUS/ID line, passing data to the comsumer
292
293 This should be implemented by the EMBL / GenBank specific subclass
294
295 Used by the parse_records() and parse() methods.
296 """
297 pass
298
300 """Handle the header lines (list of strings), passing data to the comsumer
301
302 This should be implemented by the EMBL / GenBank specific subclass
303
304 Used by the parse_records() and parse() methods.
305 """
306 pass
307
308
322
324 """Handle any lines between features and sequence (list of strings), passing data to the consumer
325
326 This should be implemented by the EMBL / GenBank specific subclass
327
328 Used by the parse_records() and parse() methods.
329 """
330 pass
331
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is set to None)
    """
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # find_start() leaves the record's first line in self.line
    self._feed_first_line(consumer, self.line)
    self._feed_header_lines(consumer, self.parse_header())

    if do_features:
        self._feed_feature_table(consumer, self.parse_features(skip=False))
    else:
        # Still has to be called so the feature table is read past
        self.parse_features(skip=True)

    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    consumer.record_end("//")

    # The record terminator should be the last line read
    assert self.line == "//"

    return True
381
397
398
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    This method is intended for use in Bio.SeqIO

    Note that do_features is accepted but not forwarded: each record is
    obtained by calling self.parse(handle) with the handle only.
    """
    # This is a generator function
    while True:
        record = self.parse(handle)
        if record is None:
            break
        # Sanity checks on what parse() returned
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
416
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id', 'locus_tag', 'product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # The header lines are read past but not used
        self.parse_header()
        feature_tuples = self.parse_features()
        # Skip the rest of the record.  readline() is used rather than
        # iterating over the handle because find_start() also reads with
        # readline(), and the two must not be mixed on a buffered file.
        while True:
            line = self.handle.readline()
            if not line:
                break
            if line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go through those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue
            # The sequence is filled in from any /translation qualifier
            record = SeqRecord(seq=None)
            annotations = record.annotations

            # Keep the location text (spaces removed) as an annotation
            annotations['raw_location'] = location_string.replace(' ', '')

            for (qualifier_name, qualifier_data) in qualifiers:
                if qualifier_data is not None \
                        and qualifier_data[:1] == '"' \
                        and qualifier_data[-1:] == '"':
                    # Remove quotes
                    qualifier_data = qualifier_data[1:-1]

                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Join wrapped lines and halve any double spaces
                        qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
                    try:
                        annotations[qualifier_name] += " " + qualifier_data
                    except KeyError:
                        # Not an addition to existing data, its the first bit
                        annotations[qualifier_name] = qualifier_data

            # Fill in the ID, name and description from the chosen
            # qualifiers, leaving the defaults when a tag is missing.
            try:
                record.id = annotations[tags2id[0]]
            except KeyError:
                pass
            try:
                record.name = annotations[tags2id[1]]
            except KeyError:
                pass
            try:
                record.description = annotations[tags2id[2]]
            except KeyError:
                pass

            yield record
493
"""For extracting chunks of information in EMBL files"""

# A record start line must match RECORD_START over its first HEADER_WIDTH
# characters, so RECORD_START is padded to exactly that width.
RECORD_START = "ID   "
HEADER_WIDTH = 5
# "Location/Qualifiers" starts at column FEATURE_QUALIFIER_INDENT.
FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
FEATURE_END_MARKERS = ["XX"]
FEATURE_QUALIFIER_INDENT = 21
FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
SEQUENCE_HEADERS = ["SQ"]
504
538
549
551
552
553
554 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
555 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
556 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
557 fields = [entry.strip() for entry in fields]
558 """
559 The tokens represent:
560 0. Primary accession number
561 (space sep)
562 1. ??? (e.g. standard)
563 (semi-colon)
564 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
565 3. Taxonomic division (e.g. 'PRO')
566 4. Sequence length (e.g. '4639675 BP.')
567 """
568 consumer.locus(fields[0])
569 consumer.residue_type(fields[2])
570 consumer.data_file_division(fields[3])
571 self._feed_seq_length(consumer, fields[4])
572
574
575
576
577 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
578 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
579 assert len(fields) == 7
580 """
581 The tokens represent:
582 0. Primary accession number
583 1. Sequence version number
584 2. Topology: 'circular' or 'linear'
585 3. Molecule type (e.g. 'genomic DNA')
586 4. Data class (e.g. 'STD')
587 5. Taxonomic division (e.g. 'PRO')
588 6. Sequence length (e.g. '4639675 BP.')
589 """
590
591 consumer.locus(fields[0])
592
593
594
595 consumer.accession(fields[0])
596
597
598
599 version_parts = fields[1].split()
600 if len(version_parts)==2 \
601 and version_parts[0]=="SV" \
602 and version_parts[1].isdigit() :
603 consumer.version_suffix(version_parts[1])
604
605
606 consumer.residue_type(" ".join(fields[2:4]))
607
608
609
610 consumer.data_file_division(fields[5])
611
612 self._feed_seq_length(consumer, fields[6])
613
615 length_parts = text.split()
616 assert len(length_parts) == 2
617 assert length_parts[1].upper() in ["BP", "BP."]
618 consumer.size(length_parts[0])
619
621 EMBL_INDENT = self.HEADER_WIDTH
622 EMBL_SPACER = " " * EMBL_INDENT
623 consumer_dict = {
624 'AC' : 'accession',
625 'SV' : 'version',
626 'DE' : 'definition',
627
628
629
630 'RA' : 'authors',
631 'RT' : 'title',
632 'RL' : 'journal',
633 'OS' : 'organism',
634 'OC' : 'taxonomy',
635
636 'CC' : 'comment',
637
638 }
639
640
641 lines = filter(None,lines)
642 line_iter = iter(lines)
643 try :
644 while True :
645 try :
646 line = line_iter.next()
647 except StopIteration :
648 break
649 if not line : break
650 line_type = line[:EMBL_INDENT].strip()
651 data = line[EMBL_INDENT:].strip()
652
653 if line_type == 'XX' :
654 pass
655 elif line_type == 'RN' :
656
657
658 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
659 consumer.reference_num(data)
660 elif line_type == 'RP' :
661
662
663 assert data.count("-")==1
664 consumer.reference_bases("(bases " + data.replace("-", " to ") + ")")
665 elif line_type == 'RX' :
666
667
668 pass
669 elif line_type == 'CC' :
670
671 consumer.comment([data])
672 elif line_type == 'DR' :
673
674 pass
675 elif line_type in consumer_dict :
676
677 getattr(consumer, consumer_dict[line_type])(data)
678 else :
679 if self.debug :
680 print "Ignoring EMBL header line:\n%s" % line
681 except StopIteration :
682 raise ValueError("Problem with header")
683
687
"""For extracting chunks of information in GenBank files"""

# A record start line must match RECORD_START over its first HEADER_WIDTH
# characters, so RECORD_START is padded to exactly that width.
RECORD_START = "LOCUS       "
HEADER_WIDTH = 12
# "Location/Qualifiers" starts at column FEATURE_QUALIFIER_INDENT.
FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers", "FEATURES"]
FEATURE_END_MARKERS = []
FEATURE_QUALIFIER_INDENT = 21
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT"]
698
740
742
743
744
745 GENBANK_INDENT = self.HEADER_WIDTH
746 GENBANK_SPACER = " "*GENBANK_INDENT
747 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
748 'LOCUS line does not start correctly:\n' + line
749
750
751
752 if line[29:33] in [' bp ', ' aa '] :
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771 assert line[29:33] in [' bp ', ' aa '] , \
772 'LOCUS line does not contain size units at expected position:\n' + line
773 assert line[41:42] == ' ', \
774 'LOCUS line does not contain space at position 42:\n' + line
775 assert line[42:51].strip() in ['','linear','circular'], \
776 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
777 assert line[51:52] == ' ', \
778 'LOCUS line does not contain space at position 52:\n' + line
779 assert line[55:62] == ' ', \
780 'LOCUS line does not contain spaces from position 56 to 62:\n' + line
781 assert line[64:65] == '-', \
782 'LOCUS line does not contain - at position 65 in date:\n' + line
783 assert line[68:69] == '-', \
784 'LOCUS line does not contain - at position 69 in date:\n' + line
785
786 name_and_length_str = line[GENBANK_INDENT:29]
787 while name_and_length_str.find(' ')!=-1 :
788 name_and_length_str = name_and_length_str.replace(' ',' ')
789 name_and_length = name_and_length_str.split(' ')
790 assert len(name_and_length)<=2, \
791 'Cannot parse the name and length in the LOCUS line:\n' + line
792 assert len(name_and_length)!=1, \
793 'Name and length collide in the LOCUS line:\n' + line
794
795
796
797 consumer.locus(name_and_length[0])
798 consumer.size(name_and_length[1])
799
800
801 if line[33:51].strip() == "" and line[29:33] == ' aa ' :
802
803
804
805
806 consumer.residue_type("PROTEIN")
807 else :
808 consumer.residue_type(line[33:51].strip())
809
810 consumer.data_file_division(line[52:55])
811 consumer.date(line[62:73])
812 elif line[40:44] in [' bp ', ' aa '] :
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832 assert line[40:44] in [' bp ', ' aa '] , \
833 'LOCUS line does not contain size units at expected position:\n' + line
834 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
835 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
836 assert line[47:54].strip() == "" \
837 or line[47:54].strip().find('DNA') != -1 \
838 or line[47:54].strip().find('RNA') != -1, \
839 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
840 assert line[54:55] == ' ', \
841 'LOCUS line does not contain space at position 55:\n' + line
842 assert line[55:63].strip() in ['','linear','circular'], \
843 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
844 assert line[63:64] == ' ', \
845 'LOCUS line does not contain space at position 64:\n' + line
846 assert line[67:68] == ' ', \
847 'LOCUS line does not contain space at position 68:\n' + line
848 assert line[70:71] == '-', \
849 'LOCUS line does not contain - at position 71 in date:\n' + line
850 assert line[74:75] == '-', \
851 'LOCUS line does not contain - at position 75 in date:\n' + line
852
853 name_and_length_str = line[GENBANK_INDENT:40]
854 while name_and_length_str.find(' ')!=-1 :
855 name_and_length_str = name_and_length_str.replace(' ',' ')
856 name_and_length = name_and_length_str.split(' ')
857 assert len(name_and_length)<=2, \
858 'Cannot parse the name and length in the LOCUS line:\n' + line
859 assert len(name_and_length)!=1, \
860 'Name and length collide in the LOCUS line:\n' + line
861
862
863
864 consumer.locus(name_and_length[0])
865 consumer.size(name_and_length[1])
866
867 if line[44:54].strip() == "" and line[40:44] == ' aa ' :
868
869
870
871
872 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
873 else :
874 consumer.residue_type(line[44:63].strip())
875
876 consumer.data_file_division(line[64:67])
877 consumer.date(line[68:79])
878 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894 if line[GENBANK_INDENT:].strip() != "" :
895 consumer.locus(line[GENBANK_INDENT:].strip())
896 else :
897
898
899 print >> sys.stderr, "Warning: Minimal LOCUS line found - is this correct?\n" + line
900 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"] :
901
902
903 consumer.locus(line.split()[1])
904 consumer.size(line.split()[2])
905 print >> sys.stderr, "Warning: Malformed LOCUS line found - is this correct?\n" + line
906 else :
907 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
908
909
911
912
913
914
915 GENBANK_INDENT = self.HEADER_WIDTH
916 GENBANK_SPACER = " "*GENBANK_INDENT
917 consumer_dict = {
918 'DEFINITION' : 'definition',
919 'ACCESSION' : 'accession',
920 'NID' : 'nid',
921 'PID' : 'pid',
922 'DBSOURCE' : 'db_source',
923 'KEYWORDS' : 'keywords',
924 'SEGMENT' : 'segment',
925 'SOURCE' : 'source',
926 'AUTHORS' : 'authors',
927 'CONSRTM' : 'consrtm',
928 'PROJECT' : 'project',
929 'DBLINK' : 'dblink',
930 'TITLE' : 'title',
931 'JOURNAL' : 'journal',
932 'MEDLINE' : 'medline_id',
933 'PUBMED' : 'pubmed_id',
934 'REMARK' : 'remark'}
935
936
937
938
939
940
941 lines = filter(None,lines)
942 lines.append("")
943 line_iter = iter(lines)
944 try :
945 line = line_iter.next()
946 while True :
947 if not line : break
948 line_type = line[:GENBANK_INDENT].strip()
949 data = line[GENBANK_INDENT:].strip()
950
951 if line_type == 'VERSION' :
952
953
954
955 while data.find(' ')!=-1:
956 data = data.replace(' ',' ')
957 if data.find(' GI:')==-1 :
958 consumer.version(data)
959 else :
960 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
961 consumer.version(data.split(' GI:')[0])
962 consumer.gi(data.split(' GI:')[1])
963
964 line = line_iter.next()
965 elif line_type == 'REFERENCE' :
966 if self.debug >1 : print "Found reference [" + data + "]"
967
968
969
970
971
972
973
974
975
976
977 data = data.strip()
978
979
980 while True:
981 line = line_iter.next()
982 if line[:GENBANK_INDENT] == GENBANK_SPACER :
983
984 data += " " + line[GENBANK_INDENT:]
985 if self.debug >1 : print "Extended reference text [" + data + "]"
986 else :
987
988 break
989
990
991
992 while data.find(' ')!=-1:
993 data = data.replace(' ',' ')
994 if data.find(' ')==-1 :
995 if self.debug >2 : print 'Reference number \"' + data + '\"'
996 consumer.reference_num(data)
997 else :
998 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
999 consumer.reference_num(data[:data.find(' ')])
1000 consumer.reference_bases(data[data.find(' ')+1:])
1001 elif line_type == 'ORGANISM' :
1002
1003 consumer.organism(data)
1004 data = ""
1005 while True :
1006 line = line_iter.next()
1007 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1008 data += ' ' + line[GENBANK_INDENT:]
1009 else :
1010
1011 if data.strip() == "" :
1012 if self.debug > 1 : print "Taxonomy line(s) missing or blank"
1013 consumer.taxonomy(data.strip())
1014
1015 break
1016 elif line_type == 'COMMENT' :
1017 if self.debug > 1 : print "Found comment"
1018
1019
1020 comment_list=[]
1021 comment_list.append(data)
1022 while True:
1023 line = line_iter.next()
1024 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1025 data = line[GENBANK_INDENT:]
1026 comment_list.append(data)
1027 if self.debug > 2 : print "Comment continuation [" + data + "]"
1028 else :
1029
1030 break
1031 consumer.comment(comment_list)
1032 del comment_list
1033 elif line_type in consumer_dict :
1034
1035
1036 while True :
1037 line = line_iter.next()
1038 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1039 data += ' ' + line[GENBANK_INDENT:]
1040 else :
1041
1042 getattr(consumer, consumer_dict[line_type])(data)
1043
1044 break
1045 else :
1046 if self.debug :
1047 print "Ignoring GenBank header line:\n" % line
1048
1049 line = line_iter.next()
1050 except StopIteration :
1051 raise ValueError("Problem in header")
1052
1086
1087 if __name__ == "__main__" :
1088 from StringIO import StringIO
1089
1090 gbk_example = \
1091 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1092 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1093 (AXL2) and Rev7p (REV7) genes, complete cds.
1094 ACCESSION U49845
1095 VERSION U49845.1 GI:1293613
1096 KEYWORDS .
1097 SOURCE Saccharomyces cerevisiae (baker's yeast)
1098 ORGANISM Saccharomyces cerevisiae
1099 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1100 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1101 REFERENCE 1 (bases 1 to 5028)
1102 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1103 TITLE Cloning and sequence of REV7, a gene whose function is required for
1104 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1105 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1106 PUBMED 7871890
1107 REFERENCE 2 (bases 1 to 5028)
1108 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1109 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1110 plasma membrane glycoprotein
1111 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1112 PUBMED 8846915
1113 REFERENCE 3 (bases 1 to 5028)
1114 AUTHORS Roemer,T.
1115 TITLE Direct Submission
1116 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1117 Haven, CT, USA
1118 FEATURES Location/Qualifiers
1119 source 1..5028
1120 /organism="Saccharomyces cerevisiae"
1121 /db_xref="taxon:4932"
1122 /chromosome="IX"
1123 /map="9"
1124 CDS <1..206
1125 /codon_start=3
1126 /product="TCP1-beta"
1127 /protein_id="AAA98665.1"
1128 /db_xref="GI:1293614"
1129 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1130 AEVLLRVDNIIRARPRTANRQHM"
1131 gene 687..3158
1132 /gene="AXL2"
1133 CDS 687..3158
1134 /gene="AXL2"
1135 /note="plasma membrane glycoprotein"
1136 /codon_start=1
1137 /function="required for axial budding pattern of S.
1138 cerevisiae"
1139 /product="Axl2p"
1140 /protein_id="AAA98666.1"
1141 /db_xref="GI:1293615"
1142 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1143 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1144 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1145 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1146 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1147 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1148 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1149 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1150 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1151 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1152 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1153 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1154 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1155 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1156 VDFSNKSNVNVGQVKDIHGRIPEML"
1157 gene complement(3300..4037)
1158 /gene="REV7"
1159 CDS complement(3300..4037)
1160 /gene="REV7"
1161 /codon_start=1
1162 /product="Rev7p"
1163 /protein_id="AAA98667.1"
1164 /db_xref="GI:1293616"
1165 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1166 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1167 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1168 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1169 LISGDDKILNGVYSQYEEGESIFGSLF"
1170 ORIGIN
1171 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1172 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1173 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1174 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1175 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1176 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1177 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1178 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1179 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1180 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1181 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1182 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1183 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1184 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1185 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1186 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1187 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1188 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1189 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1190 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1191 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1192 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1193 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1194 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1195 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1196 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1197 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1198 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1199 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1200 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1201 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1202 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1203 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1204 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1205 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1206 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1207 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1208 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1209 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1210 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1211 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1212 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1213 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1214 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1215 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1216 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1217 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1218 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1219 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1220 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1221 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1222 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1223 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1224 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1225 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1226 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1227 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1228 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1229 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1230 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1231 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1232 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1233 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1234 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1235 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1236 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1237 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1238 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1239 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1240 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1241 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1242 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1243 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1244 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1245 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1246 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1247 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1248 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1249 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1250 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1251 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1252 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1253 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1254 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1255 //"""
1256
1257
1258
# Sample GenBank-format protein record (LOCUS AAD51968, 143 aa) kept inline as
# test data; the demonstration loops further down feed it to GenBankScanner
# through a StringIO handle, both on its own and appended to gbk_example.
1259 gbk_example2 = \
1260 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1261 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1262 ACCESSION AAD51968
1263 VERSION AAD51968.1 GI:5805369
1264 DBSOURCE locus AF171097 accession AF171097.1
1265 KEYWORDS .
1266 SOURCE Yersinia enterocolitica
1267 ORGANISM Yersinia enterocolitica
1268 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1269 Enterobacteriaceae; Yersinia.
1270 REFERENCE 1 (residues 1 to 143)
1271 AUTHORS Revell,P.A. and Miller,V.L.
1272 TITLE A chromosomally encoded regulator is required for expression of the
1273 Yersinia enterocolitica inv gene and for virulence
1274 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1275 MEDLINE 20138369
1276 PUBMED 10672189
1277 REFERENCE 2 (residues 1 to 143)
1278 AUTHORS Revell,P.A. and Miller,V.L.
1279 TITLE Direct Submission
1280 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1281 University School of Medicine, Campus Box 8230, 660 South Euclid,
1282 St. Louis, MO 63110, USA
1283 COMMENT Method: conceptual translation.
1284 FEATURES Location/Qualifiers
1285 source 1..143
1286 /organism="Yersinia enterocolitica"
1287 /mol_type="unassigned DNA"
1288 /strain="JB580v"
1289 /serotype="O:8"
1290 /db_xref="taxon:630"
1291 Protein 1..143
1292 /product="transcriptional regulator RovA"
1293 /name="regulates inv expression"
1294 CDS 1..143
1295 /gene="rovA"
1296 /coded_by="AF171097.1:380..811"
1297 /note="regulator of virulence"
1298 /transl_table=11
1299 ORIGIN
1300 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1301 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1302 121 deiellsgli dklerniiql qsk
1303 //
1304 """
1305
# Sample EMBL-format nucleotide record (ID X56734, 1859 BP mRNA) kept inline
# as test data; the EMBL demonstration loops further down feed it to
# EmblScanner through a StringIO handle.
1306 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1307 XX
1308 AC X56734; S46826;
1309 XX
1310 DT 12-SEP-1991 (Rel. 29, Created)
1311 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1312 XX
1313 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1314 XX
1315 KW beta-glucosidase.
1316 XX
1317 OS Trifolium repens (white clover)
1318 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1319 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1320 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1321 XX
1322 RN [5]
1323 RP 1-1859
1324 RX PUBMED; 1907511.
1325 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1326 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1327 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1328 RL Plant Mol. Biol. 17(2):209-219(1991).
1329 XX
1330 RN [6]
1331 RP 1-1859
1332 RA Hughes M.A.;
1333 RT ;
1334 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1335 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1336 RL Upon Tyne, NE2 4HH, UK
1337 XX
1338 FH Key Location/Qualifiers
1339 FH
1340 FT source 1..1859
1341 FT /organism="Trifolium repens"
1342 FT /mol_type="mRNA"
1343 FT /clone_lib="lambda gt10"
1344 FT /clone="TRE361"
1345 FT /tissue_type="leaves"
1346 FT /db_xref="taxon:3899"
1347 FT CDS 14..1495
1348 FT /product="beta-glucosidase"
1349 FT /EC_number="3.2.1.21"
1350 FT /note="non-cyanogenic"
1351 FT /db_xref="GOA:P26204"
1352 FT /db_xref="InterPro:IPR001360"
1353 FT /db_xref="InterPro:IPR013781"
1354 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1355 FT /protein_id="CAA40058.1"
1356 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1357 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1358 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1359 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1360 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1361 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1362 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1363 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1364 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1365 FT mRNA 1..1859
1366 FT /experiment="experimental evidence, no additional details
1367 FT recorded"
1368 XX
1369 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1370 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1371 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1372 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1373 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1374 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1375 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1376 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1377 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1378 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1379 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1380 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1381 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1382 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1383 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1384 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1385 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1386 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1387 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1388 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1389 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1390 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1391 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1392 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1393 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1394 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1395 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1396 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1397 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1398 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1399 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1400 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1401 //
1402 """
1403
# Demonstration driver: push each bundled example record through the scanners
# and print whatever they yield.  Every parse uses a newly constructed scanner
# and its own StringIO handle.  The GenBank runs differ only in their input
# text and keyword options, so they are written as loops over those inputs.
# Output goes through single-argument print(...) calls, which produce the
# same text under the Python 2 print statement and the Python 3 function.

print("GenBank CDS Iteration")
print("=====================")

# Qualifier keys handed to parse_cds_features as tags2id for the second and
# third inputs; the first input relies on the method's own default.
cds_id_tags = ('gene', 'locus_tag', 'product')
genbank_cds_runs = (
    (gbk_example, {}),
    (gbk_example2, {'tags2id': cds_id_tags}),
    # Both examples joined by a newline and read from a single handle.
    (gbk_example + "\n" + gbk_example2, {'tags2id': cds_id_tags}),
)
for text, parse_options in genbank_cds_runs:
    scanner = GenBankScanner()
    for record in scanner.parse_cds_features(StringIO(text), **parse_options):
        print(record)

print("")
print("GenBank Iteration")
print("=================")
for text in (gbk_example, gbk_example2):
    # Each example is parsed twice: first with do_features=False, then True.
    for do_features in (False, True):
        scanner = GenBankScanner()
        for record in scanner.parse_records(StringIO(text),
                                            do_features=do_features):
            print("%s %s %s" % (record.id, record.name, record.description))
            print(record.seq)

print("")
print("EMBL CDS Iteration")
print("==================")
scanner = EmblScanner()
for record in scanner.parse_cds_features(StringIO(embl_example)):
    print(record)

print("")
print("EMBL Iteration")
print("==============")
scanner = EmblScanner()
for record in scanner.parse_records(StringIO(embl_example), do_features=True):
    print("%s %s %s" % (record.id, record.name, record.description))
    print(record.seq)
1459