1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 import warnings
29 import os
30 import re
31 from Bio.Seq import Seq
32 from Bio.SeqRecord import SeqRecord
33 from Bio.Alphabet import generic_alphabet, generic_protein
34
36 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
37
38 The International Nucleotide Sequence Database Collaboration (INSDC)
39 between the DDBJ, EMBL, and GenBank. These organisations all use the
40 same "Feature Table" layout in their plain text flat file formats.
41
42 However, the header and sequence sections of an EMBL file are very
43 different in layout to those produced by GenBank/DDBJ."""
44
45
46 RECORD_START = "XXX"
47 HEADER_WIDTH = 3
48 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
49 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
50 FEATURE_QUALIFIER_INDENT = 0
51 FEATURE_QUALIFIER_SPACER = ""
52 SEQUENCE_HEADERS=["XXX"]
53
61
def find_start(self):
    """Read in lines until find the ID/LOCUS line, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as are blank lines and "//" end-of-record markers.

    A line previously stashed in ``self.line`` is examined before anything
    new is read from ``self.handle``.

    Returns the record start line (also stored in ``self.line``), or None
    if the end of the file is reached without finding one.
    """
    while True:
        if self.line:
            # Re-examine the line left over from the previous call first.
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        line = line.rstrip()
        if line == "//":
            if self.debug > 1:
                print("Skipping // marking end of last record")
        elif line == "":
            if self.debug > 1:
                print("Skipping blank line before record")
        else:
            if self.debug > 1:
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
def parse_header(self):
    """Return list of strings making up the header.

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line, i.e. ``self.line``
    still starts with ``self.RECORD_START``.  Lines are read from
    ``self.handle`` until a feature table marker or a sequence header is
    reached; that terminating line (right-stripped) is left in
    ``self.line`` and is not part of the returned list.

    Raises ValueError if the file ends, or a "//" record terminator is
    found, before the header is complete.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of file while reading the header")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found feature table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                         IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                         EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    A qualifier without "=" (a flag such as /pseudo) gets the value None and
    a qualifier with nothing after "=" gets the empty string.

    Note that no whitespace is removed.

    Raises ValueError if the lines run out while a location or a quoted
    value is still incomplete.
    """
    # Empty strings carry no information, so drop them up front.
    iterator = iter([entry for entry in lines if entry])
    try:
        line = next(iterator)

        # A location ending in a comma continues on the next line.
        feature_location = line.strip()
        while feature_location[-1:] == ",":
            line = next(iterator)
            feature_location += line.strip()

        qualifiers = []

        for line in iterator:
            if line[0] == "/":
                # New qualifier
                i = line.find("=")
                if i == -1:
                    # Qualifier with no key=value, e.g. /pseudo
                    key = line[1:]
                    qualifiers.append((key, None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if not value:
                    # Nothing after the "=", e.g. /note= ; indexing
                    # value[0] here would raise IndexError.
                    qualifiers.append((key, ""))
                elif value[0] == '"':
                    if value != '"':
                        # Quoted; keep reading until the closing quote.
                        while value[-1] != '"':
                            value += "\n" + next(iterator)
                    elif self.debug:
                        # Just an opening quote; the text follows on the
                        # continuation lines handled below.
                        print("Quoted line %s:%s" % (key, value))
                    # DO NOT remove the quotes
                    qualifiers.append((key, value))
                else:
                    # Unquoted value, may be continued by later lines.
                    qualifiers.append((key, value))
            else:
                # Unquoted continuation of the previous qualifier.
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines in the middle of a location or quoted value.
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
295
316
318 """Handle the LOCUS/ID line, passing data to the comsumer
319
320 This should be implemented by the EMBL / GenBank specific subclass
321
322 Used by the parse_records() and parse() methods.
323 """
324 pass
325
327 """Handle the header lines (list of strings), passing data to the comsumer
328
329 This should be implemented by the EMBL / GenBank specific subclass
330
331 Used by the parse_records() and parse() methods.
332 """
333 pass
334
335
349
351 """Handle any lines between features and sequence (list of strings), passing data to the consumer
352
353 This should be implemented by the EMBL / GenBank specific subclass
354
355 Used by the parse_records() and parse() methods.
356 """
357 pass
358
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is set to None)
    """
    self.set_handle(handle)
    if self.find_start():
        # ID/LOCUS line first, then the rest of the header.
        self._feed_first_line(consumer, self.line)
        header_lines = self.parse_header()
        self._feed_header_lines(consumer, header_lines)

        # The feature table is always read, but only passed on if wanted.
        if not do_features:
            self.parse_features(skip=True)
        else:
            feature_tuples = self.parse_features(skip=False)
            self._feed_feature_table(consumer, feature_tuples)

        # Anything between the features and the sequence, then the sequence.
        footer = self.parse_footer()
        misc_lines, sequence_string = footer
        self._feed_misc_lines(consumer, misc_lines)
        consumer.sequence(sequence_string)

        consumer.record_end("//")

        # The footer parser should have stopped on the record terminator.
        assert self.line == "//"
        return True

    # No record start line was found.
    consumer.data = None
    return False
408
409 - def parse(self, handle, do_features=True):
424
425
427 """Returns a SeqRecord object iterator
428
429 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
430
431 The SeqRecord objects include SeqFeatures if do_features=True
432
433 This method is intended for use in Bio.SeqIO
434 """
435
436 while True:
437 record = self.parse(handle, do_features)
438 if record is None : break
439 assert record.id is not None
440 assert record.name != "<unknown name>"
441 assert record.description != "<unknown description>"
442 yield record
443
447 """Returns SeqRecord object iterator
448
449 Each CDS feature becomes a SeqRecord.
450
451 alphabet - Used for any sequence found in a translation field.
452 tags2id - Tupple of three strings, the feature keys to use
453 for the record id, name and description,
454
455 This method is intended for use in Bio.SeqIO
456 """
457 self.set_handle(handle)
458 while self.find_start():
459
460 self.parse_header()
461 feature_tuples = self.parse_features()
462
463 while True:
464 line = self.handle.readline()
465 if not line : break
466 if line[:2]=="//" : break
467 self.line = line.rstrip()
468
469
470 for key, location_string, qualifiers in feature_tuples:
471 if key=="CDS":
472
473
474
475
476
477 record = SeqRecord(seq=None)
478 annotations = record.annotations
479
480
481
482
483 annotations['raw_location'] = location_string.replace(' ','')
484
485 for (qualifier_name, qualifier_data) in qualifiers:
486 if qualifier_data is not None \
487 and qualifier_data[0]=='"' and qualifier_data[-1]=='"':
488
489 qualifier_data = qualifier_data[1:-1]
490
491 if qualifier_name == "translation":
492 assert record.seq is None, "Multiple translations!"
493 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
494 elif qualifier_name == "db_xref":
495
496 record.dbxrefs.append(qualifier_data)
497 else:
498 if qualifier_data is not None:
499 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
500 try:
501 annotations[qualifier_name] += " " + qualifier_data
502 except KeyError:
503
504 annotations[qualifier_name]= qualifier_data
505
506
507
508 try:
509 record.id = annotations[tags2id[0]]
510 except KeyError:
511 pass
512 try:
513 record.name = annotations[tags2id[1]]
514 except KeyError:
515 pass
516 try:
517 record.description = annotations[tags2id[2]]
518 except KeyError:
519 pass
520
521 yield record
522
523
525 """For extracting chunks of information in EMBL files"""
526
527 RECORD_START = "ID "
528 HEADER_WIDTH = 5
529 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"]
530 FEATURE_END_MARKERS = ["XX"]
531 FEATURE_QUALIFIER_INDENT = 21
532 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
533 SEQUENCE_HEADERS=["SQ", "CO"]
534
569
580
582
583
584
585 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
586 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
587 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
588 fields = [entry.strip() for entry in fields]
589 """
590 The tokens represent:
591 0. Primary accession number
592 (space sep)
593 1. ??? (e.g. standard)
594 (semi-colon)
595 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
596 3. Taxonomic division (e.g. 'PRO')
597 4. Sequence length (e.g. '4639675 BP.')
598 """
599 consumer.locus(fields[0])
600 consumer.residue_type(fields[2])
601 consumer.data_file_division(fields[3])
602 self._feed_seq_length(consumer, fields[4])
603
605
606
607
608 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
609 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
610 assert len(fields) == 7
611 """
612 The tokens represent:
613 0. Primary accession number
614 1. Sequence version number
615 2. Topology: 'circular' or 'linear'
616 3. Molecule type (e.g. 'genomic DNA')
617 4. Data class (e.g. 'STD')
618 5. Taxonomic division (e.g. 'PRO')
619 6. Sequence length (e.g. '4639675 BP.')
620 """
621
622 consumer.locus(fields[0])
623
624
625
626 consumer.accession(fields[0])
627
628
629
630 version_parts = fields[1].split()
631 if len(version_parts)==2 \
632 and version_parts[0]=="SV" \
633 and version_parts[1].isdigit():
634 consumer.version_suffix(version_parts[1])
635
636
637 consumer.residue_type(" ".join(fields[2:4]))
638
639
640
641 consumer.data_file_division(fields[5])
642
643 self._feed_seq_length(consumer, fields[6])
644
646 length_parts = text.split()
647 assert len(length_parts) == 2
648 assert length_parts[1].upper() in ["BP", "BP.", "AA."]
649 consumer.size(length_parts[0])
650
652 EMBL_INDENT = self.HEADER_WIDTH
653 EMBL_SPACER = " " * EMBL_INDENT
654 consumer_dict = {
655 'AC' : 'accession',
656 'SV' : 'version',
657 'DE' : 'definition',
658
659
660
661
662 'RG' : 'consrtm',
663
664
665 'RL' : 'journal',
666 'OS' : 'organism',
667 'OC' : 'taxonomy',
668
669 'CC' : 'comment',
670
671 }
672
673
674 for line in lines:
675 line_type = line[:EMBL_INDENT].strip()
676 data = line[EMBL_INDENT:].strip()
677 if line_type == 'XX':
678 pass
679 elif line_type == 'RN':
680
681
682 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
683 consumer.reference_num(data)
684 elif line_type == 'RP':
685
686
687
688 parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
689 consumer.reference_bases("(bases %s)" % "; ".join(parts))
690 elif line_type == 'RT':
691
692
693 if data.startswith('"'):
694 data = data[1:]
695 if data.endswith('";'):
696 data = data[:-2]
697 consumer.title(data)
698 elif line_type == 'RX':
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714 key, value = data.split(";",1)
715 if value.endswith(".") : value = value[:-1]
716 value = value.strip()
717 if key == "PUBMED":
718 consumer.pubmed_id(value)
719
720 elif line_type == 'CC':
721
722 consumer.comment([data])
723 elif line_type == 'DR':
724
725
726
727
728
729
730
731 parts = data.rstrip(".").split(";")
732
733
734 consumer.dblink("%s:%s" % (parts[0].strip(),
735 parts[1].strip()))
736 elif line_type == 'RA':
737
738 consumer.authors(data.rstrip(";"))
739 elif line_type == 'PR':
740
741
742
743 consumer.project(data.rstrip(";"))
744 elif line_type in consumer_dict:
745
746 getattr(consumer, consumer_dict[line_type])(data)
747 else:
748 if self.debug:
749 print "Ignoring EMBL header line:\n%s" % line
750
752
753 lines.append("")
754 line_iter = iter(lines)
755 try:
756 for line in line_iter:
757 if line.startswith("CO "):
758 line = line[5:].strip()
759 contig_location = line
760 while True:
761 line = line_iter.next()
762 if not line:
763 break
764 elif line.startswith("CO "):
765
766 contig_location += line[5:].strip()
767 else:
768 raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
769 consumer.contig_location(contig_location)
770 return
771 except StopIteration:
772 raise ValueError("Problem in misc lines before sequence")
773
774
776 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
777
778 IMGT files are like EMBL files but in order to allow longer feature types
779 the features should be indented by 25 characters not 21 characters. In
780 practice the IMGT flat files tend to use either 21 or 25 characters, so we
781 must cope with both.
782
783 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
784 """
785
786 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers",
787 "FH Key Location/Qualifiers (from EMBL)",
788 "FH Key Location/Qualifiers",
789 "FH"]
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.
    If self.line is not one of the feature table markers an empty list is
    returned.  With skip=True the feature lines are read but not parsed,
    so the returned list is empty.

    On return self.line holds the first line after the feature table.

    Raises ValueError if the file ends, or a "//" record terminator is
    found, inside the feature table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Step over the (possibly repeated) feature table header lines.
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    # Matches a position written with the ">" after the number.
    bad_position_re = re.compile(r'([0-9]+)>{1}')

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "":
            # No feature key on this line and we are not inside a feature,
            # so there is nothing to record - move on.
            line = self.handle.readline()
            continue

        if skip:
            # Read past this feature's qualifier lines without parsing them.
            line = self.handle.readline()
            while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER:
                line = self.handle.readline()
        else:
            assert line[:2] == "FT"
            try:
                feature_key, location_start = line[2:].strip().split()
            except ValueError:
                # Key and location are not separated by white space (or
                # there are extra fields), so fall back on fixed columns
                # with the location starting at offset 25.
                feature_key = line[2:25].strip()
                location_start = line[25:].strip()
            feature_lines = [location_start]
            line = self.handle.readline()
            # "line and" stops at end of file, so that the check at the
            # top of the outer loop reports it as a ValueError rather
            # than the assertion below failing on an empty string.
            while line and (
                    line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER
                    or line.rstrip() == ""):
                # strip() removes trailing white space and any extra
                # leading indentation beyond FEATURE_QUALIFIER_INDENT.
                assert line[:2] == "FT"
                feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip())
                line = self.handle.readline()
            feature_key, location, qualifiers = \
                self.parse_feature(feature_key, feature_lines)
            if ">" in location:
                # Rewrite positions given as 123> into the >123 form.
                location = bad_position_re.sub(r'>\1', location)
            features.append((feature_key, location, qualifiers))
    self.line = line
    return features
868
870 """For extracting chunks of information in GenBank files"""
871
872 RECORD_START = "LOCUS "
873 HEADER_WIDTH = 12
874 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"]
875 FEATURE_END_MARKERS = []
876 FEATURE_QUALIFIER_INDENT = 21
877 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
878 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
879
923
925
926
927
928 GENBANK_INDENT = self.HEADER_WIDTH
929 GENBANK_SPACER = " "*GENBANK_INDENT
930 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
931 'LOCUS line does not start correctly:\n' + line
932
933
934
935 if line[29:33] in [' bp ', ' aa ',' rc ']:
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954 assert line[29:33] in [' bp ', ' aa ',' rc '] , \
955 'LOCUS line does not contain size units at expected position:\n' + line
956 assert line[41:42] == ' ', \
957 'LOCUS line does not contain space at position 42:\n' + line
958 assert line[42:51].strip() in ['','linear','circular'], \
959 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
960 assert line[51:52] == ' ', \
961 'LOCUS line does not contain space at position 52:\n' + line
962 assert line[55:62] == ' ', \
963 'LOCUS line does not contain spaces from position 56 to 62:\n' + line
964 if line[62:73].strip():
965 assert line[64:65] == '-', \
966 'LOCUS line does not contain - at position 65 in date:\n' + line
967 assert line[68:69] == '-', \
968 'LOCUS line does not contain - at position 69 in date:\n' + line
969
970 name_and_length_str = line[GENBANK_INDENT:29]
971 while name_and_length_str.find(' ')!=-1:
972 name_and_length_str = name_and_length_str.replace(' ',' ')
973 name_and_length = name_and_length_str.split(' ')
974 assert len(name_and_length)<=2, \
975 'Cannot parse the name and length in the LOCUS line:\n' + line
976 assert len(name_and_length)!=1, \
977 'Name and length collide in the LOCUS line:\n' + line
978
979
980
981 consumer.locus(name_and_length[0])
982 consumer.size(name_and_length[1])
983
984
985 if line[33:51].strip() == "" and line[29:33] == ' aa ':
986
987
988
989
990 consumer.residue_type("PROTEIN")
991 else:
992 consumer.residue_type(line[33:51].strip())
993
994 consumer.data_file_division(line[52:55])
995 if line[62:73].strip():
996 consumer.date(line[62:73])
997 elif line[40:44] in [' bp ', ' aa ',' rc ']:
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
1018 'LOCUS line does not contain size units at expected position:\n' + line
1019 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
1020 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
1021 assert line[47:54].strip() == "" \
1022 or line[47:54].strip().find('DNA') != -1 \
1023 or line[47:54].strip().find('RNA') != -1, \
1024 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
1025 assert line[54:55] == ' ', \
1026 'LOCUS line does not contain space at position 55:\n' + line
1027 assert line[55:63].strip() in ['','linear','circular'], \
1028 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
1029 assert line[63:64] == ' ', \
1030 'LOCUS line does not contain space at position 64:\n' + line
1031 assert line[67:68] == ' ', \
1032 'LOCUS line does not contain space at position 68:\n' + line
1033 if line[68:79].strip():
1034 assert line[70:71] == '-', \
1035 'LOCUS line does not contain - at position 71 in date:\n' + line
1036 assert line[74:75] == '-', \
1037 'LOCUS line does not contain - at position 75 in date:\n' + line
1038
1039 name_and_length_str = line[GENBANK_INDENT:40]
1040 while name_and_length_str.find(' ')!=-1:
1041 name_and_length_str = name_and_length_str.replace(' ',' ')
1042 name_and_length = name_and_length_str.split(' ')
1043 assert len(name_and_length)<=2, \
1044 'Cannot parse the name and length in the LOCUS line:\n' + line
1045 assert len(name_and_length)!=1, \
1046 'Name and length collide in the LOCUS line:\n' + line
1047
1048
1049
1050 consumer.locus(name_and_length[0])
1051 consumer.size(name_and_length[1])
1052
1053 if line[44:54].strip() == "" and line[40:44] == ' aa ':
1054
1055
1056
1057
1058 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
1059 else:
1060 consumer.residue_type(line[44:63].strip())
1061
1062 consumer.data_file_division(line[64:67])
1063 if line[68:79].strip():
1064 consumer.date(line[68:79])
1065 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081 if line[GENBANK_INDENT:].strip() != "":
1082 consumer.locus(line[GENBANK_INDENT:].strip())
1083 else:
1084
1085
1086 warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
1087 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]:
1088
1089
1090 consumer.locus(line.split()[1])
1091 consumer.size(line.split()[2])
1092 warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
1093 else:
1094 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1095
1096
1098
1099
1100
1101
1102 GENBANK_INDENT = self.HEADER_WIDTH
1103 GENBANK_SPACER = " "*GENBANK_INDENT
1104 consumer_dict = {
1105 'DEFINITION' : 'definition',
1106 'ACCESSION' : 'accession',
1107 'NID' : 'nid',
1108 'PID' : 'pid',
1109 'DBSOURCE' : 'db_source',
1110 'KEYWORDS' : 'keywords',
1111 'SEGMENT' : 'segment',
1112 'SOURCE' : 'source',
1113 'AUTHORS' : 'authors',
1114 'CONSRTM' : 'consrtm',
1115 'PROJECT' : 'project',
1116 'DBLINK' : 'dblink',
1117 'TITLE' : 'title',
1118 'JOURNAL' : 'journal',
1119 'MEDLINE' : 'medline_id',
1120 'PUBMED' : 'pubmed_id',
1121 'REMARK' : 'remark'}
1122
1123
1124
1125
1126
1127
1128 lines = filter(None,lines)
1129 lines.append("")
1130 line_iter = iter(lines)
1131 try:
1132 line = line_iter.next()
1133 while True:
1134 if not line : break
1135 line_type = line[:GENBANK_INDENT].strip()
1136 data = line[GENBANK_INDENT:].strip()
1137
1138 if line_type == 'VERSION':
1139
1140
1141
1142 while data.find(' ')!=-1:
1143 data = data.replace(' ',' ')
1144 if data.find(' GI:')==-1:
1145 consumer.version(data)
1146 else:
1147 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
1148 consumer.version(data.split(' GI:')[0])
1149 consumer.gi(data.split(' GI:')[1])
1150
1151 line = line_iter.next()
1152 elif line_type == 'REFERENCE':
1153 if self.debug >1 : print "Found reference [" + data + "]"
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164 data = data.strip()
1165
1166
1167 while True:
1168 line = line_iter.next()
1169 if line[:GENBANK_INDENT] == GENBANK_SPACER:
1170
1171 data += " " + line[GENBANK_INDENT:]
1172 if self.debug >1 : print "Extended reference text [" + data + "]"
1173 else:
1174
1175 break
1176
1177
1178
1179 while data.find(' ')!=-1:
1180 data = data.replace(' ',' ')
1181 if data.find(' ')==-1:
1182 if self.debug >2 : print 'Reference number \"' + data + '\"'
1183 consumer.reference_num(data)
1184 else:
1185 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
1186 consumer.reference_num(data[:data.find(' ')])
1187 consumer.reference_bases(data[data.find(' ')+1:])
1188 elif line_type == 'ORGANISM':
1189
1190
1191
1192
1193
1194
1195
1196
1197 organism_data = data
1198 lineage_data = ""
1199 while True:
1200 line = line_iter.next()
1201 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1202 if lineage_data or ";" in line:
1203 lineage_data += " " + line[GENBANK_INDENT:]
1204 else:
1205 organism_data += " " + line[GENBANK_INDENT:].strip()
1206 else:
1207
1208 break
1209 consumer.organism(organism_data)
1210 if lineage_data.strip() == "" and self.debug > 1:
1211 print "Taxonomy line(s) missing or blank"
1212 consumer.taxonomy(lineage_data.strip())
1213 del organism_data, lineage_data
1214 elif line_type == 'COMMENT':
1215 if self.debug > 1 : print "Found comment"
1216
1217
1218 comment_list=[]
1219 comment_list.append(data)
1220 while True:
1221 line = line_iter.next()
1222 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1223 data = line[GENBANK_INDENT:]
1224 comment_list.append(data)
1225 if self.debug > 2 : print "Comment continuation [" + data + "]"
1226 else:
1227
1228 break
1229 consumer.comment(comment_list)
1230 del comment_list
1231 elif line_type in consumer_dict:
1232
1233
1234 while True:
1235 line = line_iter.next()
1236 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1237 data += ' ' + line[GENBANK_INDENT:]
1238 else:
1239
1240 getattr(consumer, consumer_dict[line_type])(data)
1241
1242 break
1243 else:
1244 if self.debug:
1245 print "Ignoring GenBank header line:\n" % line
1246
1247 line = line_iter.next()
1248 except StopIteration:
1249 raise ValueError("Problem in header")
1250
1291
1292 if __name__ == "__main__":
1293 from StringIO import StringIO
1294
1295 gbk_example = \
1296 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1297 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1298 (AXL2) and Rev7p (REV7) genes, complete cds.
1299 ACCESSION U49845
1300 VERSION U49845.1 GI:1293613
1301 KEYWORDS .
1302 SOURCE Saccharomyces cerevisiae (baker's yeast)
1303 ORGANISM Saccharomyces cerevisiae
1304 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1305 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1306 REFERENCE 1 (bases 1 to 5028)
1307 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1308 TITLE Cloning and sequence of REV7, a gene whose function is required for
1309 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1310 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1311 PUBMED 7871890
1312 REFERENCE 2 (bases 1 to 5028)
1313 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1314 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1315 plasma membrane glycoprotein
1316 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1317 PUBMED 8846915
1318 REFERENCE 3 (bases 1 to 5028)
1319 AUTHORS Roemer,T.
1320 TITLE Direct Submission
1321 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1322 Haven, CT, USA
1323 FEATURES Location/Qualifiers
1324 source 1..5028
1325 /organism="Saccharomyces cerevisiae"
1326 /db_xref="taxon:4932"
1327 /chromosome="IX"
1328 /map="9"
1329 CDS <1..206
1330 /codon_start=3
1331 /product="TCP1-beta"
1332 /protein_id="AAA98665.1"
1333 /db_xref="GI:1293614"
1334 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1335 AEVLLRVDNIIRARPRTANRQHM"
1336 gene 687..3158
1337 /gene="AXL2"
1338 CDS 687..3158
1339 /gene="AXL2"
1340 /note="plasma membrane glycoprotein"
1341 /codon_start=1
1342 /function="required for axial budding pattern of S.
1343 cerevisiae"
1344 /product="Axl2p"
1345 /protein_id="AAA98666.1"
1346 /db_xref="GI:1293615"
1347 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1348 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1349 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1350 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1351 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1352 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1353 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1354 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1355 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1356 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1357 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1358 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1359 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1360 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1361 VDFSNKSNVNVGQVKDIHGRIPEML"
1362 gene complement(3300..4037)
1363 /gene="REV7"
1364 CDS complement(3300..4037)
1365 /gene="REV7"
1366 /codon_start=1
1367 /product="Rev7p"
1368 /protein_id="AAA98667.1"
1369 /db_xref="GI:1293616"
1370 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1371 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1372 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1373 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1374 LISGDDKILNGVYSQYEEGESIFGSLF"
1375 ORIGIN
1376 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1377 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1378 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1379 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1380 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1381 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1382 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1383 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1384 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1385 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1386 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1387 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1388 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1389 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1390 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1391 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1392 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1393 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1394 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1395 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1396 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1397 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1398 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1399 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1400 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1401 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1402 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1403 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1404 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1405 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1406 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1407 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1408 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1409 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1410 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1411 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1412 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1413 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1414 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1415 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1416 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1417 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1418 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1419 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1420 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1421 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1422 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1423 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1424 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1425 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1426 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1427 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1428 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1429 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1430 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1431 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1432 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1433 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1434 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1435 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1436 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1437 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1438 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1439 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1440 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1441 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1442 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1443 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1444 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1445 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1446 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1447 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1448 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1449 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1450 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1451 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1452 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1453 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1454 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1455 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1456 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1457 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1458 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1459 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1460 //"""
1461
1462
1463
# GenBank-format protein record (LOCUS AAD51968, 143 aa) kept inline as
# input for the GenBankScanner demo runs at the end of this file.
# NOTE(review): treat the text as fixture data and do not reflow it; the
# scanners appear to slice lines by fixed widths (e.g. HEADER_WIDTH).
gbk_example2 = \
"""LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
ACCESSION AAD51968
VERSION AAD51968.1 GI:5805369
DBSOURCE locus AF171097 accession AF171097.1
KEYWORDS .
SOURCE Yersinia enterocolitica
ORGANISM Yersinia enterocolitica
Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
Enterobacteriaceae; Yersinia.
REFERENCE 1 (residues 1 to 143)
AUTHORS Revell,P.A. and Miller,V.L.
TITLE A chromosomally encoded regulator is required for expression of the
Yersinia enterocolitica inv gene and for virulence
JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
MEDLINE 20138369
PUBMED 10672189
REFERENCE 2 (residues 1 to 143)
AUTHORS Revell,P.A. and Miller,V.L.
TITLE Direct Submission
JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
University School of Medicine, Campus Box 8230, 660 South Euclid,
St. Louis, MO 63110, USA
COMMENT Method: conceptual translation.
FEATURES Location/Qualifiers
source 1..143
/organism="Yersinia enterocolitica"
/mol_type="unassigned DNA"
/strain="JB580v"
/serotype="O:8"
/db_xref="taxon:630"
Protein 1..143
/product="transcriptional regulator RovA"
/name="regulates inv expression"
CDS 1..143
/gene="rovA"
/coded_by="AF171097.1:380..811"
/note="regulator of virulence"
/transl_table=11
ORIGIN
1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
121 deiellsgli dklerniiql qsk
//
"""
1510
# EMBL-format flat-file record (ID X56734, an 1859 BP mRNA entry) kept
# inline as input for the EmblScanner demo runs at the end of this file.
# NOTE(review): treat the text as fixture data and do not reflow it.
embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
XX
AC X56734; S46826;
XX
DT 12-SEP-1991 (Rel. 29, Created)
DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
XX
DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
XX
KW beta-glucosidase.
XX
OS Trifolium repens (white clover)
OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
XX
RN [5]
RP 1-1859
RX PUBMED; 1907511.
RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
RT "Nucleotide and derived amino acid sequence of the cyanogenic
RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
RL Plant Mol. Biol. 17(2):209-219(1991).
XX
RN [6]
RP 1-1859
RA Hughes M.A.;
RT ;
RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
RL Upon Tyne, NE2 4HH, UK
XX
FH Key Location/Qualifiers
FH
FT source 1..1859
FT /organism="Trifolium repens"
FT /mol_type="mRNA"
FT /clone_lib="lambda gt10"
FT /clone="TRE361"
FT /tissue_type="leaves"
FT /db_xref="taxon:3899"
FT CDS 14..1495
FT /product="beta-glucosidase"
FT /EC_number="3.2.1.21"
FT /note="non-cyanogenic"
FT /db_xref="GOA:P26204"
FT /db_xref="InterPro:IPR001360"
FT /db_xref="InterPro:IPR013781"
FT /db_xref="UniProtKB/Swiss-Prot:P26204"
FT /protein_id="CAA40058.1"
FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
FT mRNA 1..1859
FT /experiment="experimental evidence, no additional details
FT recorded"
XX
SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
//
"""
1608
# Self-test / demo driver: run each scanner over the embedded example
# records and print whatever it yields.
#
# print is always called with exactly one parenthesised argument, so the
# output is identical under the Python 2 print statement and the Python 3
# print function (the old multi-argument statement form does not parse on
# Python 3).

print("GenBank CDS Iteration")
print("=====================")

# Qualifier keys passed to parse_cds_features through tags2id.
cds_id_tags = ('gene', 'locus_tag', 'product')

# First run passes no tags2id, so the method's own default applies.
g = GenBankScanner()
for record in g.parse_cds_features(StringIO(gbk_example)):
    print(record)

# gbk_example2 on its own, then both examples joined by a newline so that
# more than one record is read from a single handle.  A fresh scanner is
# created for every handle.
for text in (gbk_example2, gbk_example + "\n" + gbk_example2):
    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(text), tags2id=cds_id_tags):
        print(record)

print("")
print("GenBank Iteration")
print("=================")
# Each GenBank example is parsed twice: first with do_features=False,
# then with do_features=True.
for text in (gbk_example, gbk_example2):
    for with_features in (False, True):
        g = GenBankScanner()
        for record in g.parse_records(StringIO(text),
                                      do_features=with_features):
            print("%s %s %s" % (record.id, record.name, record.description))
            print(record.seq)

print("")
print("EMBL CDS Iteration")
print("==================")

e = EmblScanner()
for record in e.parse_cds_features(StringIO(embl_example)):
    print(record)

print("")
print("EMBL Iteration")
print("==============")
e = EmblScanner()
for record in e.parse_records(StringIO(embl_example), do_features=True):
    print("%s %s %s" % (record.id, record.name, record.description))
    print(record.seq)
1664