1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
9
10 You are expected to use this module via the Bio.SeqIO functions.
11 See also the Bio.SwissProt module which offers more than just accessing
12 the sequences as SeqRecord objects.
13
14 See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
15 """
16
17 from Bio import Seq
18 from Bio import SeqRecord
19 from Bio import Alphabet
20 from Bio import SeqFeature
21 from Bio import SwissProt
22
24 """Turn a Swiss location position into a SeqFeature position object (PRIVATE).
25
26 An offset of -1 is used with a start location to make it pythonic.
27 """
28 if location_string=="?":
29 return SeqFeature.UnknownPosition()
30
31 try:
32 return SeqFeature.ExactPosition(max(0, offset+int(location_string)))
33 except ValueError:
34 pass
35 if location_string.startswith("<"):
36 try:
37 return SeqFeature.BeforePosition(max(0,offset+int(location_string[1:])))
38 except ValueError:
39 pass
40 elif location_string.startswith(">"):
41 try:
42 return SeqFeature.AfterPosition(max(0,offset+int(location_string[1:])))
43 except ValueError :
44 pass
45 elif location_string.startswith("?"):
46 try:
47 return SeqFeature.UncertainPosition(max(0,offset+int(location_string[1:])))
48 except ValueError:
49 pass
50 raise NotImplementedError("Cannot parse location '%s'" % location_string)
51
60
61
63 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
64
65 Every section from the ID line to the terminating // becomes
66 a single SeqRecord with associated annotation and features.
67
68 This parser is for the flat file "swiss" format as used by:
69 * Swiss-Prot aka SwissProt
70 * TrEMBL
71 * UniProtKB aka UniProt Knowledgebase
72
73 For consistency with BioPerl and EMBOSS we call this the "swiss"
74 format. See also the SeqIO support for "uniprot-xml" format.
75 """
76 swiss_records = SwissProt.parse(handle)
77 for swiss_record in swiss_records:
78
79 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
80 record = SeqRecord.SeqRecord(seq,
81 id=swiss_record.accessions[0],
82 name=swiss_record.entry_name,
83 description=swiss_record.description,
84 features=[_make_seqfeature(*f) for f \
85 in swiss_record.features],
86 )
87 record.description = swiss_record.description
88 for cross_reference in swiss_record.cross_references:
89 if len(cross_reference) < 2:
90 continue
91 database, accession = cross_reference[:2]
92 dbxref = "%s:%s" % (database, accession)
93 if not dbxref in record.dbxrefs:
94 record.dbxrefs.append(dbxref)
95 annotations = record.annotations
96 annotations['accessions'] = swiss_record.accessions
97 annotations['date'] = swiss_record.created[0]
98 annotations['date_last_sequence_update'] = swiss_record.sequence_update[0]
99 if swiss_record.annotation_update:
100 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
101 if swiss_record.gene_name:
102 annotations['gene_name'] = swiss_record.gene_name
103 annotations['organism'] = swiss_record.organism.rstrip(".")
104 annotations['taxonomy'] = swiss_record.organism_classification
105 annotations['ncbi_taxid'] = swiss_record.taxonomy_id
106 if swiss_record.host_organism:
107 annotations['organism_host'] = swiss_record.host_organism
108 if swiss_record.host_taxonomy_id:
109 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
110 if swiss_record.comments:
111 annotations['comment'] = "\n".join(swiss_record.comments)
112 if swiss_record.references:
113 annotations['references'] = []
114 for reference in swiss_record.references:
115 feature = SeqFeature.Reference()
116 feature.comment = " ".join(["%s=%s;" % (key, value) \
117 for key, value \
118 in reference.comments])
119 for key, value in reference.references:
120 if key == 'PubMed':
121 feature.pubmed_id = value
122 elif key == 'MEDLINE':
123 feature.medline_id = value
124 elif key == 'DOI':
125 pass
126 elif key == 'AGRICOLA':
127 pass
128 else:
129 raise ValueError(\
130 "Unknown key %s found in references" % key)
131 feature.authors = reference.authors
132 feature.title = reference.title
133 feature.journal = reference.location
134 annotations['references'].append(feature)
135 if swiss_record.keywords:
136 record.annotations['keywords'] = swiss_record.keywords
137 yield record
138
139 if __name__ == "__main__":
140 print "Quick self test..."
141
142 example_filename = "../../Tests/SwissProt/sp008"
143
144 import os
145 if not os.path.isfile(example_filename):
146 print "Missing test file %s" % example_filename
147 else:
148
149 handle = open(example_filename)
150 records = SwissIterator(handle)
151 for record in records:
152 print record.name
153 print record.id
154 print record.annotations['keywords']
155 print repr(record.annotations['organism'])
156 print record.seq.tostring()[:20] + "..."
157 for f in record.features: print f
158 handle.close()
159