1
2
3
4
5
6
7 """Bio.SeqIO support for the "genbank" and "embl" file formats.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 Note that internally this module calls Bio.GenBank to do the actual
11 parsing of both GenBank and EMBL files.
12
13 See also:
14
15 International Nucleotide Sequence Database Collaboration
16 http://www.insdc.org/
17
18 GenBank
19 http://www.ncbi.nlm.nih.gov/Genbank/
20
21 EMBL Nucleotide Sequence Database
22 http://www.ebi.ac.uk/embl/
23
24 DDBJ (DNA Data Bank of Japan)
25 http://www.ddbj.nig.ac.jp/
26 """
27
28 from Bio.GenBank.Scanner import GenBankScanner, EmblScanner
29 from Bio import Alphabet
30 from Interfaces import SequentialSequenceWriter
31
32
33
34
35
36
37
39 """Breaks up a Genbank file into SeqRecord objects.
40
41 Every section from the LOCUS line to the terminating // becomes
42 a single SeqRecord with associated annotation and features.
43
44 Note that for genomes or chromosomes, there is typically only
45 one record."""
46
47 return GenBankScanner(debug=0).parse_records(handle)
48
50 """Breaks up an EMBL file into SeqRecord objects.
51
52 Every section from the LOCUS line to the terminating // becomes
53 a single SeqRecord with associated annotation and features.
54
55 Note that for genomes or chromosomes, there is typically only
56 one record."""
57
58 return EmblScanner(debug=0).parse_records(handle)
59
61 """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
62
63 Every section from the LOCUS line to the terminating // can contain
64 many CDS features. These are returned as with the stated amino acid
65 translation sequence (if given).
66 """
67
68 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
69
71 """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
72
73 Every section from the LOCUS line to the terminating // can contain
74 many CDS features. These are returned as with the stated amino acid
75 translation sequence (if given).
76 """
77
78 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
79
81 HEADER_WIDTH = 12
82 MAX_WIDTH = 80
83
91
119
121 """Write the LOCUS line."""
122
123 locus = record.name
124 if not locus or locus == "<unknown name>" :
125 locus = record.id
126 if not locus or locus == "<unknown id>" :
127 locus = self._get_annotation_str(record, "accession", just_first=True)
128 if len(locus) > 16 :
129 raise ValueError("Locus identifier %s is too long" % repr(locus))
130
131 if len(record) > 99999999999 :
132
133
134 raise ValueError("Sequence too long!")
135
136
137 a = Alphabet._get_base_alphabet(record.seq.alphabet)
138 if not isinstance(a, Alphabet.Alphabet) :
139 raise TypeError("Invalid alphabet")
140 elif isinstance(a, Alphabet.ProteinAlphabet) :
141 units = "bp"
142 elif isinstance(a, Alphabet.NucleotideAlphabet) :
143 units = "aa"
144 else :
145
146
147 raise ValueError("Need a Nucleotide or Protein alphabet")
148
149
150
151 if isinstance(a, Alphabet.ProteinAlphabet) :
152 mol_type = ""
153 elif isinstance(a, Alphabet.DNAAlphabet) :
154 mol_type = "DNA"
155 elif isinstance(a, Alphabet.RNAAlphabet) :
156 mol_type = "RNA"
157 else :
158
159
160 raise ValueError("Need a DNA, RNA or Protein alphabet")
161
162 try :
163 division = record.annotations["data_file_division"]
164 except KeyError :
165 division = "UNK"
166 if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT",
167 "VRL","PHG","SYN","UNA","EST","PAT","STS",
168 "GSS","HTG","HTC","ENV"] :
169 division = "UNK"
170
171 assert len(units) == 2
172 assert len(division) == 3
173
174
175 line = "LOCUS %s %s %s %s %s 01-JAN-1980\n" \
176 % (locus.ljust(16),
177 str(len(record)).rjust(11),
178 units,
179 mol_type.ljust(6),
180 division)
181 assert len(line) == 79+1, repr(line)
182
183 assert line[12:28].rstrip() == locus, \
184 'LOCUS line does not contain the locus at the expected position:\n' + line
185 assert line[28:29] == " "
186 assert line[29:40].lstrip() == str(len(record)), \
187 'LOCUS line does not contain the length at the expected position:\n' + line
188
189
190 assert line[40:44] in [' bp ', ' aa '] , \
191 'LOCUS line does not contain size units at expected position:\n' + line
192 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
193 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
194 assert line[47:54].strip() == "" \
195 or line[47:54].strip().find('DNA') != -1 \
196 or line[47:54].strip().find('RNA') != -1, \
197 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
198 assert line[54:55] == ' ', \
199 'LOCUS line does not contain space at position 55:\n' + line
200 assert line[55:63].strip() in ['','linear','circular'], \
201 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
202 assert line[63:64] == ' ', \
203 'LOCUS line does not contain space at position 64:\n' + line
204 assert line[67:68] == ' ', \
205 'LOCUS line does not contain space at position 68:\n' + line
206 assert line[70:71] == '-', \
207 'LOCUS line does not contain - at position 71 in date:\n' + line
208 assert line[74:75] == '-', \
209 'LOCUS line does not contain - at position 75 in date:\n' + line
210
211 self.handle.write(line)
212
214 """Get an annotation dictionary entry (as a string).
215
216 Some entries are lists, in which case if just_first=True the first entry
217 is returned. If just_first=False (default) this verifies there is only
218 one entry before returning it."""
219 try :
220 answer = record.annotations[key]
221 except KeyError :
222 return default
223 if isinstance(answer, list) :
224 if not just_first : assert len(answer) == 1
225 return str(answer[0])
226 else :
227 return str(answer)
228
230
231
232 LETTERS_PER_LINE = 60
233 SEQUENCE_INDENT = 9
234 seq_len = len(record)
235 for line_number in range(0,seq_len,LETTERS_PER_LINE):
236 self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT))
237 for words in range(line_number,min(line_number+LETTERS_PER_LINE,seq_len),10):
238 self.handle.write(" %s" % record.seq[words:words+10])
239 self.handle.write("\n")
240
242 """Write a single record to the output file."""
243 handle = self.handle
244 self._write_the_first_line(record)
245
246 accession = self._get_annotation_str(record, "accession",
247 record.id.split(".",1)[0],
248 just_first=True)
249 acc_with_version = accession
250 if record.id.startswith(accession+".") :
251 try :
252 acc_with_version = "%s.%i" \
253 % (accession, int(record.id.split(".",1)[1]))
254 except ValueError :
255 pass
256 gi = self._get_annotation_str(record, "gi", just_first=True)
257
258 descr = record.description
259 if descr == "<unknown description>" : descr = "."
260 self._write_multi_line("DEFINITION", descr)
261
262 self._write_single_line("ACCESSION", accession)
263 if gi != "." :
264 self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi))
265 else :
266 self._write_single_line("VERSION", "%s" % (acc_with_version))
267
268 try :
269
270 keywords = "; ".join(record.annotations["keywords"])
271 except KeyError :
272 keywords = "."
273 self._write_multi_line("KEYWORDS", keywords)
274
275 self._write_multi_line("SOURCE", \
276 self._get_annotation_str(record, "source"))
277
278 org = self._get_annotation_str(record, "organism")
279 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH :
280 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
281 self._write_single_line(" ORGANISM", org)
282 try :
283
284 taxonomy = "; ".join(record.annotations["taxonomy"])
285 except KeyError :
286 taxonomy = "."
287 self._write_multi_line("", taxonomy)
288
289
290 handle.write("FEATURES Location/Qualifiers\n")
291
292 handle.write("ORIGIN\n")
293 self._write_sequence(record)
294 handle.write("//\n")
295
296 if __name__ == "__main__" :
297 print "Quick self test"
298 import os
299 from StringIO import StringIO
300
302 handle = StringIO()
303 GenBankWriter(handle).write_file(records)
304 handle.seek(0)
305
306 records2 = list(GenBankIterator(handle))
307
308 assert len(records) == len(records2)
309 for r1, r2 in zip(records, records2) :
310
311 assert r1.description.replace("\n", " ") == r2.description
312 assert r1.id == r2.id
313 assert r1.name == r2.name
314 assert str(r1.seq) == str(r2.seq)
315 for key in ["gi", "keywords", "source", "taxonomy"] :
316 if key in r1.annotations :
317 assert r1.annotations[key] == r2.annotations[key], key
318 for key in ["organism"] :
319 if key in r1.annotations :
320 v1 = r1.annotations[key]
321 v2 = r2.annotations[key]
322 assert isinstance(v1, str) and isinstance(v2, str)
323
324 assert v1 == v2 or \
325 (v2.endswith("...") and v1.startswith(v2[:-3])), key
326
327 for filename in os.listdir("../../Tests/GenBank") :
328 if not filename.endswith(".gbk") and not filename.endswith(".gb") :
329 continue
330 print filename
331
332 handle = open("../../Tests/GenBank/%s" % filename)
333 records = list(GenBankIterator(handle))
334 handle.close()
335
336 check_genbank_writer(records)
337
338 for filename in os.listdir("../../Tests/EMBL") :
339 if not filename.endswith(".embl") :
340 continue
341 print filename
342
343 handle = open("../../Tests/EMBL/%s" % filename)
344 records = list(EmblIterator(handle))
345 handle.close()
346
347 check_genbank_writer(records)
348
349 from Bio import SeqIO
350 for filename in os.listdir("../../Tests/SwissProt") :
351 if not filename.startswith("sp") :
352 continue
353 print filename
354
355 handle = open("../../Tests/SwissProt/%s" % filename)
356 records = list(SeqIO.parse(handle,"swiss"))
357 handle.close()
358
359 check_genbank_writer(records)
360