Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

  1  # Copyright 2007, 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package.. 
  6   
  7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
  8   
  9  You are expected to use this module via the Bio.SeqIO functions. 
 10  Note that internally this module calls Bio.GenBank to do the actual 
 11  parsing of both GenBank and EMBL files. 
 12   
 13  See also: 
 14   
 15  International Nucleotide Sequence Database Collaboration 
 16  http://www.insdc.org/ 
 17    
 18  GenBank 
 19  http://www.ncbi.nlm.nih.gov/Genbank/ 
 20   
 21  EMBL Nucleotide Sequence Database 
 22  http://www.ebi.ac.uk/embl/ 
 23   
 24  DDBJ (DNA Data Bank of Japan) 
 25  http://www.ddbj.nig.ac.jp/ 
 26  """ 
 27   
 28  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner 
 29  from Bio import Alphabet 
 30  from Interfaces import SequentialSequenceWriter 
 31   
 32  # NOTE 
 33  # ==== 
 34  # The "brains" for parsing GenBank and EMBL files (and any 
 35  # other flat file variants from the INSDC in future) is in 
 36  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
 37   
38 -def GenBankIterator(handle) :
39 """Breaks up a Genbank file into SeqRecord objects. 40 41 Every section from the LOCUS line to the terminating // becomes 42 a single SeqRecord with associated annotation and features. 43 44 Note that for genomes or chromosomes, there is typically only 45 one record.""" 46 #This calls a generator function: 47 return GenBankScanner(debug=0).parse_records(handle)
48
49 -def EmblIterator(handle) :
50 """Breaks up an EMBL file into SeqRecord objects. 51 52 Every section from the LOCUS line to the terminating // becomes 53 a single SeqRecord with associated annotation and features. 54 55 Note that for genomes or chromosomes, there is typically only 56 one record.""" 57 #This calls a generator function: 58 return EmblScanner(debug=0).parse_records(handle)
59
60 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) :
61 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 62 63 Every section from the LOCUS line to the terminating // can contain 64 many CDS features. These are returned as with the stated amino acid 65 translation sequence (if given). 66 """ 67 #This calls a generator function: 68 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
69
70 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) :
71 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 72 73 Every section from the LOCUS line to the terminating // can contain 74 many CDS features. These are returned as with the stated amino acid 75 translation sequence (if given). 76 """ 77 #This calls a generator function: 78 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
79
80 -class GenBankWriter(SequentialSequenceWriter) :
81 HEADER_WIDTH = 12 82 MAX_WIDTH = 80 83
84 - def _write_single_line(self, tag, text) :
85 "Used in the the 'header' of each GenBank record.""" 86 assert len(tag) < self.HEADER_WIDTH 87 assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \ 88 "Annotation %s too long for %s line" % (repr(text), tag) 89 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 90 text.replace("\n"," ")))
91
92 - def _write_multi_line(self, tag, text) :
93 "Used in the the 'header' of each GenBank record.""" 94 #TODO - Do the line spliting while preserving white space? 95 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 96 assert len(tag) < self.HEADER_WIDTH 97 text = text.strip() 98 if len(text) < max_len : 99 self._write_single_line(tag, text) 100 return 101 102 words = text.split() 103 assert max([len(w) for w in words]) < max_len, \ 104 "Your description cannot be broken into nice lines!" 105 text = "" 106 while words and len(text) + 1 + len(words[0]) < max_len : 107 text += " " + words.pop(0) 108 text = text.strip() 109 assert len(text) < max_len 110 self._write_single_line(tag, text) 111 while words : 112 text = "" 113 while words and len(text) + 1 + len(words[0]) < max_len : 114 text += " " + words.pop(0) 115 text = text.strip() 116 assert len(text) < max_len 117 self._write_single_line("", text) 118 assert not words
119
120 - def _write_the_first_line(self, record) :
121 """Write the LOCUS line.""" 122 123 locus = record.name 124 if not locus or locus == "<unknown name>" : 125 locus = record.id 126 if not locus or locus == "<unknown id>" : 127 locus = self._get_annotation_str(record, "accession", just_first=True) 128 if len(locus) > 16 : 129 raise ValueError("Locus identifier %s is too long" % repr(locus)) 130 131 if len(record) > 99999999999 : 132 #Currently GenBank only officially support up to 350000, but 133 #the length field can take eleven digits 134 raise ValueError("Sequence too long!") 135 136 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 137 a = Alphabet._get_base_alphabet(record.seq.alphabet) 138 if not isinstance(a, Alphabet.Alphabet) : 139 raise TypeError("Invalid alphabet") 140 elif isinstance(a, Alphabet.ProteinAlphabet) : 141 units = "bp" 142 elif isinstance(a, Alphabet.NucleotideAlphabet) : 143 units = "aa" 144 else : 145 #Must be something like NucleotideAlphabet or 146 #just the generic Alphabet (default for fasta files) 147 raise ValueError("Need a Nucleotide or Protein alphabet") 148 149 #Get the molecule type 150 #TODO - record this explicitly in the parser? 151 if isinstance(a, Alphabet.ProteinAlphabet) : 152 mol_type = "" 153 elif isinstance(a, Alphabet.DNAAlphabet) : 154 mol_type = "DNA" 155 elif isinstance(a, Alphabet.RNAAlphabet) : 156 mol_type = "RNA" 157 else : 158 #Must be something like NucleotideAlphabet or 159 #just the generic Alphabet (default for fasta files) 160 raise ValueError("Need a DNA, RNA or Protein alphabet") 161 162 try : 163 division = record.annotations["data_file_division"] 164 except KeyError : 165 division = "UNK" 166 if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT", 167 "VRL","PHG","SYN","UNA","EST","PAT","STS", 168 "GSS","HTG","HTC","ENV"] : 169 division = "UNK" 170 171 assert len(units) == 2 172 assert len(division) == 3 173 #TODO - date 174 #TODO - mol_type 175 line = "LOCUS %s %s %s %s %s 01-JAN-1980\n" \ 176 % (locus.ljust(16), 177 str(len(record)).rjust(11), 178 units, 179 mol_type.ljust(6), 180 division) 181 assert len(line) == 79+1, repr(line) #plus one for new line 182 183 assert line[12:28].rstrip() == locus, \ 184 'LOCUS line does not contain the locus at the expected position:\n' + line 185 assert line[28:29] == " " 186 assert line[29:40].lstrip() == str(len(record)), \ 187 'LOCUS line does not contain the length at the expected position:\n' + line 188 189 #Tests copied from Bio.GenBank.Scanner 190 assert line[40:44] in [' bp ', ' aa '] , \ 191 'LOCUS line does not contain size units at expected position:\n' + line 192 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 193 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 194 assert line[47:54].strip() == "" \ 195 or line[47:54].strip().find('DNA') != -1 \ 196 or line[47:54].strip().find('RNA') != -1, \ 197 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 198 assert line[54:55] == ' ', \ 199 'LOCUS line does not contain space at position 55:\n' + line 200 assert line[55:63].strip() in ['','linear','circular'], \ 201 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 202 assert line[63:64] == ' ', \ 203 'LOCUS line does not contain space at position 64:\n' + line 204 assert line[67:68] == ' ', \ 205 'LOCUS line does not contain space at position 68:\n' + line 206 assert line[70:71] == '-', \ 207 'LOCUS line does not contain - at position 71 in date:\n' + line 208 assert line[74:75] == '-', \ 209 'LOCUS line does not contain - at position 75 in date:\n' + line 210 211 self.handle.write(line)
212
213 - def _get_annotation_str(self, record, key, default=".", just_first=False) :
214 """Get an annotation dictionary entry (as a string). 215 216 Some entries are lists, in which case if just_first=True the first entry 217 is returned. If just_first=False (default) this verifies there is only 218 one entry before returning it.""" 219 try : 220 answer = record.annotations[key] 221 except KeyError : 222 return default 223 if isinstance(answer, list) : 224 if not just_first : assert len(answer) == 1 225 return str(answer[0]) 226 else : 227 return str(answer)
228
229 - def _write_sequence(self, record):
230 #Loosely based on code from Howard Salis 231 #TODO - Force lower case? 232 LETTERS_PER_LINE = 60 233 SEQUENCE_INDENT = 9 234 seq_len = len(record) 235 for line_number in range(0,seq_len,LETTERS_PER_LINE): 236 self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT)) 237 for words in range(line_number,min(line_number+LETTERS_PER_LINE,seq_len),10): 238 self.handle.write(" %s" % record.seq[words:words+10]) 239 self.handle.write("\n")
240
241 - def write_record(self, record):
242 """Write a single record to the output file.""" 243 handle = self.handle 244 self._write_the_first_line(record) 245 246 accession = self._get_annotation_str(record, "accession", 247 record.id.split(".",1)[0], 248 just_first=True) 249 acc_with_version = accession 250 if record.id.startswith(accession+".") : 251 try : 252 acc_with_version = "%s.%i" \ 253 % (accession, int(record.id.split(".",1)[1])) 254 except ValueError : 255 pass 256 gi = self._get_annotation_str(record, "gi", just_first=True) 257 258 descr = record.description 259 if descr == "<unknown description>" : descr = "." 260 self._write_multi_line("DEFINITION", descr) 261 262 self._write_single_line("ACCESSION", accession) 263 if gi != "." : 264 self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi)) 265 else : 266 self._write_single_line("VERSION", "%s" % (acc_with_version)) 267 268 try : 269 #List of strings 270 keywords = "; ".join(record.annotations["keywords"]) 271 except KeyError : 272 keywords = "." 273 self._write_multi_line("KEYWORDS", keywords) 274 275 self._write_multi_line("SOURCE", \ 276 self._get_annotation_str(record, "source")) 277 #The ORGANISM line MUST be a single line, as any continuation is the taxonomy 278 org = self._get_annotation_str(record, "organism") 279 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH : 280 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..." 281 self._write_single_line(" ORGANISM", org) 282 try : 283 #List of strings 284 taxonomy = "; ".join(record.annotations["taxonomy"]) 285 except KeyError : 286 taxonomy = "." 287 self._write_multi_line("", taxonomy) 288 289 #TODO - References... 290 handle.write("FEATURES Location/Qualifiers\n") 291 #TODO - Features... 292 handle.write("ORIGIN\n") 293 self._write_sequence(record) 294 handle.write("//\n")
295 296 if __name__ == "__main__" : 297 print "Quick self test" 298 import os 299 from StringIO import StringIO 300
301 - def check_genbank_writer(records) :
302 handle = StringIO() 303 GenBankWriter(handle).write_file(records) 304 handle.seek(0) 305 306 records2 = list(GenBankIterator(handle)) 307 308 assert len(records) == len(records2) 309 for r1, r2 in zip(records, records2) : 310 #The SwissProt parser may leave \n in the description... 311 assert r1.description.replace("\n", " ") == r2.description 312 assert r1.id == r2.id 313 assert r1.name == r2.name 314 assert str(r1.seq) == str(r2.seq) 315 for key in ["gi", "keywords", "source", "taxonomy"] : 316 if key in r1.annotations : 317 assert r1.annotations[key] == r2.annotations[key], key 318 for key in ["organism"] : 319 if key in r1.annotations : 320 v1 = r1.annotations[key] 321 v2 = r2.annotations[key] 322 assert isinstance(v1, str) and isinstance(v2, str) 323 #SwissProt organism can be too long to record in GenBank format 324 assert v1 == v2 or \ 325 (v2.endswith("...") and v1.startswith(v2[:-3])), key
326 327 for filename in os.listdir("../../Tests/GenBank") : 328 if not filename.endswith(".gbk") and not filename.endswith(".gb") : 329 continue 330 print filename 331 332 handle = open("../../Tests/GenBank/%s" % filename) 333 records = list(GenBankIterator(handle)) 334 handle.close() 335 336 check_genbank_writer(records) 337 338 for filename in os.listdir("../../Tests/EMBL") : 339 if not filename.endswith(".embl") : 340 continue 341 print filename 342 343 handle = open("../../Tests/EMBL/%s" % filename) 344 records = list(EmblIterator(handle)) 345 handle.close() 346 347 check_genbank_writer(records) 348 349 from Bio import SeqIO 350 for filename in os.listdir("../../Tests/SwissProt") : 351 if not filename.startswith("sp") : 352 continue 353 print filename 354 355 handle = open("../../Tests/SwissProt/%s" % filename) 356 records = list(SeqIO.parse(handle,"swiss")) 357 handle.close() 358 359 check_genbank_writer(records) 360