1
2
3
4
5
6
7
8
9 """Sequence input/output as SeqRecord objects.
10
11 Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
12 a whole chapter in our tutorial:
13 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
14 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
15
16 Input
17 =====
18 The main function is Bio.SeqIO.parse(...) which takes an input file handle
19 (or in recent versions of Biopython alternatively a filename as a string),
20 and format string. This returns an iterator giving SeqRecord objects:
21
22 >>> from Bio import SeqIO
23 >>> for record in SeqIO.parse("Fasta/f002", "fasta"):
24 ... print record.id, len(record)
25 gi|1348912|gb|G26680|G26680 633
26 gi|1348917|gb|G26685|G26685 413
27 gi|1592936|gb|G29385|G29385 471
28
29 Note that the parse() function will invoke the relevant parser for the
30 format with its default settings. You may want more control, in which case
31 you need to create a format specific sequence iterator directly.
32
33 Input - Single Records
34 ======================
35 If you expect your file to contain one-and-only-one record, then we provide
36 the following 'helper' function which will return a single SeqRecord, or
37 raise an exception if there are no records or more than one record:
38
39 >>> from Bio import SeqIO
40 >>> record = SeqIO.read("Fasta/f001", "fasta")
41 >>> print record.id, len(record)
42 gi|3318709|pdb|1A91| 79
43
44 This style is useful when you expect a single record only (and would
45 consider multiple records an error). For example, when dealing with GenBank
46 files for bacterial genomes or chromosomes, there is normally only a single
47 record. Alternatively, use this with a handle when downloading a single
48 record from the internet.
49
50 However, if you just want the first record from a file containing multiple
51 record, use the iterator's next() method:
52
53 >>> from Bio import SeqIO
54 >>> record = SeqIO.parse("Fasta/f002", "fasta").next()
55 >>> print record.id, len(record)
56 gi|1348912|gb|G26680|G26680 633
57
58 The above code will work as long as the file contains at least one record.
59 Note that if there is more than one record, the remaining records will be
60 silently ignored.
61
62
63 Input - Multiple Records
64 ========================
65 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
66 using a sequence iterator can save you a lot of memory (RAM). There is
67 less benefit for interlaced file formats (e.g. most multiple alignment file
68 formats). However, an iterator only lets you access the records one by one.
69
70 If you want random access to the records by number, turn this into a list:
71
72 >>> from Bio import SeqIO
73 >>> records = list(SeqIO.parse("Fasta/f002", "fasta"))
74 >>> len(records)
75 3
76 >>> print records[1].id
77 gi|1348917|gb|G26685|G26685
78
79 If you want random access to the records by a key such as the record id,
80 turn the iterator into a dictionary:
81
82 >>> from Bio import SeqIO
83 >>> record_dict = SeqIO.to_dict(SeqIO.parse("Fasta/f002", "fasta"))
84 >>> len(record_dict)
85 3
86 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
87 413
88
89 However, using list() or the to_dict() function will load all the records
90 into memory at once, and therefore is not possible on very large files.
91 Instead, for *some* file formats Bio.SeqIO provides an indexing approach
92 providing dictionary like access to any record. For example,
93
94 >>> from Bio import SeqIO
95 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
96 >>> len(record_dict)
97 3
98 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
99 413
100
101 Many but not all of the supported input file formats can be indexed like
102 this. For example "fasta", "fastq", "qual" and even the binary format "sff"
103 work, but alignment formats like "phylip", "clustalw" and "nexus" will not.
104
105 In most cases you can also use SeqIO.index to get the record from the file
106 as a raw string (not a SeqRecord). This can be useful for example to extract
107 a sub-set of records from a file where SeqIO cannot output the file format
108 (e.g. the plain text SwissProt format, "swiss") or where it is important to
109 keep the output 100% identical to the input). For example,
110
111 >>> from Bio import SeqIO
112 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
113 >>> len(record_dict)
114 3
115 >>> print record_dict.get_raw("gi|1348917|gb|G26685|G26685")
116 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
117 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATCTTACTCTTTC
118 ATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTTTACAGATGTGAAACTTTCAA
119 GAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAAACCTGATGCTTTTATAAGCCATTGTGATTA
120 GGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGT
121 TCATATTACTNTAAGTTCTATAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGG
122 AATTGNTTTGCCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
123 <BLANKLINE>
124 >>> print record_dict["gi|1348917|gb|G26685|G26685"].format("fasta")
125 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
126 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATC
127 TTACTCTTTCATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTT
128 TACAGATGTGAAACTTTCAAGAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAA
129 ACCTGATGCTTTTATAAGCCATTGTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTG
130 AAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGTTCATATTACTNTAAGTTCTA
131 TAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTTTG
132 CCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
133 <BLANKLINE>
134
135 Here the original file and what Biopython would output differ in the line
136 wrapping.
137
138 Input - Alignments
139 ==================
140 You can read in alignment files as alignment objects using Bio.AlignIO.
141 Alternatively, reading in an alignment file format via Bio.SeqIO will give
142 you a SeqRecord for each row of each alignment:
143
144 >>> from Bio import SeqIO
145 >>> for record in SeqIO.parse("Clustalw/hedgehog.aln", "clustal"):
146 ... print record.id, len(record)
147 gi|167877390|gb|EDS40773.1| 447
148 gi|167234445|ref|NP_001107837. 447
149 gi|74100009|gb|AAZ99217.1| 447
150 gi|13990994|dbj|BAA33523.2| 447
151 gi|56122354|gb|AAV74328.1| 447
152
153 Output
154 ======
155 Use the function Bio.SeqIO.write(...), which takes a complete set of
156 SeqRecord objects (either as a list, or an iterator), an output file handle
157 (or in recent versions of Biopython an output filename as a string) and of
158 course the file format::
159
160 from Bio import SeqIO
161 records = ...
162 SeqIO.write(records, "example.faa", "fasta")
163
164 Or, using a handle::
165
166 from Bio import SeqIO
167 records = ...
168 handle = open("example.faa", "w")
169 SeqIO.write(records, handle, "fasta")
170 handle.close()
171
172 You are expected to call this function once (with all your records) and if
173 using a handle, make sure you close it to flush the data to the hard disk.
174
175 Output - Advanced
176 =================
177 The effect of calling write() multiple times on a single file will vary
178 depending on the file format, and is best avoided unless you have a strong
179 reason to do so.
180
181 If you give a filename, then each time you call write() the existing file
182 will be overwriten. For sequential files formats (e.g. fasta, genbank) each
183 "record block" holds a single sequence. For these files it would probably
184 be safe to call write() multiple times by re-using the same handle.
185
186
187 However, trying this for certain alignment formats (e.g. phylip, clustal,
188 stockholm) would have the effect of concatenating several multiple sequence
189 alignments together. Such files are created by the PHYLIP suite of programs
190 for bootstrap analysis, but it is clearer to do this via Bio.AlignIO instead.
191
192
193 Conversion
194 ==========
195 The Bio.SeqIO.convert(...) function allows an easy interface for simple
196 file format conversions. Additionally, it may use file format specific
197 optimisations so this should be the fastest way too.
198
199 In general however, you can combine the Bio.SeqIO.parse(...) function with
200 the Bio.SeqIO.write(...) function for sequence file conversion. Using
201 generator expressions or generator functions provides a memory efficient way
202 to perform filtering or other extra operations as part of the process.
203
204 File Formats
205 ============
206 When specifying the file format, use lowercase strings. The same format
207 names are also used in Bio.AlignIO and include the following:
208
209 - ace - Reads the contig sequences from an ACE assembly file.
210 - embl - The EMBL flat file format. Uses Bio.GenBank internally.
211 - fasta - The generic sequence file format where each record starts with
212 an identifer line starting with a ">" character, followed by
213 lines of sequence.
214 - fastq - A "FASTA like" format used by Sanger which also stores PHRED
215 sequence quality values (with an ASCII offset of 33).
216 - fastq-sanger - An alias for "fastq" for consistency with BioPerl and EMBOSS
217 - fastq-solexa - Original Solexa/Illumnia variant of the FASTQ format which
218 encodes Solexa quality scores (not PHRED quality scores) with an
219 ASCII offset of 64.
220 - fastq-illumina - Solexa/Illumnia 1.3+ variant of the FASTQ format which
221 encodes PHRED quality scores with an ASCII offset of 64 (not 33).
222 - genbank - The GenBank or GenPept flat file format.
223 - gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
224 - ig - The IntelliGenetics file format, apparently the same as the
225 MASE alignment format.
226 - imgt - An EMBL like format from IMGT where the feature tables are more
227 indented to allow for longer feature types.
228 - phd - Output from PHRED, used by PHRAP and CONSED for input.
229 - pir - A "FASTA like" format introduced by the National Biomedical
230 Research Foundation (NBRF) for the Protein Information Resource
231 (PIR) database, now part of UniProt.
232 - sff - Standard Flowgram Format (SFF), typical output from Roche 454.
233 - sff-trim - Standard Flowgram Format (SFF) with given trimming applied.
234 - swiss - Plain text Swiss-Prot aka UniProt format.
235 - tab - Simple two column tab separated sequence files, where each
236 line holds a record's identifier and sequence. For example,
237 this is used as by Aligent's eArray software when saving
238 microarray probes in a minimal tab delimited text file.
239 - qual - A "FASTA like" format holding PHRED quality values from
240 sequencing DNA, but no actual sequences (usually provided
241 in separate FASTA files).
242 - uniprot-xml - The UniProt XML format (replacement for the SwissProt plain
243 text format which we call "swiss")
244
245 Note that while Bio.SeqIO can read all the above file formats, it cannot
246 write to all of them.
247
248 You can also use any file format supported by Bio.AlignIO, such as "nexus",
249 "phlip" and "stockholm", which gives you access to the individual sequences
250 making up each alignment as SeqRecords.
251 """
252 __docformat__ = "epytext en"
253
254
255
256
257
258
259
260
261
262
263
264
265 """
266 FAO BioPython Developers
267 ========================
268 The way I envision this SeqIO system working as that for any sequence file
269 format we have an iterator that returns SeqRecord objects.
270
271 This also applies to interlaced fileformats (like clustal - although that
272 is now handled via Bio.AlignIO instead) where the file cannot be read record
273 by record. You should still return an iterator, even if the implementation
274 could just as easily return a list.
275
276 These file format specific sequence iterators may be implemented as:
277 * Classes which take a handle for __init__ and provide the __iter__ method
278 * Functions that take a handle, and return an iterator object
279 * Generator functions that take a handle, and yield SeqRecord objects
280
281 It is then trivial to turn this iterator into a list of SeqRecord objects,
282 an in memory dictionary, or a multiple sequence alignment object.
283
284 For building the dictionary by default the id propery of each SeqRecord is
285 used as the key. You should always populate the id property, and it should
286 be unique in most cases. For some file formats the accession number is a good
287 choice. If the file itself contains ambiguous identifiers, don't try and
288 dis-ambiguate them - return them as is.
289
290 When adding a new file format, please use the same lower case format name
291 as BioPerl, or if they have not defined one, try the names used by EMBOSS.
292
293 See also http://biopython.org/wiki/SeqIO_dev
294
295 --Peter
296 """
297
298 from Bio.Seq import Seq
299 from Bio.SeqRecord import SeqRecord
300 from Bio.Align import MultipleSeqAlignment
301 from Bio.Align.Generic import Alignment
302 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
303
304 import AceIO
305 import FastaIO
306 import IgIO
307 import InsdcIO
308 import PhdIO
309 import PirIO
310 import SffIO
311 import SwissIO
312 import TabIO
313 import QualityIO
314 import UniprotIO
315
316
317
318
319
320
321
322
323
324
325 _FormatToIterator = {"fasta" : FastaIO.FastaIterator,
326 "gb" : InsdcIO.GenBankIterator,
327 "genbank" : InsdcIO.GenBankIterator,
328 "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
329 "embl" : InsdcIO.EmblIterator,
330 "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
331 "imgt" : InsdcIO.ImgtIterator,
332 "ig" : IgIO.IgIterator,
333 "swiss" : SwissIO.SwissIterator,
334 "phd" : PhdIO.PhdIterator,
335 "ace" : AceIO.AceIterator,
336 "tab" : TabIO.TabIterator,
337 "pir" : PirIO.PirIterator,
338 "fastq" : QualityIO.FastqPhredIterator,
339 "fastq-sanger" : QualityIO.FastqPhredIterator,
340 "fastq-solexa" : QualityIO.FastqSolexaIterator,
341 "fastq-illumina" : QualityIO.FastqIlluminaIterator,
342 "qual" : QualityIO.QualPhredIterator,
343 "sff": SffIO.SffIterator,
344
345 "sff-trim": SffIO._SffTrimIterator,
346 "uniprot-xml": UniprotIO.UniprotIterator,
347 }
348
349 _FormatToWriter = {"fasta" : FastaIO.FastaWriter,
350 "gb" : InsdcIO.GenBankWriter,
351 "genbank" : InsdcIO.GenBankWriter,
352 "embl" : InsdcIO.EmblWriter,
353 "imgt" : InsdcIO.ImgtWriter,
354 "tab" : TabIO.TabWriter,
355 "fastq" : QualityIO.FastqPhredWriter,
356 "fastq-sanger" : QualityIO.FastqPhredWriter,
357 "fastq-solexa" : QualityIO.FastqSolexaWriter,
358 "fastq-illumina" : QualityIO.FastqIlluminaWriter,
359 "phd" : PhdIO.PhdWriter,
360 "qual" : QualityIO.QualPhredWriter,
361 "sff" : SffIO.SffWriter,
362 }
363
364 _BinaryFormats = ["sff", "sff-trim"]
365
366 -def write(sequences, handle, format):
367 """Write complete set of sequences to a file.
368
369 - sequences - A list (or iterator) of SeqRecord objects, or (if using
370 Biopython 1.54 or later) a single SeqRecord.
371 - handle - File handle object to write to, or filename as string
372 (note older versions of Biopython only took a handle).
373 - format - lower case string describing the file format to write.
374
375 You should close the handle after calling this function.
376
377 Returns the number of records written (as an integer).
378 """
379 from Bio import AlignIO
380
381
382 if not isinstance(format, basestring):
383 raise TypeError("Need a string for the file format (lower case)")
384 if not format:
385 raise ValueError("Format required (lower case string)")
386 if format != format.lower():
387 raise ValueError("Format string '%s' should be lower case" % format)
388
389 if isinstance(sequences, SeqRecord):
390
391 sequences = [sequences]
392
393 if isinstance(handle, basestring):
394 if format in _BinaryFormats :
395 handle = open(handle, "wb")
396 else :
397 handle = open(handle, "w")
398 handle_close = True
399 else:
400 handle_close = False
401
402
403 if format in _FormatToWriter:
404 writer_class = _FormatToWriter[format]
405 count = writer_class(handle).write_file(sequences)
406 elif format in AlignIO._FormatToWriter:
407
408
409 alignment = MultipleSeqAlignment(sequences)
410 alignment_count = AlignIO.write([alignment], handle, format)
411 assert alignment_count == 1, "Internal error - the underlying writer " \
412 + " should have returned 1, not %s" % repr(alignment_count)
413 count = len(alignment)
414 del alignment_count, alignment
415 elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
416 raise ValueError("Reading format '%s' is supported, but not writing" \
417 % format)
418 else:
419 raise ValueError("Unknown format '%s'" % format)
420
421 assert isinstance(count, int), "Internal error - the underlying %s " \
422 "writer should have returned the record count, not %s" \
423 % (format, repr(count))
424
425 if handle_close:
426 handle.close()
427
428 return count
429
430 -def parse(handle, format, alphabet=None):
431 r"""Turns a sequence file into an iterator returning SeqRecords.
432
433 - handle - handle to the file, or the filename as a string
434 (note older verions of Biopython only took a handle).
435 - format - lower case string describing the file format.
436 - alphabet - optional Alphabet object, useful when the sequence type
437 cannot be automatically inferred from the file itself
438 (e.g. format="fasta" or "tab")
439
440 Typical usage, opening a file to read in, and looping over the record(s):
441
442 >>> from Bio import SeqIO
443 >>> filename = "Fasta/sweetpea.nu"
444 >>> for record in SeqIO.parse(filename, "fasta"):
445 ... print "ID", record.id
446 ... print "Sequence length", len(record)
447 ... print "Sequence alphabet", record.seq.alphabet
448 ID gi|3176602|gb|U78617.1|LOU78617
449 Sequence length 309
450 Sequence alphabet SingleLetterAlphabet()
451
452 For file formats like FASTA where the alphabet cannot be determined, it
453 may be useful to specify the alphabet explicitly:
454
455 >>> from Bio import SeqIO
456 >>> from Bio.Alphabet import generic_dna
457 >>> filename = "Fasta/sweetpea.nu"
458 >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
459 ... print "ID", record.id
460 ... print "Sequence length", len(record)
461 ... print "Sequence alphabet", record.seq.alphabet
462 ID gi|3176602|gb|U78617.1|LOU78617
463 Sequence length 309
464 Sequence alphabet DNAAlphabet()
465
466 If you have a string 'data' containing the file contents, you must
467 first turn this into a handle in order to parse it:
468
469 >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
470 >>> from Bio import SeqIO
471 >>> from StringIO import StringIO
472 >>> for record in SeqIO.parse(StringIO(data), "fasta"):
473 ... print record.id, record.seq
474 Alpha ACCGGATGTA
475 Beta AGGCTCGGTTA
476
477 Use the Bio.SeqIO.read(...) function when you expect a single record
478 only.
479 """
480
481
482
483 from Bio import AlignIO
484
485 handle_close = False
486
487 if isinstance(handle, basestring):
488
489 if format in _BinaryFormats :
490 handle = open(handle, "rb")
491 else :
492 handle = open(handle, "rU")
493
494 handle_close = True
495
496
497 if not isinstance(format, basestring):
498 raise TypeError("Need a string for the file format (lower case)")
499 if not format:
500 raise ValueError("Format required (lower case string)")
501 if format != format.lower():
502 raise ValueError("Format string '%s' should be lower case" % format)
503 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
504 isinstance(alphabet, AlphabetEncoder)):
505 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
506
507
508 if format in _FormatToIterator:
509 iterator_generator = _FormatToIterator[format]
510 if alphabet is None:
511 i = iterator_generator(handle)
512 else:
513 try:
514 i = iterator_generator(handle, alphabet=alphabet)
515 except TypeError:
516 i = _force_alphabet(iterator_generator(handle), alphabet)
517 elif format in AlignIO._FormatToIterator:
518
519
520
521 i = _iterate_via_AlignIO(handle, format, alphabet)
522 else:
523 raise ValueError("Unknown format '%s'" % format)
524
525 for r in i:
526 yield r
527 if handle_close:
528 handle.close()
529
530
537
551
552 -def read(handle, format, alphabet=None):
553 """Turns a sequence file into a single SeqRecord.
554
555 - handle - handle to the file, or the filename as a string
556 (note older verions of Biopython only took a handle).
557 - format - string describing the file format.
558 - alphabet - optional Alphabet object, useful when the sequence type
559 cannot be automatically inferred from the file itself
560 (e.g. format="fasta" or "tab")
561
562 This function is for use parsing sequence files containing
563 exactly one record. For example, reading a GenBank file:
564
565 >>> from Bio import SeqIO
566 >>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
567 >>> print "ID", record.id
568 ID AC007323.5
569 >>> print "Sequence length", len(record)
570 Sequence length 86436
571 >>> print "Sequence alphabet", record.seq.alphabet
572 Sequence alphabet IUPACAmbiguousDNA()
573
574 If the handle contains no records, or more than one record,
575 an exception is raised. For example:
576
577 >>> from Bio import SeqIO
578 >>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
579 Traceback (most recent call last):
580 ...
581 ValueError: More than one record found in handle
582
583 If however you want the first record from a file containing
584 multiple records this function would raise an exception (as
585 shown in the example above). Instead use:
586
587 >>> from Bio import SeqIO
588 >>> record = SeqIO.parse("GenBank/cor6_6.gb", "genbank").next()
589 >>> print "First record's ID", record.id
590 First record's ID X55053.1
591
592 Use the Bio.SeqIO.parse(handle, format) function if you want
593 to read multiple records from the handle.
594 """
595 iterator = parse(handle, format, alphabet)
596 try:
597 first = iterator.next()
598 except StopIteration:
599 first = None
600 if first is None:
601 raise ValueError("No records found in handle")
602 try:
603 second = iterator.next()
604 except StopIteration:
605 second = None
606 if second is not None:
607 raise ValueError("More than one record found in handle")
608 return first
609
610 -def to_dict(sequences, key_function=None):
611 """Turns a sequence iterator or list into a dictionary.
612
613 - sequences - An iterator that returns SeqRecord objects,
614 or simply a list of SeqRecord objects.
615 - key_function - Optional callback function which when given a
616 SeqRecord should return a unique key for the dictionary.
617
618 e.g. key_function = lambda rec : rec.name
619 or, key_function = lambda rec : rec.description.split()[0]
620
621 If key_function is ommitted then record.id is used, on the assumption
622 that the records objects returned are SeqRecords with a unique id.
623
624 If there are duplicate keys, an error is raised.
625
626 Example usage, defaulting to using the record.id as key:
627
628 >>> from Bio import SeqIO
629 >>> filename = "GenBank/cor6_6.gb"
630 >>> format = "genbank"
631 >>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
632 >>> print sorted(id_dict)
633 ['AF297471.1', 'AJ237582.1', 'L31939.1', 'M81224.1', 'X55053.1', 'X62281.1']
634 >>> print id_dict["L31939.1"].description
635 Brassica rapa (clone bif72) kin mRNA, complete cds.
636
637 A more complex example, using the key_function argument in order to
638 use a sequence checksum as the dictionary key:
639
640 >>> from Bio import SeqIO
641 >>> from Bio.SeqUtils.CheckSum import seguid
642 >>> filename = "GenBank/cor6_6.gb"
643 >>> format = "genbank"
644 >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
645 ... key_function = lambda rec : seguid(rec.seq))
646 >>> for key, record in sorted(seguid_dict.iteritems()):
647 ... print key, record.id
648 /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
649 BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
650 SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
651 TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
652 l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
653 uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
654
655 This approach is not suitable for very large sets of sequences, as all
656 the SeqRecord objects are held in memory. Instead, consider using the
657 Bio.SeqIO.index() function (if it supports your particular file format).
658 """
659 if key_function is None:
660 key_function = lambda rec : rec.id
661
662 d = dict()
663 for record in sequences:
664 key = key_function(record)
665 if key in d:
666 raise ValueError("Duplicate key '%s'" % key)
667 d[key] = record
668 return d
669
670 -def index(filename, format, alphabet=None, key_function=None):
671 """Indexes a sequence file and returns a dictionary like object.
672
673 - filename - string giving name of file to be indexed
674 - format - lower case string describing the file format
675 - alphabet - optional Alphabet object, useful when the sequence type
676 cannot be automatically inferred from the file itself
677 (e.g. format="fasta" or "tab")
678 - key_function - Optional callback function which when given a
679 SeqRecord identifier string should return a unique
680 key for the dictionary.
681
682 This indexing function will return a dictionary like object, giving the
683 SeqRecord objects as values:
684
685 >>> from Bio import SeqIO
686 >>> records = SeqIO.index("Quality/example.fastq", "fastq")
687 >>> len(records)
688 3
689 >>> sorted(records)
690 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
691 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
692 >EAS54_6_R1_2_1_540_792
693 TTGGCAGGCCAAGGCCGATGGATCA
694 <BLANKLINE>
695 >>> "EAS54_6_R1_2_1_540_792" in records
696 True
697 >>> print records.get("Missing", None)
698 None
699
700 Note that this psuedo dictionary will not support all the methods of a
701 true Python dictionary, for example values() is not defined since this
702 would require loading all of the records into memory at once.
703
704 When you call the index function, it will scan through the file, noting
705 the location of each record. When you access a particular record via the
706 dictionary methods, the code will jump to the appropriate part of the
707 file and then parse that section into a SeqRecord.
708
709 Note that not all the input formats supported by Bio.SeqIO can be used
710 with this index function. It is designed to work only with sequential
711 file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
712 interlaced file format (e.g. alignment formats such as "clustal").
713
714 For small files, it may be more efficient to use an in memory Python
715 dictionary, e.g.
716
717 >>> from Bio import SeqIO
718 >>> records = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fastq"), "fastq"))
719 >>> len(records)
720 3
721 >>> sorted(records)
722 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
723 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
724 >EAS54_6_R1_2_1_540_792
725 TTGGCAGGCCAAGGCCGATGGATCA
726 <BLANKLINE>
727
728 As with the to_dict() function, by default the id string of each record
729 is used as the key. You can specify a callback function to transform
730 this (the record identifier string) into your prefered key. For example:
731
732 >>> from Bio import SeqIO
733 >>> def make_tuple(identifier):
734 ... parts = identifier.split("_")
735 ... return int(parts[-2]), int(parts[-1])
736 >>> records = SeqIO.index("Quality/example.fastq", "fastq",
737 ... key_function=make_tuple)
738 >>> len(records)
739 3
740 >>> sorted(records)
741 [(413, 324), (443, 348), (540, 792)]
742 >>> print records[(540, 792)].format("fasta")
743 >EAS54_6_R1_2_1_540_792
744 TTGGCAGGCCAAGGCCGATGGATCA
745 <BLANKLINE>
746 >>> (540, 792) in records
747 True
748 >>> "EAS54_6_R1_2_1_540_792" in records
749 False
750 >>> print records.get("Missing", None)
751 None
752
753 Another common use case would be indexing an NCBI style FASTA file,
754 where you might want to extract the GI number from the FASTA identifer
755 to use as the dictionary key.
756
757 Notice that unlike the to_dict() function, here the key_function does
758 not get given the full SeqRecord to use to generate the key. Doing so
759 would impose a severe performance penalty as it would require the file
760 to be completely parsed while building the index. Right now this is
761 usually avoided.
762 """
763
764 if not isinstance(filename, basestring):
765 raise TypeError("Need a filename (not a handle)")
766 if not isinstance(format, basestring):
767 raise TypeError("Need a string for the file format (lower case)")
768 if not format:
769 raise ValueError("Format required (lower case string)")
770 if format != format.lower():
771 raise ValueError("Format string '%s' should be lower case" % format)
772 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
773 isinstance(alphabet, AlphabetEncoder)):
774 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
775
776
777 import _index
778 try:
779 indexer = _index._FormatToIndexedDict[format]
780 except KeyError:
781 raise ValueError("Unsupported format '%s'" % format)
782 return indexer(filename, format, alphabet, key_function)
783
785 """Returns a multiple sequence alignment (DEPRECATED).
786
787 - sequences -An iterator that returns SeqRecord objects,
788 or simply a list of SeqRecord objects. All
789 the record sequences must be the same length.
790 - alphabet - Optional alphabet. Stongly recommended.
791 - strict - Dummy argument, used to enable strict error
792 checking of sequence lengths and alphabets.
793 This is now always done.
794
795 Using this function is now discouraged. You are now encouraged to use
796 Bio.AlignIO instead, e.g.
797
798 >>> from Bio import AlignIO
799 >>> filename = "Clustalw/protein.aln"
800 >>> alignment = AlignIO.read(filename, "clustal")
801 """
802 import warnings
803 import Bio
804 warnings.warn("The Bio.SeqIO.to_alignment(...) function is deprecated. "
805 "Please use the Bio.Align.MultipleSeqAlignment(...) object "
806 "directly instead.", Bio.BiopythonDeprecationWarning)
807 return MultipleSeqAlignment(sequences, alphabet)
808
809 -def convert(in_file, in_format, out_file, out_format, alphabet=None):
810 """Convert between two sequence file formats, return number of records.
811
812 - in_file - an input handle or filename
813 - in_format - input file format, lower case string
814 - out_file - an output handle or filename
815 - out_format - output file format, lower case string
816 - alphabet - optional alphabet to assume
817
818 NOTE - If you provide an output filename, it will be opened which will
819 overwrite any existing file without warning. This may happen if even
820 the conversion is aborted (e.g. an invalid out_format name is given).
821
822 For example, going from a filename to a handle:
823
824 >>> from Bio import SeqIO
825 >>> from StringIO import StringIO
826 >>> handle = StringIO("")
827 >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
828 3
829 >>> print handle.getvalue()
830 >EAS54_6_R1_2_1_413_324
831 CCCTTCTTGTCTTCAGCGTTTCTCC
832 >EAS54_6_R1_2_1_540_792
833 TTGGCAGGCCAAGGCCGATGGATCA
834 >EAS54_6_R1_2_1_443_348
835 GTTGCTTCTGGCGTGGGTGGGGGGG
836 <BLANKLINE>
837 """
838 if isinstance(in_file, basestring):
839
840 if in_format in _BinaryFormats :
841 in_handle = open(in_file, "rb")
842 else :
843 in_handle = open(in_file, "rU")
844 in_close = True
845 else:
846 in_handle = in_file
847 in_close = False
848
849 if isinstance(out_file, basestring):
850 if out_format in ["sff", "sff_trim"] :
851 out_handle = open(out_file, "wb")
852 else :
853 out_handle = open(out_file, "w")
854 out_close = True
855 else:
856 out_handle = out_file
857 out_close = False
858
859
860 from _convert import _handle_convert
861 count = _handle_convert(in_handle, in_format,
862 out_handle, out_format,
863 alphabet)
864
865 if in_close:
866 in_handle.close()
867 if out_close:
868 out_handle.close()
869 return count
870
872 """Run the Bio.SeqIO module's doctests.
873
874 This will try and locate the unit tests directory, and run the doctests
875 from there in order that the relative paths used in the examples work.
876 """
877 import doctest
878 import os
879 if os.path.isdir(os.path.join("..", "..", "Tests")):
880 print "Runing doctests..."
881 cur_dir = os.path.abspath(os.curdir)
882 os.chdir(os.path.join("..", "..", "Tests"))
883 doctest.testmod()
884 os.chdir(cur_dir)
885 del cur_dir
886 print "Done"
887 elif os.path.isdir(os.path.join("Tests", "Fasta")):
888 print "Runing doctests..."
889 cur_dir = os.path.abspath(os.curdir)
890 os.chdir(os.path.join("Tests"))
891 doctest.testmod()
892 os.chdir(cur_dir)
893 del cur_dir
894 print "Done"
895
896 if __name__ == "__main__":
897
898 _test()
899