Package Bio :: Package SeqIO :: Module SwissIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SwissIO

  1  # Copyright 2006-2010 by Peter Cock and Michiel de Hoon. 
  2  # All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format. 
  9   
 10  You are expected to use this module via the Bio.SeqIO functions. 
 11  See also the Bio.SwissProt module which offers more than just accessing 
 12  the sequences as SeqRecord objects. 
 13   
 14  See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format. 
 15  """ 
 16   
 17  from Bio import Seq 
 18  from Bio import SeqRecord 
 19  from Bio import Alphabet 
 20  from Bio import SeqFeature 
 21  from Bio import SwissProt 
 22   
23 -def _make_position(location_string, offset=0):
24 """Turn a Swiss location position into a SeqFeature position object (PRIVATE). 25 26 An offset of -1 is used with a start location to make it pythonic. 27 """ 28 if location_string=="?": 29 return SeqFeature.UnknownPosition() 30 #Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0. 31 try: 32 return SeqFeature.ExactPosition(max(0, offset+int(location_string))) 33 except ValueError: 34 pass 35 if location_string.startswith("<"): 36 try: 37 return SeqFeature.BeforePosition(max(0,offset+int(location_string[1:]))) 38 except ValueError: 39 pass 40 elif location_string.startswith(">"): # e.g. ">13" 41 try: 42 return SeqFeature.AfterPosition(max(0,offset+int(location_string[1:]))) 43 except ValueError : 44 pass 45 elif location_string.startswith("?"): # e.g. "?22" 46 try: 47 return SeqFeature.UncertainPosition(max(0,offset+int(location_string[1:]))) 48 except ValueError: 49 pass 50 raise NotImplementedError("Cannot parse location '%s'" % location_string)
51
52 -def _make_seqfeature(name, from_res, to_res, description, ft_id):
53 """Construct SeqFeature from feature data from parser (PRIVATE).""" 54 loc = SeqFeature.FeatureLocation(_make_position(from_res,-1), 55 _make_position(to_res, 0)) 56 if not ft_id: 57 ft_id = "<unknown id>" #The default in SeqFeature object 58 return SeqFeature.SeqFeature(loc, type=name, id=ft_id, 59 qualifiers={"description":description})
60 61 #This is a generator function!
62 -def SwissIterator(handle):
63 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects. 64 65 Every section from the ID line to the terminating // becomes 66 a single SeqRecord with associated annotation and features. 67 68 This parser is for the flat file "swiss" format as used by: 69 * Swiss-Prot aka SwissProt 70 * TrEMBL 71 * UniProtKB aka UniProt Knowledgebase 72 73 For consistency with BioPerl and EMBOSS we call this the "swiss" 74 format. See also the SeqIO support for "uniprot-xml" format. 75 """ 76 swiss_records = SwissProt.parse(handle) 77 for swiss_record in swiss_records: 78 # Convert the SwissProt record to a SeqRecord 79 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein) 80 record = SeqRecord.SeqRecord(seq, 81 id=swiss_record.accessions[0], 82 name=swiss_record.entry_name, 83 description=swiss_record.description, 84 features=[_make_seqfeature(*f) for f \ 85 in swiss_record.features], 86 ) 87 record.description = swiss_record.description 88 for cross_reference in swiss_record.cross_references: 89 if len(cross_reference) < 2: 90 continue 91 database, accession = cross_reference[:2] 92 dbxref = "%s:%s" % (database, accession) 93 if not dbxref in record.dbxrefs: 94 record.dbxrefs.append(dbxref) 95 annotations = record.annotations 96 annotations['accessions'] = swiss_record.accessions 97 annotations['date'] = swiss_record.created[0] 98 annotations['date_last_sequence_update'] = swiss_record.sequence_update[0] 99 if swiss_record.annotation_update: 100 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0] 101 if swiss_record.gene_name: 102 annotations['gene_name'] = swiss_record.gene_name 103 annotations['organism'] = swiss_record.organism.rstrip(".") 104 annotations['taxonomy'] = swiss_record.organism_classification 105 annotations['ncbi_taxid'] = swiss_record.taxonomy_id 106 if swiss_record.host_organism: 107 annotations['organism_host'] = swiss_record.host_organism 108 if swiss_record.host_taxonomy_id: 109 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id 110 if swiss_record.comments: 111 annotations['comment'] = "\n".join(swiss_record.comments) 112 if swiss_record.references: 113 annotations['references'] = [] 114 for reference in swiss_record.references: 115 feature = SeqFeature.Reference() 116 feature.comment = " ".join(["%s=%s;" % (key, value) \ 117 for key, value \ 118 in reference.comments]) 119 for key, value in reference.references: 120 if key == 'PubMed': 121 feature.pubmed_id = value 122 elif key == 'MEDLINE': 123 feature.medline_id = value 124 elif key == 'DOI': 125 pass 126 elif key == 'AGRICOLA': 127 pass 128 else: 129 raise ValueError(\ 130 "Unknown key %s found in references" % key) 131 feature.authors = reference.authors 132 feature.title = reference.title 133 feature.journal = reference.location 134 annotations['references'].append(feature) 135 if swiss_record.keywords: 136 record.annotations['keywords'] = swiss_record.keywords 137 yield record
138 139 if __name__ == "__main__": 140 print "Quick self test..." 141 142 example_filename = "../../Tests/SwissProt/sp008" 143 144 import os 145 if not os.path.isfile(example_filename): 146 print "Missing test file %s" % example_filename 147 else: 148 #Try parsing it! 149 handle = open(example_filename) 150 records = SwissIterator(handle) 151 for record in records: 152 print record.name 153 print record.id 154 print record.annotations['keywords'] 155 print repr(record.annotations['organism']) 156 print record.seq.tostring()[:20] + "..." 157 for f in record.features: print f 158 handle.close() 159