Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

  1  # Stores data about the sequence 
  2  """Represent a Sequence Record, a sequence with annotation.""" 
  3   
  4  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  5  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  6  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  7  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  8   
9 -class SeqRecord(object):
10 """A SeqRecord object holds a sequence and information about it. 11 12 Main attributes: 13 id - Identifier such as a locus tag (string) 14 seq - The sequence itself (Seq object) 15 16 Additional attributes: 17 name - Sequence name, e.g. gene name (string) 18 description - Additional text (string) 19 dbxrefs - List of database cross references (list of strings) 20 features - Any (sub)features defined (list of SeqFeature objects) 21 annotations - Further information about the whole sequence (dictionary) 22 Most entries are lists of strings. 23 24 You will typically use Bio.SeqIO to read in sequences from files as 25 SeqRecord objects. However, you may want to create your own SeqRecord 26 objects directly (see the __init__ method for further details). 27 28 e.g. 29 >>> from Bio.Seq import Seq 30 >>> from Bio.SeqRecord import SeqRecord 31 >>> from Bio.Alphabet import IUPAC 32 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 33 ... IUPAC.protein), 34 ... id="YP_025292.1", name="HokC", 35 ... description="toxic membrane protein, small") 36 >>> print record 37 ID: YP_025292.1 38 Name: HokC 39 Description: toxic membrane protein, small 40 Number of features: 0 41 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 42 43 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 44 for this. For the special case where you want the SeqRecord turned into 45 a string in a particular file format there is a format method which uses 46 Bio.SeqIO internally: 47 48 >>> print record.format("fasta") 49 >YP_025292.1 toxic membrane protein, small 50 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 51 <BLANKLINE> 52 """
53 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 54 description = "<unknown description>", dbxrefs = None, 55 features = None):
56 """Create a SeqRecord. 57 58 Arguments: 59 seq - Sequence, required (Seq object) 60 id - Sequence identifier, recommended (string) 61 name - Sequence name, optional (string) 62 description - Sequence description, optional (string) 63 dbxrefs - Database cross references, optional (list of strings) 64 features - Any (sub)features, optional (list of SeqFeature objects) 65 66 You will typically use Bio.SeqIO to read in sequences from files as 67 SeqRecord objects. However, you may want to create your own SeqRecord 68 objects directly. 69 70 Note that while an id is optional, we strongly recommend you supply a 71 unique id string for each record. This is especially important 72 if you wish to write your sequences to a file. 73 74 You can create a 'blank' SeqRecord object, and then populated the 75 attributes later. Note that currently the annotations dictionary 76 cannot be specified when creating the SeqRecord. 77 """ 78 if id is not None and not isinstance(id, basestring) : 79 #Lots of existing code uses id=None... this may be a bad idea. 80 raise ValueError("id argument should be a string") 81 if not isinstance(name, basestring) : 82 raise ValueError("name argument should be a string") 83 if not isinstance(description, basestring) : 84 raise ValueError("description argument should be a string") 85 if dbxrefs is not None and not isinstance(dbxrefs, list) : 86 raise ValueError("dbxrefs argument should be a list (of strings)") 87 if features is not None and not isinstance(features, list) : 88 raise ValueError("features argument should be a list (of SeqFeature objects)") 89 self.seq = seq 90 self.id = id 91 self.name = name 92 self.description = description 93 if dbxrefs is None: 94 dbxrefs = [] 95 self.dbxrefs = dbxrefs 96 # annotations about the whole sequence 97 self.annotations = {} 98 99 # annotations about parts of the sequence 100 if features is None: 101 features = [] 102 self.features = features
103
104 - def __str__(self) :
105 """A human readable summary of the record and its annotation (string). 106 107 The python built in function str works by calling the object's ___str__ 108 method. 109 110 e.g. 111 >>> from Bio.Seq import Seq 112 >>> from Bio.SeqRecord import SeqRecord 113 >>> from Bio.Alphabet import IUPAC 114 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 115 ... IUPAC.protein), 116 ... id="YP_025292.1", name="HokC", 117 ... description="toxic membrane protein, small") 118 >>> print str(record) 119 ID: YP_025292.1 120 Name: HokC 121 Description: toxic membrane protein, small 122 Number of features: 0 123 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 124 125 In this example you don't actually need to call str explicity, as the 126 print command does this automatically: 127 128 >>> print record 129 ID: YP_025292.1 130 Name: HokC 131 Description: toxic membrane protein, small 132 Number of features: 0 133 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 134 135 Note that long sequences are shown truncated. 136 """ 137 lines = [] 138 if self.id : lines.append("ID: %s" % self.id) 139 if self.name : lines.append("Name: %s" % self.name) 140 if self.description : lines.append("Description: %s" % self.description) 141 if self.dbxrefs : lines.append("Database cross-references: " \ 142 + ", ".join(self.dbxrefs)) 143 lines.append("Number of features: %i" % len(self.features)) 144 for a in self.annotations: 145 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 146 #Don't want to include the entire sequence, 147 #and showing the alphabet is useful: 148 lines.append(repr(self.seq)) 149 return "\n".join(lines)
150
151 - def __repr__(self) :
152 """A concise summary of the record for debugging (string). 153 154 The python built in function repr works by calling the object's ___repr__ 155 method. 156 157 e.g. 158 >>> from Bio.Seq import Seq 159 >>> from Bio.SeqRecord import SeqRecord 160 >>> from Bio.Alphabet import generic_protein 161 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 162 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 163 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 164 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 165 ... generic_protein), 166 ... id="NP_418483.1", name="b4059", 167 ... description="ssDNA-binding protein", 168 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 169 >>> print repr(rec) 170 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 171 172 At the python prompt you can also use this shorthand: 173 174 >>> rec 175 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 176 177 Note that long sequences are shown truncated. 178 """ 179 return self.__class__.__name__ \ 180 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 181 % tuple(map(repr, (self.seq, self.id, self.name, 182 self.description, self.dbxrefs)))
183
184 - def format(self, format) :
185 """Returns the record as a string in the specified file format. 186 187 The format should be a lower case string supported as an output 188 format by Bio.SeqIO, which is used to turn the SeqRecord into a 189 string. 190 191 e.g. 192 >>> from Bio.Seq import Seq 193 >>> from Bio.SeqRecord import SeqRecord 194 >>> from Bio.Alphabet import IUPAC 195 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 196 ... IUPAC.protein), 197 ... id="YP_025292.1", name="HokC", 198 ... description="toxic membrane protein, small") 199 >>> print record.format("fasta") 200 >YP_025292.1 toxic membrane protein, small 201 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 202 <BLANKLINE> 203 204 The python print command automatically appends a new line, meaning 205 in this example a blank line is shown. If you look at the string 206 representation you can see there is a trailing new line (shown as 207 slash n) which is important when writing to a file or if 208 concatenating mutliple sequence strings together. 209 210 Note that this method will NOT work on every possible file format 211 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 212 """ 213 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 214 #See also the Bio.Align.Generic.Alignment class and its format() 215 return self.__format__(format)
216
217 - def __format__(self, format_spec) :
218 """Returns the record as a string in the specified file format. 219 220 This method supports the python format() function added in 221 Python 2.6/3.0. The format_spec should be a lower case 222 string supported by Bio.SeqIO as an output file format. 223 See also the SeqRecord's format() method. 224 """ 225 if format_spec: 226 from StringIO import StringIO 227 from Bio import SeqIO 228 handle = StringIO() 229 SeqIO.write([self], handle, format_spec) 230 handle.seek(0) 231 return handle.read() 232 else : 233 #Follow python convention and default to using __str__ 234 return str(self)
235
236 - def __len__(self) :
237 """Returns the length of the sequence.""" 238 return len(self.seq)
239
240 - def __nonzero__(self) :
241 """Returns True regardless of the length of the sequence. 242 243 This behaviour is for backwards compatibility, since until the 244 __len__ method was added, a SeqRecord always evaluated as True. 245 246 Note that in comparison, a Seq object will evaluate to False if it 247 has a zero length sequence. 248 249 WARNING: The SeqRecord may in future evaluate to False when its 250 sequence is of zero length (in order to better match the Seq 251 object behaviour)! 252 """ 253 return True
254
255 -def _test():
256 """Run the Bio.SeqRecord module's doctests.""" 257 print "Runing doctests..." 258 import doctest 259 doctest.testmod() 260 print "Done"
261 262 if __name__ == "__main__": 263 _test() 264