1
2 """Represent a Sequence Record, a sequence with annotation."""
3
4
5
6
7
8
10 """A SeqRecord object holds a sequence and information about it.
11
12 Main attributes:
13 id - Identifier such as a locus tag (string)
14 seq - The sequence itself (Seq object)
15
16 Additional attributes:
17 name - Sequence name, e.g. gene name (string)
18 description - Additional text (string)
19 dbxrefs - List of database cross references (list of strings)
20 features - Any (sub)features defined (list of SeqFeature objects)
21 annotations - Further information about the whole sequence (dictionary)
22 Most entries are lists of strings.
23
24 You will typically use Bio.SeqIO to read in sequences from files as
25 SeqRecord objects. However, you may want to create your own SeqRecord
26 objects directly (see the __init__ method for further details).
27
28 e.g.
29 >>> from Bio.Seq import Seq
30 >>> from Bio.SeqRecord import SeqRecord
31 >>> from Bio.Alphabet import IUPAC
32 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
33 ... IUPAC.protein),
34 ... id="YP_025292.1", name="HokC",
35 ... description="toxic membrane protein, small")
36 >>> print record
37 ID: YP_025292.1
38 Name: HokC
39 Description: toxic membrane protein, small
40 Number of features: 0
41 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
42
43 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
44 for this. For the special case where you want the SeqRecord turned into
45 a string in a particular file format there is a format method which uses
46 Bio.SeqIO internally:
47
48 >>> print record.format("fasta")
49 >YP_025292.1 toxic membrane protein, small
50 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
51 <BLANKLINE>
52 """
53 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
54 description = "<unknown description>", dbxrefs = None,
55 features = None):
56 """Create a SeqRecord.
57
58 Arguments:
59 seq - Sequence, required (Seq object)
60 id - Sequence identifier, recommended (string)
61 name - Sequence name, optional (string)
62 description - Sequence description, optional (string)
63 dbxrefs - Database cross references, optional (list of strings)
64 features - Any (sub)features, optional (list of SeqFeature objects)
65
66 You will typically use Bio.SeqIO to read in sequences from files as
67 SeqRecord objects. However, you may want to create your own SeqRecord
68 objects directly.
69
70 Note that while an id is optional, we strongly recommend you supply a
71 unique id string for each record. This is especially important
72 if you wish to write your sequences to a file.
73
74 You can create a 'blank' SeqRecord object, and then populated the
75 attributes later. Note that currently the annotations dictionary
76 cannot be specified when creating the SeqRecord.
77 """
78 if id is not None and not isinstance(id, basestring) :
79
80 raise ValueError("id argument should be a string")
81 if not isinstance(name, basestring) :
82 raise ValueError("name argument should be a string")
83 if not isinstance(description, basestring) :
84 raise ValueError("description argument should be a string")
85 if dbxrefs is not None and not isinstance(dbxrefs, list) :
86 raise ValueError("dbxrefs argument should be a list (of strings)")
87 if features is not None and not isinstance(features, list) :
88 raise ValueError("features argument should be a list (of SeqFeature objects)")
89 self.seq = seq
90 self.id = id
91 self.name = name
92 self.description = description
93 if dbxrefs is None:
94 dbxrefs = []
95 self.dbxrefs = dbxrefs
96
97 self.annotations = {}
98
99
100 if features is None:
101 features = []
102 self.features = features
103
105 """A human readable summary of the record and its annotation (string).
106
107 The python built in function str works by calling the object's ___str__
108 method.
109
110 e.g.
111 >>> from Bio.Seq import Seq
112 >>> from Bio.SeqRecord import SeqRecord
113 >>> from Bio.Alphabet import IUPAC
114 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
115 ... IUPAC.protein),
116 ... id="YP_025292.1", name="HokC",
117 ... description="toxic membrane protein, small")
118 >>> print str(record)
119 ID: YP_025292.1
120 Name: HokC
121 Description: toxic membrane protein, small
122 Number of features: 0
123 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
124
125 In this example you don't actually need to call str explicity, as the
126 print command does this automatically:
127
128 >>> print record
129 ID: YP_025292.1
130 Name: HokC
131 Description: toxic membrane protein, small
132 Number of features: 0
133 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
134
135 Note that long sequences are shown truncated.
136 """
137 lines = []
138 if self.id : lines.append("ID: %s" % self.id)
139 if self.name : lines.append("Name: %s" % self.name)
140 if self.description : lines.append("Description: %s" % self.description)
141 if self.dbxrefs : lines.append("Database cross-references: " \
142 + ", ".join(self.dbxrefs))
143 lines.append("Number of features: %i" % len(self.features))
144 for a in self.annotations:
145 lines.append("/%s=%s" % (a, str(self.annotations[a])))
146
147
148 lines.append(repr(self.seq))
149 return "\n".join(lines)
150
152 """A concise summary of the record for debugging (string).
153
154 The python built in function repr works by calling the object's ___repr__
155 method.
156
157 e.g.
158 >>> from Bio.Seq import Seq
159 >>> from Bio.SeqRecord import SeqRecord
160 >>> from Bio.Alphabet import generic_protein
161 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
162 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
163 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
164 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
165 ... generic_protein),
166 ... id="NP_418483.1", name="b4059",
167 ... description="ssDNA-binding protein",
168 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
169 >>> print repr(rec)
170 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
171
172 At the python prompt you can also use this shorthand:
173
174 >>> rec
175 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
176
177 Note that long sequences are shown truncated.
178 """
179 return self.__class__.__name__ \
180 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
181 % tuple(map(repr, (self.seq, self.id, self.name,
182 self.description, self.dbxrefs)))
183
216
235
237 """Returns the length of the sequence."""
238 return len(self.seq)
239
241 """Returns True regardless of the length of the sequence.
242
243 This behaviour is for backwards compatibility, since until the
244 __len__ method was added, a SeqRecord always evaluated as True.
245
246 Note that in comparison, a Seq object will evaluate to False if it
247 has a zero length sequence.
248
249 WARNING: The SeqRecord may in future evaluate to False when its
250 sequence is of zero length (in order to better match the Seq
251 object behaviour)!
252 """
253 return True
254
256 """Run the Bio.SeqRecord module's doctests."""
257 print "Runing doctests..."
258 import doctest
259 doctest.testmod()
260 print "Done"
261
262 if __name__ == "__main__":
263 _test()
264