1
2
3
4
5 """
6 This module provides code to work with the sprotXX.dat file from
7 SwissProt.
8 http://www.expasy.ch/sprot/sprot-top.html
9
10 Tested with:
11 Release 56.9, 03-March-2009.
12
13
14 Classes:
15 Record Holds SwissProt data.
16 Reference Holds reference data from a SwissProt record.
17
18 Functions:
19 read Read one SwissProt record
20 parse Read multiple SwissProt records
21
22 """
23
24 from Bio._py3k import _as_string
25
27 """Holds information from a SwissProt record.
28
29 Members:
30 entry_name Name of this entry, e.g. RL1_ECOLI.
31 data_class Either 'STANDARD' or 'PRELIMINARY'.
32 molecule_type Type of molecule, 'PRT',
33 sequence_length Number of residues.
34
35 accessions List of the accession numbers, e.g. ['P00321']
36 created A tuple of (date, release).
37 sequence_update A tuple of (date, release).
38 annotation_update A tuple of (date, release).
39
40 description Free-format description.
41 gene_name Gene name. See userman.txt for description.
42 organism The source of the sequence.
43 organelle The origin of the sequence.
44 organism_classification The taxonomy classification. List of strings.
45 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
46 taxonomy_id A list of NCBI taxonomy id's.
47 host_organism A list of names of the hosts of a virus, if any.
48 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any.
49 references List of Reference objects.
50 comments List of strings.
51 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
52 keywords List of the keywords.
53 features List of tuples (key name, from, to, description).
54 from and to can be either integers for the residue
55 numbers, '<', '>', or '?'
56
57 seqinfo tuple of (length, molecular weight, CRC32 value)
58 sequence The sequence.
59
60 """
62 self.entry_name = None
63 self.data_class = None
64 self.molecule_type = None
65 self.sequence_length = None
66
67 self.accessions = []
68 self.created = None
69 self.sequence_update = None
70 self.annotation_update = None
71
72 self.description = []
73 self.gene_name = ''
74 self.organism = []
75 self.organelle = ''
76 self.organism_classification = []
77 self.taxonomy_id = []
78 self.host_organism = []
79 self.host_taxonomy_id = []
80 self.references = []
81 self.comments = []
82 self.cross_references = []
83 self.keywords = []
84 self.features = []
85
86 self.seqinfo = None
87 self.sequence = ''
88
89
91 """Holds information from one reference in a SwissProt entry.
92
93 Members:
94 number Number of reference in an entry.
95 positions Describes extent of work. list of strings.
96 comments Comments. List of (token, text).
97 references References. List of (dbname, identifier)
98 authors The authors of the work.
99 title Title of the work.
100 location A citation for the work.
101
102 """
104 self.number = None
105 self.positions = []
106 self.comments = []
107 self.references = []
108 self.authors = []
109 self.title = []
110 self.location = []
111
112
119
120
130
131
132
133
134
136 record = None
137 unread = ""
138 for line in handle:
139
140
141 line = _as_string(line)
142 key, value = line[:2], line[5:].rstrip()
143 if unread:
144 value = unread + " " + value
145 unread = ""
146 if key=='**':
147
148
149
150
151
152
153 pass
154 elif key=='ID':
155 record = Record()
156 _read_id(record, line)
157 _sequence_lines = []
158 elif key=='AC':
159 accessions = [word for word in value.rstrip(";").split("; ")]
160 record.accessions.extend(accessions)
161 elif key=='DT':
162 _read_dt(record, line)
163 elif key=='DE':
164 record.description.append(value.strip())
165 elif key=='GN':
166 if record.gene_name:
167 record.gene_name += " "
168 record.gene_name += value
169 elif key=='OS':
170 record.organism.append(value)
171 elif key=='OG':
172 record.organelle += line[5:]
173 elif key=='OC':
174 cols = [col for col in value.rstrip(";.").split("; ")]
175 record.organism_classification.extend(cols)
176 elif key=='OX':
177 _read_ox(record, line)
178 elif key=='OH':
179 _read_oh(record, line)
180 elif key=='RN':
181 reference = Reference()
182 _read_rn(reference, value)
183 record.references.append(reference)
184 elif key=='RP':
185 assert record.references, "RP: missing RN"
186 record.references[-1].positions.append(value)
187 elif key=='RC':
188 assert record.references, "RC: missing RN"
189 reference = record.references[-1]
190 unread = _read_rc(reference, value)
191 elif key=='RX':
192 assert record.references, "RX: missing RN"
193 reference = record.references[-1]
194 _read_rx(reference, value)
195 elif key=='RL':
196 assert record.references, "RL: missing RN"
197 reference = record.references[-1]
198 reference.location.append(value)
199
200
201
202 elif key=='RA':
203 assert record.references, "RA: missing RN"
204 reference = record.references[-1]
205 reference.authors.append(value)
206 elif key=='RG':
207 assert record.references, "RG: missing RN"
208 reference = record.references[-1]
209 reference.authors.append(value)
210 elif key=="RT":
211 assert record.references, "RT: missing RN"
212 reference = record.references[-1]
213 reference.title.append(value)
214 elif key=='CC':
215 _read_cc(record, line)
216 elif key=='DR':
217 _read_dr(record, value)
218 elif key=='PE':
219
220 pass
221 elif key=='KW':
222 cols = value.rstrip(";.").split('; ')
223 record.keywords.extend(cols)
224 elif key=='FT':
225 _read_ft(record, line)
226 elif key=='SQ':
227 cols = value.split()
228 assert len(cols) == 7, "I don't understand SQ line %s" % line
229
230 record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
231 elif key==' ':
232 _sequence_lines.append(value.replace(" ", "").rstrip())
233 elif key=='//':
234
235 record.description = " ".join(record.description)
236 record.organism = " ".join(record.organism)
237 record.organelle = record.organelle.rstrip()
238 for reference in record.references:
239 reference.authors = " ".join(reference.authors).rstrip(";")
240 reference.title = " ".join(reference.title).rstrip(";")
241 if reference.title.startswith('"') and reference.title.endswith('"'):
242 reference.title = reference.title[1:-1]
243 reference.location = " ".join(reference.location)
244 record.sequence = "".join(_sequence_lines)
245 return record
246 else:
247 raise ValueError("Unknown keyword '%s' found" % key)
248 if record:
249 raise ValueError("Unexpected end of stream.")
250
251
253 cols = line[5:].split()
254
255
256
257
258
259 if len(cols) == 5:
260 record.entry_name = cols[0]
261 record.data_class = cols[1].rstrip(";")
262 record.molecule_type = cols[2].rstrip(";")
263 record.sequence_length = int(cols[3])
264 elif len(cols) == 4:
265 record.entry_name = cols[0]
266 record.data_class = cols[1].rstrip(";")
267 record.molecule_type = None
268 record.sequence_length = int(cols[2])
269 else:
270 raise ValueError("ID line has unrecognised format:\n"+line)
271
272 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
273 if record.data_class not in allowed:
274 raise ValueError("Unrecognized data class %s in line\n%s" % \
275 (record.data_class, line))
276
277
278 if record.molecule_type not in (None, 'PRT'):
279 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
280 (record.molecule_type, line))
281
282
284 value = line[5:]
285 uprline = value.upper()
286 cols = value.rstrip().split()
287 if 'CREATED' in uprline \
288 or 'LAST SEQUENCE UPDATE' in uprline \
289 or 'LAST ANNOTATION UPDATE' in uprline:
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305 uprcols = uprline.split()
306 rel_index = -1
307 for index in range(len(uprcols)):
308 if uprcols[index].find("REL.") >= 0:
309 rel_index = index
310 assert rel_index >= 0, \
311 "Could not find Rel. in DT line: %s" % line
312 version_index = rel_index + 1
313
314 str_version = cols[version_index].rstrip(",")
315
316 if str_version == '':
317 version = 0
318
319 elif str_version.find(".") >= 0:
320 version = str_version
321
322 else:
323 version = int(str_version)
324 date = cols[0]
325
326 if 'CREATED' in uprline:
327 record.created = date, version
328 elif 'LAST SEQUENCE UPDATE' in uprline:
329 record.sequence_update = date, version
330 elif 'LAST ANNOTATION UPDATE' in uprline:
331 record.annotation_update = date, version
332 else:
333 assert False, "Shouldn't reach this line!"
334 elif 'INTEGRATED INTO' in uprline \
335 or 'SEQUENCE VERSION' in uprline \
336 or 'ENTRY VERSION' in uprline:
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357 try:
358 version = int(cols[-1])
359 except ValueError:
360 version = 0
361 date = cols[0].rstrip(",")
362
363
364
365 if "INTEGRATED" in uprline:
366 record.created = date, version
367 elif 'SEQUENCE VERSION' in uprline:
368 record.sequence_update = date, version
369 elif 'ENTRY VERSION' in uprline:
370 record.annotation_update = date, version
371 else:
372 assert False, "Shouldn't reach this line!"
373 else:
374 raise ValueError("I don't understand the date line %s" % line)
375
376
378
379
380
381
382
383
384
385
386
387
388 if record.taxonomy_id:
389 ids = line[5:].rstrip().rstrip(";")
390 else:
391 descr, ids = line[5:].rstrip().rstrip(";").split("=")
392 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
393 record.taxonomy_id.extend(ids.split(', '))
394
395
404
405
407 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
408 reference.number = int(rn[1:-1])
409
410
431
432
434
435
436
437
438
439
440
441
442 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','')
443
444
445
446
447
448
449
450
451 if "=" in value:
452 cols = value.split("; ")
453 cols = [x.strip() for x in cols]
454 cols = [x for x in cols if x]
455 for col in cols:
456 x = col.split("=")
457 assert len(x) == 2, "I don't understand RX line %s" % value
458 key, value = x[0], x[1].rstrip(";")
459 reference.references.append((key, value))
460
461 else:
462 cols = value.split("; ")
463
464 assert len(cols) == 2, "I don't understand RX line %s" % value
465 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
466
467
478
479
487
488
490 line = line[5:]
491 name = line[0:8].rstrip()
492 try:
493 from_res = int(line[9:15])
494 except ValueError:
495 from_res = line[9:15].lstrip()
496 try:
497 to_res = int(line[16:22])
498 except ValueError:
499 to_res = line[16:22].lstrip()
500
501 if line[29:35]==r"/FTId=":
502 ft_id = line[35:70].rstrip()[:-1]
503 description = ""
504 else:
505 ft_id =""
506 description = line[29:70].rstrip()
507 if not name:
508 assert not from_res and not to_res
509 name, from_res, to_res, old_description,old_ft_id = record.features[-1]
510 del record.features[-1]
511 description = ("%s %s" % (old_description, description)).strip()
512
513
514 if name == "VARSPLIC":
515
516
517
518
519
520 descr_cols = description.split(" -> ")
521 if len(descr_cols) == 2:
522 first_seq, second_seq = descr_cols
523 extra_info = ''
524
525
526 extra_info_pos = second_seq.find(" (")
527 if extra_info_pos != -1:
528 extra_info = second_seq[extra_info_pos:]
529 second_seq = second_seq[:extra_info_pos]
530
531 first_seq = first_seq.replace(" ", "")
532 second_seq = second_seq.replace(" ", "")
533
534 description = first_seq + " -> " + second_seq + extra_info
535 record.features.append((name, from_res, to_res, description,ft_id))
536
537
538 if __name__ == "__main__":
539 print "Quick self test..."
540
541 example_filename = "../../Tests/SwissProt/sp008"
542
543 import os
544 if not os.path.isfile(example_filename):
545 print "Missing test file %s" % example_filename
546 else:
547
548
549 handle = open(example_filename)
550 records = parse(handle)
551 for record in records:
552 print record.entry_name
553 print ",".join(record.accessions)
554 print record.keywords
555 print repr(record.organism)
556 print record.sequence[:20] + "..."
557 handle.close()
558