1
2
3
4
5
6 """
7 This module provides code to work with the sprotXX.dat file from
8 SwissProt.
9 http://www.expasy.ch/sprot/sprot-top.html
10
11 Tested with:
12 Release 37, Release 38, Release 39
13
14 Limited testing with:
15 Release 51, 54
16
17
18 Classes:
19 Record Holds SwissProt data.
20 Reference Holds reference data from a SwissProt entry.
21 Iterator Iterates over entries in a SwissProt file.
22 Dictionary Accesses a SwissProt file using a dictionary interface.
23 RecordParser Parses a SwissProt record into a Record object.
24 SequenceParser Parses a SwissProt record into a SeqRecord object.
25
26 _Scanner Scans SwissProt-formatted data.
27 _RecordConsumer Consumes SwissProt data to a SProt.Record object.
28 _SequenceConsumer Consumes SwissProt data to a SeqRecord object.
29
30
31 Functions:
32 index_file Index a SwissProt file for a Dictionary.
33
34 """
35 from types import *
36 import os
37 from Bio import File
38 from Bio import Index
39 from Bio import Alphabet
40 from Bio import Seq
41 from Bio import SeqRecord
42 from Bio.ParserSupport import *
43
44 _CHOMP = " \n\r\t.,;"
45
47 """Holds information from a SwissProt record.
48
49 Members:
50 entry_name Name of this entry, e.g. RL1_ECOLI.
51 data_class Either 'STANDARD' or 'PRELIMINARY'.
52 molecule_type Type of molecule, 'PRT',
53 sequence_length Number of residues.
54
55 accessions List of the accession numbers, e.g. ['P00321']
56 created A tuple of (date, release).
57 sequence_update A tuple of (date, release).
58 annotation_update A tuple of (date, release).
59
60 description Free-format description.
61 gene_name Gene name. See userman.txt for description.
62 organism The source of the sequence.
63 organelle The origin of the sequence.
64 organism_classification The taxonomy classification. List of strings.
65 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
66 taxonomy_id A list of NCBI taxonomy id's.
67 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
68 if any.
69 references List of Reference objects.
70 comments List of strings.
71 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
72 keywords List of the keywords.
73 features List of tuples (key name, from, to, description).
74 from and to can be either integers for the residue
75 numbers, '<', '>', or '?'
76
77 seqinfo tuple of (length, molecular weight, CRC32 value)
78 sequence The sequence.
79
80 """
82 self.entry_name = None
83 self.data_class = None
84 self.molecule_type = None
85 self.sequence_length = None
86
87 self.accessions = []
88 self.created = None
89 self.sequence_update = None
90 self.annotation_update = None
91
92 self.description = ''
93 self.gene_name = ''
94 self.organism = ''
95 self.organelle = ''
96 self.organism_classification = []
97 self.taxonomy_id = []
98 self.host_organism = []
99 self.references = []
100 self.comments = []
101 self.cross_references = []
102 self.keywords = []
103 self.features = []
104
105 self.seqinfo = None
106 self.sequence = ''
107
109 """Holds information from 1 references in a SwissProt entry.
110
111 Members:
112 number Number of reference in an entry.
113 positions Describes extent of work. list of strings.
114 comments Comments. List of (token, text).
115 references References. List of (dbname, identifier)
116 authors The authors of the work.
117 title Title of the work.
118 location A citation for the work.
119
120 """
122 self.number = None
123 self.positions = []
124 self.comments = []
125 self.references = []
126 self.authors = ''
127 self.title = ''
128 self.location = ''
129
131 """Returns one record at a time from a SwissProt file.
132
133 Methods:
134 next Return the next record from the stream, or None.
135
136 """
137 - def __init__(self, handle, parser=None):
138 """__init__(self, handle, parser=None)
139
140 Create a new iterator. handle is a file-like object. parser
141 is an optional Parser object to change the results into another form.
142 If set to None, then the raw contents of the file will be returned.
143
144 """
145 import warnings
146 warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use the function Bio.SwissProt.parse instead if you want to get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).",
147 DeprecationWarning)
148
149 if type(handle) is not FileType and type(handle) is not InstanceType:
150 raise ValueError("I expected a file handle or file-like object")
151 self._uhandle = File.UndoHandle(handle)
152 self._parser = parser
153
155 """next(self) -> object
156
157 Return the next swissprot record from the file. If no more records,
158 return None.
159
160 """
161 lines = []
162 while 1:
163 line = self._uhandle.readline()
164 if not line:
165 break
166 lines.append(line)
167 if line[:2] == '//':
168 break
169
170 if not lines:
171 return None
172
173 data = ''.join(lines)
174 if self._parser is not None:
175 return self._parser.parse(File.StringHandle(data))
176 return data
177
179 return iter(self.next, None)
180
182 """Accesses a SwissProt file using a dictionary interface.
183
184 """
185 __filename_key = '__filename'
186
187 - def __init__(self, indexname, parser=None):
188 """__init__(self, indexname, parser=None)
189
190 Open a SwissProt Dictionary. indexname is the name of the
191 index for the dictionary. The index should have been created
192 using the index_file function. parser is an optional Parser
193 object to change the results into another form. If set to None,
194 then the raw contents of the file will be returned.
195
196 """
197 self._index = Index.Index(indexname)
198 self._handle = open(self._index[self.__filename_key])
199 self._parser = parser
200
202 return len(self._index)
203
211
213 return getattr(self._index, name)
214
220
222 """Access SwissProt at ExPASy using a read-only dictionary interface.
223
224 """
225 - def __init__(self, delay=5.0, parser=None):
226 """__init__(self, delay=5.0, parser=None)
227
228 Create a new Dictionary to access SwissProt. parser is an optional
229 parser (e.g. SProt.RecordParser) object to change the results
230 into another form. If set to None, then the raw contents of the
231 file will be returned. delay is the number of seconds to wait
232 between each query.
233
234 """
235 import warnings
236 from Bio.WWW import RequestLimiter
237 warnings.warn("Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.",
238 DeprecationWarning)
239 self.parser = parser
240 self.limiter = RequestLimiter(delay)
241
243 raise NotImplementedError("SwissProt contains lots of entries")
245 raise NotImplementedError("This is a read-only dictionary")
247 raise NotImplementedError("This is a read-only dictionary")
249 raise NotImplementedError("This is a read-only dictionary")
251 raise NotImplementedError("You don't need to do this...")
253 raise NotImplementedError("You don't really want to do this...")
255 raise NotImplementedError("You don't really want to do this...")
257 raise NotImplementedError("You don't really want to do this...")
258
260 """has_key(self, id) -> bool"""
261 try:
262 self[id]
263 except KeyError:
264 return 0
265 return 1
266
267 - def get(self, id, failobj=None):
268 try:
269 return self[id]
270 except KeyError:
271 return failobj
272
274 """__getitem__(self, id) -> object
275
276 Return a SwissProt entry. id is either the id or accession
277 for the entry. Raises a KeyError if there's an error.
278
279 """
280 from Bio import ExPASy
281
282
283 self.limiter.wait()
284
285 try:
286 handle = ExPASy.get_sprot_raw(id)
287 except IOError:
288 raise KeyError(id)
289
290 if self.parser is not None:
291 return self.parser.parse(handle)
292 return handle.read()
293
295 """Parses SwissProt data into a Record object.
296
297 """
301
302 - def parse(self, handle):
303 self._scanner.feed(handle, self._consumer)
304 return self._consumer.data
305
307 """Parses SwissProt data into a standard SeqRecord object.
308 """
310 """Initialize a SequenceParser.
311
312 Arguments:
313 o alphabet - The alphabet to use for the generated Seq objects. If
314 not supplied this will default to the generic protein alphabet.
315 """
316 self._scanner = _Scanner()
317 self._consumer = _SequenceConsumer(alphabet)
318
319 - def parse(self, handle):
320 self._scanner.feed(handle, self._consumer)
321 return self._consumer.data
322
324 """Scans SwissProt-formatted data.
325
326 Tested with:
327 Release 37
328 Release 38
329 """
330
331 - def feed(self, handle, consumer):
332 """feed(self, handle, consumer)
333
334 Feed in SwissProt data for scanning. handle is a file-like
335 object that contains swissprot data. consumer is a
336 Consumer object that will receive events as the report is scanned.
337
338 """
339 if isinstance(handle, File.UndoHandle):
340 uhandle = handle
341 else:
342 uhandle = File.UndoHandle(handle)
343 self._scan_record(uhandle, consumer)
344
346 """Ignores any lines starting **"""
347
348
349
350
351
352
353
354 while "**" == uhandle.peekline()[:2] :
355 skip = uhandle.readline()
356
357
371
372 - def _scan_line(self, line_type, uhandle, event_fn,
373 exactly_one=None, one_or_more=None, any_number=None,
374 up_to_one=None):
392
395
401
407
411
414
418
421
425
429
433
451
455
459
463
467
481
485
491
494
498
501
504
507
510
513
516
517 _scan_fns = [
518 _scan_id,
519 _scan_ac,
520 _scan_dt,
521 _scan_de,
522 _scan_gn,
523 _scan_os,
524 _scan_og,
525 _scan_oc,
526 _scan_ox,
527 _scan_oh,
528 _scan_reference,
529 _scan_cc,
530 _scan_dr,
531 _scan_pe,
532 _scan_kw,
533 _scan_ft,
534 _scan_sq,
535 _scan_sequence_data,
536 _scan_terminator
537 ]
538
540 """Consumer that converts a SwissProt record to a Record object.
541
542 Members:
543 data Record with SwissProt data.
544
545 """
548
550 return "Bio.SwissProt.SProt._RecordConsumer()"
551
553 self.data = Record()
554 self._sequence_lines = []
555
559
561 cols = line.split()
562
563
564
565
566
567
568
569
570 if len(cols) == 6 :
571 self.data.entry_name = cols[1]
572 self.data.data_class = cols[2].rstrip(_CHOMP)
573 self.data.molecule_type = cols[3].rstrip(_CHOMP)
574 self.data.sequence_length = int(cols[4])
575 elif len(cols) == 5 :
576 self.data.entry_name = cols[1]
577 self.data.data_class = cols[2].rstrip(_CHOMP)
578 self.data.molecule_type = None
579 self.data.sequence_length = int(cols[3])
580 else :
581
582 raise ValueError("ID line has unrecognised format:\n"+line)
583
584
585
586
587 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI',
588 'Reviewed', 'Unreviewed']:
589 raise ValueError("Unrecognized data class %s in line\n%s" % \
590 (self.data.data_class, line))
591
592
593 if self.data.molecule_type is not None \
594 and self.data.molecule_type != 'PRT':
595 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
596 (self.data.molecule_type, line))
597
604
605 - def date(self, line):
606 uprline = line.upper()
607 cols = line.rstrip().split()
608
609 if uprline.find('CREATED') >= 0 \
610 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \
611 or uprline.find('LAST ANNOTATION UPDATE') >= 0:
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627 uprcols = uprline.split()
628 rel_index = -1
629 for index in range(len(uprcols)):
630 if uprcols[index].find("REL.") >= 0:
631 rel_index = index
632 assert rel_index >= 0, \
633 "Could not find Rel. in DT line: %s" % (line)
634 version_index = rel_index + 1
635
636 str_version = cols[version_index].rstrip(_CHOMP)
637
638 if str_version == '':
639 version = 0
640
641 elif str_version.find(".") >= 0:
642 version = str_version
643
644 else:
645 version = int(str_version)
646
647 if uprline.find('CREATED') >= 0:
648 self.data.created = cols[1], version
649 elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
650 self.data.sequence_update = cols[1], version
651 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
652 self.data.annotation_update = cols[1], version
653 else:
654 assert False, "Shouldn't reach this line!"
655 elif uprline.find('INTEGRATED INTO') >= 0 \
656 or uprline.find('SEQUENCE VERSION') >= 0 \
657 or uprline.find('ENTRY VERSION') >= 0:
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678 try:
679 version = int(cols[-1])
680 except ValueError :
681 version = 0
682
683
684
685 if uprline.find("INTEGRATED") >= 0:
686 self.data.created = cols[1], version
687 elif uprline.find('SEQUENCE VERSION') >= 0:
688 self.data.sequence_update = cols[1], version
689 elif uprline.find( 'ENTRY VERSION') >= 0:
690 self.data.annotation_update = cols[1], version
691 else:
692 assert False, "Shouldn't reach this line!"
693 else:
694 raise ValueError("I don't understand the date line %s" % line)
695
698
701
704
707
713
735
748
750 rn = line[5:].rstrip()
751 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
752 ref = Reference()
753 ref.number = int(rn[1:-1])
754 self.data.references.append(ref)
755
757 assert self.data.references, "RP: missing RN"
758 self.data.references[-1].positions.append(line[5:].rstrip())
759
787
789 assert self.data.references, "RX: missing RN"
790
791
792
793
794
795
796
797
798 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
799 if ind >= 0:
800 line = line[:ind]
801
802
803
804
805
806
807
808
809 if line.find("=") != -1:
810 cols = line[2:].split("; ")
811 cols = [x.strip() for x in cols]
812 cols = [x for x in cols if x]
813 for col in cols:
814 x = col.split("=")
815 assert len(x) == 2, "I don't understand RX line %s" % line
816 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
817 ref = self.data.references[-1].references
818 ref.append((key, value))
819
820 else:
821 cols = line.split()
822
823 assert len(cols) == 3, "I don't understand RX line %s" % line
824 self.data.references[-1].references.append(
825 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
826
828 assert self.data.references, "RA: missing RN"
829 ref = self.data.references[-1]
830 ref.authors += line[5:]
831
833 assert self.data.references, "RT: missing RN"
834 ref = self.data.references[-1]
835 ref.title += line[5:]
836
838 assert self.data.references, "RL: missing RN"
839 ref = self.data.references[-1]
840 ref.location += line[5:]
841
860
862
863
864
865
866 line = line[5:]
867
868 i = line.find('[')
869 if i >= 0:
870 line = line[:i]
871 cols = line.rstrip(_CHOMP).split(';')
872 cols = [col.lstrip() for col in cols]
873 self.data.cross_references.append(tuple(cols))
874
878
906
908 """Remove unwanted spaces in sequences.
909
910 During line carryover, the sequences in VARSPLIC can get mangled
911 with unwanted spaces like:
912 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
913 We want to check for this case and correct it as it happens.
914 """
915 descr_cols = description.split(" -> ")
916 if len(descr_cols) == 2:
917 first_seq = descr_cols[0]
918 second_seq = descr_cols[1]
919 extra_info = ''
920
921
922 extra_info_pos = second_seq.find(" (")
923 if extra_info_pos != -1:
924 extra_info = second_seq[extra_info_pos:]
925 second_seq = second_seq[:extra_info_pos]
926
927
928 first_seq = first_seq.replace(" ", "")
929 second_seq = second_seq.replace(" ", "")
930
931
932 description = first_seq + " -> " + second_seq + extra_info
933
934 return description
935
939
941 cols = line.split()
942 assert len(cols) == 8, "I don't understand SQ line %s" % line
943
944 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
945
947
948 self._sequence_lines.append(line.replace(" ", "").rstrip())
949
952
953
954
955
956
957
959
960 members = ['description', 'gene_name', 'organism', 'organelle']
961 for m in members:
962 attr = getattr(rec, m)
963 setattr(rec, m, attr.rstrip())
964 for ref in rec.references:
965 self._clean_references(ref)
966
968
969 members = ['authors', 'title', 'location']
970 for m in members:
971 attr = getattr(ref, m)
972 setattr(ref, m, attr.rstrip())
973
975 """Consumer that converts a SwissProt record to a SeqRecord object.
976
977 Members:
978 data Record with SwissProt data.
979 alphabet The alphabet the generated Seq objects will have.
980 """
981
983 """Initialize a Sequence Consumer
984
985 Arguments:
986 o alphabet - The alphabet to use for the generated Seq objects. If
987 not supplied this will default to the generic protein alphabet.
988 """
989 self.data = None
990 self.alphabet = alphabet
991
999
1006
1010
1025
1026
1030
1032
1033 self._sequence_lines.append(line.replace(" ", "").rstrip())
1034
1041
1050
1051
1070
1071
1072
1073
1074 - def date(self, line):
1075 date_str = line.split()[0]
1076 uprline = line.upper()
1077 if uprline.find('CREATED') >= 0 :
1078
1079
1080 self.data.annotations['date'] = date_str
1081 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 :
1082
1083 self.data.annotations['date_last_sequence_update'] = date_str
1084 elif uprline.find('LAST ANNOTATION UPDATE') >= 0:
1085
1086 self.data.annotations['date_last_annotation_update'] = date_str
1087
1100
1110
1127
1140
1142
1143
1144
1145
1146
1147 line = line[5:].rstrip(_CHOMP)
1148 index = line.find('=')
1149 if index >= 0:
1150 descr = line[:index]
1151 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
1152 ids = line[index+1:].split(',')
1153 else:
1154 ids = line.split(',')
1155
1156 try :
1157
1158 self.data.annotations['ncbi_taxid'].extend(ids)
1159 except KeyError:
1160 self.data.annotations['ncbi_taxid'] = ids
1161
1173
1175 """RP line, reference position."""
1176 assert self._current_ref is not None, "RP: missing RN"
1177
1178
1179
1180 pass
1181
1183 """RX line, reference cross-references."""
1184 assert self._current_ref is not None, "RX: missing RN"
1185
1186
1187
1188
1189
1190
1191 if line.find("=") != -1:
1192 cols = line[2:].split("; ")
1193 cols = [x.strip() for x in cols]
1194 cols = [x for x in cols if x]
1195 for col in cols:
1196 x = col.split("=")
1197 assert len(x) == 2, "I don't understand RX line %s" % line
1198 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
1199 if key == "MEDLINE" :
1200 self._current_ref.medline_id = value
1201 elif key == "PubMed" :
1202 self._current_ref.pubmed_id = value
1203 else :
1204
1205
1206 pass
1207
1208 else:
1209
1210
1211
1212
1213 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
1214 if ind >= 0:
1215 line = line[:ind]
1216 cols = line.split()
1217
1218 assert len(cols) == 3, "I don't understand RX line %s" % line
1219 key = cols[1].rstrip(_CHOMP)
1220 value = cols[2].rstrip(_CHOMP)
1221 if key == "MEDLINE" :
1222 self._current_ref.medline_id = value
1223 elif key == "PubMed" :
1224 self._current_ref.pubmed_id = value
1225 else :
1226
1227
1228 pass
1229
1231 """RA line, reference author(s)."""
1232 assert self._current_ref is not None, "RA: missing RN"
1233 self._current_ref.authors += line[5:].rstrip("\n")
1234
1236 """RT line, reference title."""
1237 assert self._current_ref is not None, "RT: missing RN"
1238 self._current_ref.title += line[5:].rstrip("\n")
1239
1241 """RL line, reference 'location' - journal, volume, pages, year."""
1242 assert self._current_ref is not None, "RL: missing RN"
1243 self._current_ref.journal += line[5:].rstrip("\n")
1244
1251
1252 -def index_file(filename, indexname, rec2key=None):
1253 """index_file(filename, indexname, rec2key=None)
1254
1255 Index a SwissProt file. filename is the name of the file.
1256 indexname is the name of the dictionary. rec2key is an
1257 optional callback that takes a Record and generates a unique key
1258 (e.g. the accession number) for the record. If not specified,
1259 the entry name will be used.
1260
1261 """
1262 from Bio.SwissProt import parse
1263 if not os.path.exists(filename):
1264 raise ValueError("%s does not exist" % filename)
1265
1266 index = Index.Index(indexname, truncate=1)
1267 index[Dictionary._Dictionary__filename_key] = filename
1268
1269 handle = open(filename)
1270 records = parse(handle)
1271 end = 0L
1272 for record in records:
1273 start = end
1274 end = long(handle.tell())
1275 length = end - start
1276
1277 if rec2key is not None:
1278 key = rec2key(record)
1279 else:
1280 key = record.entry_name
1281
1282 if not key:
1283 raise KeyError("empty sequence key was produced")
1284 elif key in index:
1285 raise KeyError("duplicate key %s found" % key)
1286
1287 index[key] = start, length
1288