1
2
3
4
5
6
7 """
8 This module provides code to work with the prosite dat file from
9 Prosite.
10 http://www.expasy.ch/prosite/
11
12 Tested with:
13 Release 15.0, July 1998
14 Release 16.0, July 1999
15 Release 17.0, Dec 2001
16 Release 19.0, Mar 2006
17
18
19 Functions:
20 parse Iterates over entries in a Prosite file.
21 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
22 index_file Index a Prosite file for a Dictionary.
23 _extract_record Extract Prosite data from a web page.
24 _extract_pattern_hits Extract Prosite patterns from a web page.
25
26
27 Classes:
28 Record Holds Prosite data.
29 PatternHit Holds data from a hit against a Prosite pattern.
30 Dictionary Accesses a Prosite file using a dictionary interface.
31 RecordParser Parses a Prosite record into a Record object.
32 Iterator Iterates over entries in a Prosite file; DEPRECATED.
33
34 _Scanner Scans Prosite-formatted data.
35 _RecordConsumer Consumes Prosite data to a Record object.
36
37 """
38 from types import *
39 import re
40 import sgmllib
41 from Bio import File
42 from Bio import Index
43 from Bio.ParserSupport import *
44
45
46
47
48
62
77
79 """Holds information from a Prosite record.
80
81 Members:
82 name ID of the record. e.g. ADH_ZINC
83 type Type of entry. e.g. PATTERN, MATRIX, or RULE
84 accession e.g. PS00387
85 created Date the entry was created. (MMM-YYYY)
86 data_update Date the 'primary' data was last updated.
87 info_update Date data other than 'primary' data was last updated.
88 pdoc ID of the PROSITE DOCumentation.
89
90 description Free-format description.
91 pattern The PROSITE pattern. See docs.
92 matrix List of strings that describes a matrix entry.
93 rules List of rule definitions (from RU lines). (strings)
94 prorules List of prorules (from PR lines). (strings)
95
96 NUMERICAL RESULTS
97 nr_sp_release SwissProt release.
98 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
99 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
100 nr_positive True positives. tuple of (hits, seqs)
101 nr_unknown Could be positives. tuple of (hits, seqs)
102 nr_false_pos False positives. tuple of (hits, seqs)
103 nr_false_neg False negatives. (int)
104 nr_partial False negatives, because they are fragments. (int)
105
106 COMMENTS
107 cc_taxo_range Taxonomic range. See docs for format
108 cc_max_repeat Maximum number of repetitions in a protein
109 cc_site Interesting site. list of tuples (pattern pos, desc.)
110 cc_skip_flag Can this entry be ignored?
111 cc_matrix_type
112 cc_scaling_db
113 cc_author
114 cc_ft_key
115 cc_ft_desc
116 cc_version version number (introduced in release 19.0)
117
118 DATA BANK REFERENCES - The following are all
119 lists of tuples (swiss-prot accession,
120 swiss-prot name)
121 dr_positive
122 dr_false_neg
123 dr_false_pos
124 dr_potential Potential hits, but fingerprint region not yet available.
125 dr_unknown Could possibly belong
126
127 pdb_structs List of PDB entries.
128
129 """
131 self.name = ''
132 self.type = ''
133 self.accession = ''
134 self.created = ''
135 self.data_update = ''
136 self.info_update = ''
137 self.pdoc = ''
138
139 self.description = ''
140 self.pattern = ''
141 self.matrix = []
142 self.rules = []
143 self.prorules = []
144 self.postprocessing = []
145
146 self.nr_sp_release = ''
147 self.nr_sp_seqs = ''
148 self.nr_total = (None, None)
149 self.nr_positive = (None, None)
150 self.nr_unknown = (None, None)
151 self.nr_false_pos = (None, None)
152 self.nr_false_neg = None
153 self.nr_partial = None
154
155 self.cc_taxo_range = ''
156 self.cc_max_repeat = ''
157 self.cc_site = []
158 self.cc_skip_flag = ''
159
160 self.dr_positive = []
161 self.dr_false_neg = []
162 self.dr_false_pos = []
163 self.dr_potential = []
164 self.dr_unknown = []
165
166 self.pdb_structs = []
167
169 """Holds information from a hit against a Prosite pattern.
170
171 Members:
172 name ID of the record. e.g. ADH_ZINC
173 accession e.g. PS00387
174 pdoc ID of the PROSITE DOCumentation.
175 description Free-format description.
176 matches List of tuples (start, end, sequence) where
177 start and end are indexes of the match, and sequence is
178 the sequence matched.
179
180 """
188 lines = []
189 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
190 lines.append(self.description)
191 lines.append('')
192 if len(self.matches) > 1:
193 lines.append("Number of matches: %s" % len(self.matches))
194 for i in range(len(self.matches)):
195 start, end, seq = self.matches[i]
196 range_str = "%d-%d" % (start, end)
197 if len(self.matches) > 1:
198 lines.append("%7d %10s %s" % (i+1, range_str, seq))
199 else:
200 lines.append("%7s %10s %s" % (' ', range_str, seq))
201 return "\n".join(lines)
202
204 """Returns one record at a time from a Prosite file.
205
206 Methods:
207 next Return the next record from the stream, or None.
208
209 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object.  parser
    is an optional Parser object to change the results into another form.
    If set to None, then the raw contents of the file will be returned.

    Issues a DeprecationWarning, and raises a ValueError if handle is
    neither a built-in file object nor a class instance.

    """
    import warnings
    warnings.warn("Bio.Prosite.Iterator is deprecated; we recommend using "
                  "the function Bio.Prosite.parse instead. Please contact "
                  "the Biopython developers at biopython-dev@biopython.org "
                  "if you cannot use Bio.Prosite.parse instead of "
                  "Bio.Prosite.Iterator.",
                  DeprecationWarning)
    # Exact type comparison: only FileType and InstanceType objects are
    # accepted as handles; anything else is rejected up front.
    if type(handle) is not FileType and type(handle) is not InstanceType:
        raise ValueError("I expected a file handle or file-like object")
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
225
227 """next(self) -> object
228
229 Return the next Prosite record from the file. If no more records,
230 return None.
231
232 """
233
234 line = self._uhandle.peekline()
235 if line[:2] == 'CC':
236 while 1:
237 line = self._uhandle.readline()
238 if not line:
239 break
240 if line[:2] == '//':
241 break
242 if line[:2] != 'CC':
243 raise ValueError("Oops, where's the copyright?")
244
245 lines = []
246 while 1:
247 line = self._uhandle.readline()
248 if not line:
249 break
250 lines.append(line)
251 if line[:2] == '//':
252 break
253
254 if not lines:
255 return None
256
257 data = "".join(lines)
258 if self._parser is not None:
259 return self._parser.parse(File.StringHandle(data))
260 return data
261
263 return iter(self.next, None)
264
266 """Accesses a Prosite file using a dictionary interface.
267
268 """
269 __filename_key = '__filename'
270
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary backed by an existing index.

    indexname is the name of the index, which should have been
    created with the index_file function.  parser is an optional
    Parser object used to convert entries into another form; when it
    is None the raw contents of the file are returned.

    """
    self._index = Index.Index(indexname)
    # The path of the indexed data file is stored in the index itself
    # under a reserved key.
    data_path = self._index[Dictionary.__filename_key]
    self._handle = open(data_path)
    self._parser = parser
284
286 return len(self._index)
287
295
297 return getattr(self._index, name)
298
300 """Access PROSITE at ExPASy using a read-only dictionary interface.
301
302 """
def __init__(self, delay=5.0, parser=None):
    """__init__(self, delay=5.0, parser=None)

    Create a new Dictionary to access PROSITE.

    delay is the number of seconds to wait between each query.
    parser is an optional parser object (e.g. Prosite.RecordParser)
    used to convert results into another form; when it is None the
    raw contents of the file are returned.

    Issues a DeprecationWarning.

    """
    from Bio.WWW import RequestLimiter
    import warnings

    message = ("Bio.Prosite.ExPASyDictionary is deprecated. Please use the "
               "function Bio.ExPASy.get_prosite_raw instead.")
    warnings.warn(message, DeprecationWarning)

    self.parser = parser
    self.limiter = RequestLimiter(delay)
319
321 raise NotImplementedError("Prosite contains lots of entries")
323 raise NotImplementedError("This is a read-only dictionary")
325 raise NotImplementedError("This is a read-only dictionary")
327 raise NotImplementedError("This is a read-only dictionary")
329 raise NotImplementedError("You don't need to do this...")
331 raise NotImplementedError("You don't really want to do this...")
333 raise NotImplementedError("You don't really want to do this...")
335 raise NotImplementedError("You don't really want to do this...")
336
338 """has_key(self, id) -> bool"""
339 try:
340 self[id]
341 except KeyError:
342 return 0
343 return 1
344
def get(self, id, failobj=None):
    """get(self, id, failobj=None) -> object

    Return self[id], or failobj when that lookup raises a KeyError.

    """
    try:
        entry = self[id]
    except KeyError:
        entry = failobj
    return entry
350
352 """__getitem__(self, id) -> object
353
354 Return a Prosite entry. id is either the id or accession
355 for the entry. Raises a KeyError if there's an error.
356
357 """
358 from Bio import ExPASy
359
360
361 self.limiter.wait()
362
363 try:
364 handle = ExPASy.get_prosite_entry(id)
365 except IOError:
366 raise KeyError(id)
367 try:
368 handle = File.StringHandle(_extract_record(handle))
369 except ValueError:
370 raise KeyError(id)
371
372 if self.parser is not None:
373 return self.parser.parse(handle)
374 return handle.read()
375
377 """Parses Prosite data into a Record object.
378
379 """
383
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle through this parser's scanner into its consumer and
    return the consumer's data attribute.

    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
387
389 """Scans Prosite-formatted data.
390
391 Tested with:
392 Release 15.0, July 1998
393
394 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan Prosite data and report it to consumer.  handle is a
    file-like object that contains prosite data.  consumer is a
    Consumer object that will receive events as the data is scanned.

    Raises a ValueError if a non-blank line starts neither a record
    (ID) nor a copyright section (CC).

    """
    # Reuse an UndoHandle as-is; wrap anything else so that lines can be
    # peeked at before they are consumed.
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(uhandle)

    consumer.finished = False
    while not consumer.finished:
        line = uhandle.peekline()
        if not line:
            # Nothing left to read.
            break
        if is_blank_line(line):
            # Discard the blank line and look at the next one.
            uhandle.readline()
            continue

        tag = line[:2]
        if tag == 'ID':
            self._scan_record(uhandle, consumer)
        elif tag == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
423
425 consumer.start_copyrights()
426 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
427 self._scan_terminator(uhandle, consumer)
428 consumer.end_copyrights()
429
442
443 - def _scan_line(self, line_type, uhandle, event_fn,
444 exactly_one=None, one_or_more=None, any_number=None,
445 up_to_one=None):
463
466
469
472
475
478
481
482
483
484
485
486
487
488
489
490
491
492
493
494
498
501
505
508
512
516
520
523
526
527
528
529
530 _scan_fns = [
531 _scan_id,
532 _scan_ac,
533 _scan_dt,
534 _scan_de,
535 _scan_pa,
536 _scan_ma,
537 _scan_pp,
538 _scan_ru,
539 _scan_nr,
540 _scan_cc,
541
542
543
544
545
546 _scan_ma,
547 _scan_nr,
548 _scan_cc,
549
550 _scan_dr,
551 _scan_3d,
552 _scan_pr,
553 _scan_do,
554 _scan_terminator
555 ]
556
558 """Consumer that converts a Prosite record to a Record object.
559
560 Members:
561 data Record with Prosite data.
562
563 """
566
569
572
574 cols = line.split()
575 if len(cols) != 3:
576 raise ValueError("I don't understand identification line\n%s" \
577 % line)
578 self.data.name = self._chomp(cols[1])
579 self.data.type = self._chomp(cols[2])
580
582 cols = line.split()
583 if len(cols) != 2:
584 raise ValueError("I don't understand accession line\n%s" % line)
585 self.data.accession = self._chomp(cols[1])
586
def date(self, line):
    """date(self, line)

    Parse a date line and store its three dates on self.data as
    created, data_update and info_update.

    The line is upper-cased before parsing, so the stored dates are
    upper case.  The expected layout is:

        DT <date> (CREATED); <date> (DATA UPDATE); <date> (INF... UPDATE).

    Raises a ValueError if the line does not follow that layout,
    including when it has too few fields.

    """
    uprline = line.upper()
    cols = uprline.split()

    # Check the field count first so that a truncated line is reported
    # as a ValueError rather than escaping as an IndexError.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
600
603
606
609
610 - def postprocessing(self, line):
613
614 - def rule(self, line):
616
618 cols = self._clean(line).split(";")
619 for col in cols:
620 if not col:
621 continue
622 qual, data = [word.lstrip() for word in col.split("=")]
623 if qual == '/RELEASE':
624 release, seqs = data.split(",")
625 self.data.nr_sp_release = release
626 self.data.nr_sp_seqs = int(seqs)
627 elif qual == '/FALSE_NEG':
628 self.data.nr_false_neg = int(data)
629 elif qual == '/PARTIAL':
630 self.data.nr_partial = int(data)
631 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
632 m = re.match(r'(\d+)\((\d+)\)', data)
633 if not m:
634 raise Exception("Broken data %s in comment line\n%s" \
635 % (repr(data), line))
636 hits = tuple(map(int, m.groups()))
637 if(qual == "/TOTAL"):
638 self.data.nr_total = hits
639 elif(qual == "/POSITIVE"):
640 self.data.nr_positive = hits
641 elif(qual == "/UNKNOWN"):
642 self.data.nr_unknown = hits
643 elif(qual == "/FALSE_POS"):
644 self.data.nr_false_pos = hits
645 else:
646 raise ValueError("Unknown qual %s in comment line\n%s" \
647 % (repr(qual), line))
648
690
709
714
719
722
725
def _chomp(self, word, to_chomp='.,;'):
    """_chomp(self, word, to_chomp='.,;') -> str

    Return word without its final character when that character is
    one of to_chomp; otherwise return word unchanged.  At most one
    character is removed.  An empty word is returned as-is.

    """
    # Guard against the empty string, for which word[-1] would raise
    # an IndexError.
    if word and word[-1] in to_chomp:
        return word[:-1]
    return word
731
def _clean(self, line, rstrip=1):
    """_clean(self, line, rstrip=1) -> str

    Return line without its first five characters.  Trailing
    whitespace is also removed unless rstrip is false.

    """
    payload = line[5:]
    if not rstrip:
        return payload
    return payload.rstrip()
737
739 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
740 list of PatternHit's
741
742 Search a sequence for occurrences of Prosite patterns. You can
743 specify either a sequence in seq or a SwissProt/trEMBL ID or accession
744 in id. Only one of those should be given. If exclude_frequent
745 is true, then the patterns with the high probability of occurring
746 will be excluded.
747
748 """
749 from Bio import ExPASy
750 if (seq and id) or not (seq or id):
751 raise ValueError("Please specify either a sequence or an id")
752 handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
753 return _extract_pattern_hits(handle)
754
756 """_extract_pattern_hits(handle) -> list of PatternHit's
757
758 Extract hits from a web page. Raises a ValueError if there
759 was an error in the query.
760
761 """
762 class parser(sgmllib.SGMLParser):
763 def __init__(self):
764 sgmllib.SGMLParser.__init__(self)
765 self.hits = []
766 self.broken_message = 'Some error occurred'
767 self._in_pre = 0
768 self._current_hit = None
769 self._last_found = None
770 def handle_data(self, data):
771 if data.find('try again') >= 0:
772 self.broken_message = data
773 return
774 elif data == 'illegal':
775 self.broken_message = 'Sequence contains illegal characters'
776 return
777 if not self._in_pre:
778 return
779 elif not data.strip():
780 return
781 if self._last_found is None and data[:4] == 'PDOC':
782 self._current_hit.pdoc = data
783 self._last_found = 'pdoc'
784 elif self._last_found == 'pdoc':
785 if data[:2] != 'PS':
786 raise ValueError("Expected accession but got:\n%s" % data)
787 self._current_hit.accession = data
788 self._last_found = 'accession'
789 elif self._last_found == 'accession':
790 self._current_hit.name = data
791 self._last_found = 'name'
792 elif self._last_found == 'name':
793 self._current_hit.description = data
794 self._last_found = 'description'
795 elif self._last_found == 'description':
796 m = re.findall(r'(\d+)-(\d+) (\w+)', data)
797 for start, end, seq in m:
798 self._current_hit.matches.append(
799 (int(start), int(end), seq))
800
801 def do_hr(self, attrs):
802
803 if self._in_pre:
804 self._current_hit = PatternHit()
805 self.hits.append(self._current_hit)
806 self._last_found = None
807 def start_pre(self, attrs):
808 self._in_pre = 1
809 self.broken_message = None
810 def end_pre(self):
811 self._in_pre = 0
812 p = parser()
813 p.feed(handle.read())
814 if p.broken_message:
815 raise ValueError(p.broken_message)
816 return p.hits
817
818
819
820
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    Each key is mapped to a (start, length) pair describing where its
    record sits in the file.  Raises a ValueError if filename does not
    exist, and a KeyError if a record yields an empty or duplicate key.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Remember which data file this index describes.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        # Plain ints are enough here: Python 2 promotes them to long
        # automatically when an offset grows too large.
        end = 0
        for record in parse(handle):
            # A record spans from where the previous one ended to the
            # current position of the handle.
            start = end
            end = handle.tell()
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.name

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Release the data file even when indexing fails part way.
        handle.close()
857
858
859
861 """_extract_record(handle) -> str
862
863 Extract PROSITE data from a web page. Raises a ValueError if no
864 data was found in the web page.
865
866 """
867
868
869
870 class parser(sgmllib.SGMLParser):
871 def __init__(self):
872 sgmllib.SGMLParser.__init__(self)
873 self._in_pre = 0
874 self.data = []
875 def handle_data(self, data):
876 if self._in_pre:
877 self.data.append(data)
878 def do_br(self, attrs):
879 if self._in_pre:
880 self.data.append('\n')
881 def start_pre(self, attrs):
882 self._in_pre = 1
883 def end_pre(self):
884 self._in_pre = 0
885 p = parser()
886 p.feed(handle.read())
887 if not p.data:
888 raise ValueError("No data found in web page.")
889 return "".join(p.data)
890