Package Bio :: Package Compass
[hide private]
[frames] | no frames]

Source Code for Package Bio.Compass

  1  # Copyright 2004 by James Casbon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  Code to deal with COMPASS output, a program for profile/profile comparison. 
  8   
  9  Compass is described in: 
 10   
 11  Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein 
 12  alignments with assessment of statistical significance. J Mol Biol. 2003 Feb 
 13  7;326(1):317-36. 
 14   
 15  Tested with COMPASS 1.24. 
 16   
 17  Functions: 
 18  read          Reads a COMPASS file containing one COMPASS record 
 19  parse         Iterates over records in a COMPASS file. 
 20   
 21  Classes: 
 22  Record        One result of a COMPASS file 
 23   
 24  DEPRECATED CLASSES: 
 25   
 26  _Scanner      Scan compass results 
 27  _Consumer     Consume scanner events 
 28  RecordParser  Parse one compass record 
 29  Iterator      Iterate through a number of compass records 
 30  """ 
 31  import re 
 32   
 33   
 34   
35 -def read(handle):
36 record = None 37 try: 38 line = handle.next() 39 record = Record() 40 __read_names(record, line) 41 line = handle.next() 42 __read_threshold(record, line) 43 line = handle.next() 44 __read_lengths(record, line) 45 line = handle.next() 46 __read_profilewidth(record, line) 47 line = handle.next() 48 __read_scores(record, line) 49 except StopIteration: 50 if not record: 51 raise ValueError("No record found in handle") 52 else: 53 raise ValueError("Unexpected end of stream.") 54 for line in handle: 55 if is_blank_line(line): 56 continue 57 __read_query_alignment(record, line) 58 try: 59 line = handle.next() 60 __read_positive_alignment(record, line) 61 line = handle.next() 62 __read_hit_alignment(record, line) 63 except StopIteration: 64 raise ValueError("Unexpected end of stream.") 65 return record
66
67 -def parse(handle):
68 record = None 69 try: 70 line = handle.next() 71 except StopIteration: 72 return 73 while True: 74 try: 75 record = Record() 76 __read_names(record, line) 77 line = handle.next() 78 __read_threshold(record, line) 79 line = handle.next() 80 __read_lengths(record, line) 81 line = handle.next() 82 __read_profilewidth(record, line) 83 line = handle.next() 84 __read_scores(record, line) 85 except StopIteration: 86 raise ValueError("Unexpected end of stream.") 87 for line in handle: 88 if not line.strip(): 89 continue 90 if "Ali1:" in line: 91 yield record 92 break 93 __read_query_alignment(record, line) 94 try: 95 line = handle.next() 96 __read_positive_alignment(record, line) 97 line = handle.next() 98 __read_hit_alignment(record, line) 99 except StopIteration: 100 raise ValueError("Unexpected end of stream.") 101 else: 102 yield record 103 break
104
105 -class Record:
106 """ 107 Hold information from one compass hit. 108 Ali1 one is the query, Ali2 the hit. 109 """ 110
111 - def __init__(self):
112 self.query='' 113 self.hit='' 114 self.gap_threshold=0 115 self.query_length=0 116 self.query_filtered_length=0 117 self.query_nseqs=0 118 self.query_neffseqs=0 119 self.hit_length=0 120 self.hit_filtered_length=0 121 self.hit_nseqs=0 122 self.hit_neffseqs=0 123 self.sw_score=0 124 self.evalue=-1 125 self.query_start=-1 126 self.hit_start=-1 127 self.query_aln='' 128 self.hit_aln='' 129 self.positives=''
130 131
132 - def query_coverage(self):
133 """Return the length of the query covered in alignment""" 134 s = self.query_aln.replace("=", "") 135 return len(s)
136
137 - def hit_coverage(self):
138 """Return the length of the hit covered in the alignment""" 139 s = self.hit_aln.replace("=", "") 140 return len(s)
141 142 # Everything below is private 143 144 __regex = {"names": re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"), 145 "threshold": re.compile("Threshold of effective gap content in columns: (\S+)"), 146 "lengths": re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)\s+filtered_length2=(\S+)"), 147 "profilewidth": re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)"), 148 "scores": re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)"), 149 "start": re.compile("(\d+)"), 150 "align": re.compile("^.{15}(\S+)"), 151 "positive_alignment": re.compile("^.{15}(.+)"), 152 } 153
154 -def __read_names(record, line):
155 """ 156 Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln 157 ------query----- -------hit------------- 158 """ 159 if not "Ali1:" in line: 160 raise ValueError("Line does not contain 'Ali1:':\n%s" % line) 161 m = __regex["names"].search(line) 162 record.query = m.group(1) 163 record.hit = m.group(2)
164
165 -def __read_threshold(record,line):
166 if not line.startswith("Threshold"): 167 raise ValueError("Line does not start with 'Threshold':\n%s" % line) 168 m = __regex["threshold"].search(line) 169 record.gap_threshold = float(m.group(1))
170
171 -def __read_lengths(record, line):
172 if not line.startswith("length1="): 173 raise ValueError("Line does not start with 'length1=':\n%s" % line) 174 m = __regex["lengths"].search(line) 175 record.query_length = int(m.group(1)) 176 record.query_filtered_length = float(m.group(2)) 177 record.hit_length = int(m.group(3)) 178 record.hit_filtered_length = float(m.group(4))
179
180 -def __read_profilewidth(record, line):
181 if not "Nseqs1" in line: 182 raise ValueError("Line does not contain 'Nseqs1':\n%s" % line) 183 m = __regex["profilewidth"].search(line) 184 record.query_nseqs = int(m.group(1)) 185 record.query_neffseqs = float(m.group(2)) 186 record.hit_nseqs = int(m.group(3)) 187 record.hit_neffseqs = float(m.group(4))
188
189 -def __read_scores(record, line):
190 if not line.startswith("Smith-Waterman"): 191 raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line) 192 m = __regex["scores"].search(line) 193 if m: 194 record.sw_score = int(m.group(1)) 195 record.evalue = float(m.group(2)) 196 else: 197 record.sw_score = 0 198 record.evalue = -1.0
199
200 -def __read_query_alignment(record, line):
201 m = __regex["start"].search(line) 202 if m: 203 record.query_start = int(m.group(1)) 204 m = __regex["align"].match(line) 205 assert m!=None, "invalid match" 206 record.query_aln += m.group(1)
207
208 -def __read_positive_alignment(record, line):
209 m = __regex["positive_alignment"].match(line) 210 assert m!=None, "invalid match" 211 record.positives += m.group(1)
212
213 -def __read_hit_alignment(record, line):
214 m = __regex["start"].search(line) 215 if m: 216 record.hit_start = int(m.group(1)) 217 m = __regex["align"].match(line) 218 assert m!=None, "invalid match" 219 record.hit_aln += m.group(1)
220 221 # Everything below is deprecated 222 223 from Bio import File 224 from Bio.ParserSupport import * 225 import Bio 226 227
228 -class _Scanner:
229 """Reads compass output and generate events (DEPRECATED)"""
230 - def __init__(self):
231 import warnings 232 warnings.warn("Bio.Compass._Scanner is deprecated; please use the read() and parse() functions in this module instead", Bio.BiopythonDeprecationWarning)
233
234 - def feed(self, handle, consumer):
235 """Feed in COMPASS ouput""" 236 237 if isinstance(handle, File.UndoHandle): 238 pass 239 else: 240 handle = File.UndoHandle(handle) 241 242 243 assert isinstance(handle, File.UndoHandle), \ 244 "handle must be an UndoHandle" 245 if handle.peekline(): 246 self._scan_record(handle, consumer)
247 248
249 - def _scan_record(self,handle,consumer):
250 self._scan_names(handle, consumer) 251 self._scan_threshold(handle, consumer) 252 self._scan_lengths(handle,consumer) 253 self._scan_profilewidth(handle, consumer) 254 self._scan_scores(handle,consumer) 255 self._scan_alignment(handle,consumer)
256
257 - def _scan_names(self,handle,consumer):
258 """ 259 Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln 260 """ 261 read_and_call(handle, consumer.names, contains="Ali1:")
262
263 - def _scan_threshold(self,handle, consumer):
264 """ 265 Threshold of effective gap content in columns: 0.5 266 """ 267 read_and_call(handle, consumer.threshold, start="Threshold")
268
269 - def _scan_lengths(self,handle, consumer):
270 """ 271 length1=388 filtered_length1=386 length2=145 filtered_length2=137 272 """ 273 read_and_call(handle, consumer.lengths, start="length1=")
274
275 - def _scan_profilewidth(self,handle,consumer):
276 """ 277 Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099 278 """ 279 read_and_call(handle, consumer.profilewidth, contains="Nseqs1")
280
281 - def _scan_scores(self,handle, consumer):
282 """ 283 Smith-Waterman score = 37 Evalue = 5.75e+02 284 """ 285 read_and_call(handle, consumer.scores, start="Smith-Waterman")
286
287 - def _scan_alignment(self,handle, consumer):
288 """ 289 QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN 290 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 291 QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN 292 293 294 QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 296 QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV 297 298 """ 299 while 1: 300 line = handle.readline() 301 if not line: 302 break 303 if is_blank_line(line): 304 continue 305 else: 306 consumer.query_alignment(line) 307 read_and_call(handle, consumer.positive_alignment) 308 read_and_call(handle, consumer.hit_alignment)
309
310 -class _Consumer:
311 # all regular expressions used -- compile only once 312 _re_names = re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+") 313 _re_threshold = \ 314 re.compile("Threshold of effective gap content in columns: (\S+)") 315 _re_lengths = \ 316 re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)" 317 + "\s+filtered_length2=(\S+)") 318 _re_profilewidth = \ 319 re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)") 320 _re_scores = re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)") 321 _re_start = re.compile("(\d+)") 322 _re_align = re.compile("^.{15}(\S+)") 323 _re_positive_alignment = re.compile("^.{15}(.+)") 324
325 - def __init__(self):
326 import warnings 327 warnings.warn("Bio.Compass._Consumer is deprecated; please use the read() and parse() functions in this module instead", Bio.BiopythonDeprecationWarning) 328 self.data = None
329
330 - def names(self, line):
331 """ 332 Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln 333 ------query----- -------hit------------- 334 """ 335 self.data = Record() 336 m = self.__class__._re_names.search(line) 337 self.data.query = m.group(1) 338 self.data.hit = m.group(2)
339
340 - def threshold(self,line):
341 m = self.__class__._re_threshold.search(line) 342 self.data.gap_threshold = float(m.group(1))
343
344 - def lengths(self,line):
345 m = self.__class__._re_lengths.search(line) 346 self.data.query_length = int(m.group(1)) 347 self.data.query_filtered_length = float(m.group(2)) 348 self.data.hit_length = int(m.group(3)) 349 self.data.hit_filtered_length = float(m.group(4))
350
351 - def profilewidth(self,line):
352 m = self.__class__._re_profilewidth.search(line) 353 self.data.query_nseqs = int(m.group(1)) 354 self.data.query_neffseqs = float(m.group(2)) 355 self.data.hit_nseqs = int(m.group(3)) 356 self.data.hit_neffseqs = float(m.group(4))
357
358 - def scores(self, line):
359 m = self.__class__._re_scores.search(line) 360 if m: 361 self.data.sw_score = int(m.group(1)) 362 self.data.evalue = float(m.group(2)) 363 else: 364 self.data.sw_score = 0 365 self.data.evalue = -1.0
366
367 - def query_alignment(self, line):
368 m = self.__class__._re_start.search(line) 369 if m: 370 self.data.query_start = int(m.group(1)) 371 m = self.__class__._re_align.match(line) 372 assert m!=None, "invalid match" 373 self.data.query_aln = self.data.query_aln + m.group(1)
374
375 - def positive_alignment(self,line):
376 m = self.__class__._re_positive_alignment.match(line) 377 assert m!=None, "invalid match" 378 self.data.positives = self.data.positives + m.group(1)
379
380 - def hit_alignment(self,line):
381 m = self.__class__._re_start.search(line) 382 if m: 383 self.data.hit_start = int(m.group(1)) 384 m = self.__class__._re_align.match(line) 385 assert m!=None, "invalid match" 386 self.data.hit_aln = self.data.hit_aln + m.group(1)
387
388 -class RecordParser(AbstractParser):
389 """Parses compass results into a Record object (DEPRECATED). 390 """
391 - def __init__(self):
392 import warnings 393 warnings.warn("Bio.Compass._RecordParser is deprecated; please use the read() and parse() functions in this module instead", Bio.BiopythonDeprecationWarning) 394 self._scanner = _Scanner() 395 self._consumer = _Consumer()
396 397
398 - def parse(self, handle):
399 if isinstance(handle, File.UndoHandle): 400 uhandle = handle 401 else: 402 uhandle = File.UndoHandle(handle) 403 self._scanner.feed(uhandle, self._consumer) 404 return self._consumer.data
405
406 -class Iterator:
407 """Iterate through a file of compass results (DEPRECATED)."""
408 - def __init__(self, handle):
409 import warnings 410 warnings.warn("Bio.Compass.Iterator is deprecated; please use the parse() function in this module instead", Bio.BiopythonDeprecationWarning) 411 412 self._uhandle = File.UndoHandle(handle) 413 self._parser = RecordParser()
414
415 - def next(self):
416 lines = [] 417 while 1: 418 line = self._uhandle.readline() 419 if not line: 420 break 421 if line[0:4] == "Ali1" and lines: 422 self._uhandle.saveline(line) 423 break 424 425 lines.append(line) 426 427 428 if not lines: 429 return None 430 431 data = ''.join(lines) 432 return self._parser.parse(File.StringHandle(data))
433