Package Bio :: Package Sequencing :: Module Phd
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Phd

  1  # Copyright 2004 by Cymon J. Cox and Frank Kauff.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Parser for PHD files output by PHRED and used by PHRAP and CONSED. 
  8   
  9  This module can be used used directly which will return Record objects 
 10  which should contain all the original data in the file. 
 11   
 12  Alternatively, using Bio.SeqIO with the "phd" format will call this module 
 13  internally.  This will give SeqRecord objects for each contig sequence. 
 14  """ 
 15   
 16  from Bio import Seq 
 17  from Bio.Alphabet import IUPAC 
 18   
 19  CKEYWORDS=['CHROMAT_FILE','ABI_THUMBPRINT','PHRED_VERSION','CALL_METHOD',\ 
 20          'QUALITY_LEVELS','TIME','TRACE_ARRAY_MIN_INDEX','TRACE_ARRAY_MAX_INDEX',\ 
 21          'TRIM','TRACE_PEAK_AREA_RATIO','CHEM','DYE'] 
 22   
23 -class Record:
24 """Hold information from a PHD file."""
25 - def __init__(self):
26 self.file_name = '' 27 self.comments={} 28 for kw in CKEYWORDS: 29 self.comments[kw.lower()]=None 30 self.sites = [] 31 self.seq = '' 32 self.seq_trimmed = ''
33 34
35 -def read(handle):
36 """Reads the next PHD record from the file, returning it as a Record object. 37 38 This function reads PHD file data line by line from the handle, 39 and returns a single Record object. 40 """ 41 for line in handle: 42 if line.startswith("BEGIN_SEQUENCE"): 43 record = Record() 44 record.file_name = line[15:].rstrip() 45 break 46 else: 47 return # No record found 48 49 for line in handle: 50 if line.startswith("BEGIN_COMMENT"): 51 break 52 else: 53 raise ValueError("Failed to find BEGIN_COMMENT line") 54 55 for line in handle: 56 line = line.strip() 57 if not line: 58 continue 59 if line=="END_COMMENT": 60 break 61 keyword, value = line.split(":", 1) 62 keyword = keyword.lower() 63 value = value.strip() 64 if keyword in ('chromat_file', 65 'phred_version', 66 'call_method', 67 'chem', 68 'dye', 69 'time', 70 'basecaller_version', 71 'trace_processor_version'): 72 record.comments[keyword] = value 73 elif keyword in ('abi_thumbprint', 74 'quality_levels', 75 'trace_array_min_index', 76 'trace_array_max_index'): 77 record.comments[keyword] = int(value) 78 elif keyword=='trace_peak_area_ratio': 79 record.comments[keyword] = float(value) 80 elif keyword=='trim': 81 first, last, prob = value.split() 82 record.comments[keyword] = (int(first), int(last), float(prob)) 83 else: 84 raise ValueError("Failed to find END_COMMENT line") 85 86 for line in handle: 87 if line.startswith('BEGIN_DNA'): 88 break 89 else: 90 raise ValueError("Failed to find BEGIN_DNA line") 91 92 for line in handle: 93 if line.startswith('END_DNA'): 94 break 95 else: 96 base, quality, location = line.split() 97 record.sites.append((base, quality, location)) 98 99 for line in handle: 100 if line.startswith("END_SEQUENCE"): 101 break 102 else: 103 raise ValueError("Failed to find END_SEQUENCE line") 104 105 alphabet = IUPAC.IUPACAmbiguousDNA() 106 record.seq = Seq.Seq(''.join([n[0] for n in record.sites]), alphabet) 107 if record.comments['trim'] is not None: 108 first, last = record.comments['trim'][:2] 109 record.seq_trimmed = record.seq[first:last] 110 111 return record
112
113 -def parse(handle):
114 """Iterates over a file returning multiple PHD records. 115 116 The data is read line by line from the handle. The handle can be a list 117 of lines, an open file, or similar; the only requirement is that we can 118 iterate over the handle to retrieve lines from it. 119 120 Typical usage: 121 122 records = parse(handle) 123 for record in records: 124 # do something with the record object 125 """ 126 while True: 127 record = read(handle) 128 if not record: 129 return 130 yield record
131 132 133 # ---------- Everything below is deprecated 134 135 from Bio import File 136 from Bio.ParserSupport import * 137 138
139 -class Iterator:
140 """Iterates over a file of multiple PHD records (DEPRECATED). 141 142 Methods: 143 next Return the next record from the stream, or None. 144 """ 145
146 - def __init__(self, handle, parser=None):
147 """Create a new iterator. 148 149 Create a new iterator. handle is a file-like object. parser 150 is an optional Parser object to change the results into another form. 151 If set to None, then the raw contents of the file will be returned. 152 """ 153 import warnings 154 warnings.warn("Bio.Sequencing.Phd.Iterator is deprecated. Please use Bio.Sequencing.Phd.parse(handle) instead of Bio.Sequencing.Phd.Iterator(handle, RecordParser())", DeprecationWarning) 155 self._uhandle = File.UndoHandle(handle) 156 self._parser = parser
157
158 - def next(self):
159 lines = [] 160 while 1: 161 line = self._uhandle.readline() 162 if not line: 163 break 164 # If a new record, then put the line back and stop. 165 if lines and line[:14] == 'BEGIN_SEQUENCE': 166 self._uhandle.saveline(line) 167 break 168 lines.append(line) 169 170 if not lines: 171 return None 172 173 data = ''.join(lines) 174 if self._parser is not None: 175 return self._parser.parse(File.StringHandle(data)) 176 return data
177
178 - def __iter__(self):
179 """Iterate over the PHY file record by record.""" 180 return iter(self.next, None)
181
182 -class RecordParser(AbstractParser):
183 """Parses PHD file data into a Record object (DEPRECATED)."""
184 - def __init__(self):
185 import warnings 186 warnings.warn("Bio.Sequencing.Phd.RecordParser is deprecated. Please use Bio.Sequencing.Phd.read(handle) instead of Bio.Sequencing.Phd.RecordParser().parse(handle)", DeprecationWarning) 187 self._scanner = _Scanner() 188 self._consumer = _RecordConsumer()
189
190 - def parse(self, handle):
191 if isinstance(handle, File.UndoHandle): 192 uhandle = handle 193 else: 194 uhandle = File.UndoHandle(handle) 195 self._scanner.feed(uhandle, self._consumer) 196 return self._consumer.data
197 198
199 -class _Scanner:
200 """Scans a PHD-formatted file (DEPRECATED). 201 202 Methods: 203 feed - Feed one PHD record. 204 """
205 - def feed(self, handle, consumer):
206 """Reads in PDH data from the handle for scanning. 207 208 Feed in PHD data for scanning. handle is a file-like object 209 containing PHD data. consumer is a Consumer object that will 210 receive events as the PHD data is scanned. 211 """ 212 assert isinstance(handle, File.UndoHandle), \ 213 "handle must be an UndoHandle" 214 if handle.peekline(): 215 self._scan_record(handle, consumer)
216
217 - def _scan_record(self, uhandle, consumer):
218 self._scan_begin_sequence(uhandle, consumer) 219 self._scan_comments(uhandle, consumer) 220 self._scan_dna(uhandle, consumer) 221 consumer.end_sequence()
222
223 - def _scan_begin_sequence(self, uhandle, consumer):
224 read_and_call(uhandle, consumer.begin_sequence, start = 'BEGIN_SEQUENCE')
225
226 - def _scan_comments(self, uhandle, consumer):
227 228 read_and_call_while(uhandle, consumer.noevent, blank=1) 229 read_and_call(uhandle, consumer.noevent, start = 'BEGIN_COMMENT') 230 read_and_call_while(uhandle, consumer.noevent, blank=1) 231 232 while 1: 233 for kw in CKEYWORDS: 234 if attempt_read_and_call(uhandle,getattr(consumer,kw.lower()),start=kw+':'): 235 break # recognized keyword: end for loop and do another while 236 else: 237 break # no keywords found: end while loop 238 239 read_and_call_while(uhandle, consumer.noevent, blank=1) 240 read_and_call(uhandle, consumer.noevent, start = 'END_COMMENT')
241
242 - def _scan_dna(self, uhandle, consumer):
243 while 1: 244 line = uhandle.readline() 245 if is_blank_line(line) or line == 'BEGIN_DNA\n': 246 continue 247 elif line == 'END_DNA\n': 248 break 249 consumer.read_dna(line)
250 251
252 -class _RecordConsumer(AbstractConsumer):
253 """Consumer that converts a PHD record to a Record object (DEPRECATED)."""
254 - def __init__(self):
255 self.data = None
256
257 - def begin_sequence(self, line):
258 self.data = Record() 259 self.data.file_name = line[15:].rstrip()
260
261 - def end_sequence(self):
262 self.data.seq = Seq.Seq(''.join([n[0] for n in self.data.sites]), IUPAC.IUPACAmbiguousDNA()) 263 if self.data.comments['trim'] is not None : 264 first = self.data.comments['trim'][0] 265 last = self.data.comments['trim'][1] 266 self.data.seq_trimmed = self.data.seq[first:last]
267
268 - def chromat_file(self, line):
269 self.data.comments['chromat_file'] = line[13:-1].strip()
270
271 - def abi_thumbprint(self, line):
272 self.data.comments['abi_thumbprint'] = int(line[15:-1].strip())
273
274 - def phred_version(self, line):
275 self.data.comments['phred_version'] = line[14:-1].strip()
276
277 - def call_method(self, line):
278 self.data.comments['call_method'] = line[12:-1].strip()
279
280 - def quality_levels(self, line):
281 self.data.comments['quality_levels'] = int(line[15:-1].strip())
282
283 - def time(self, line):
284 self.data.comments['time'] = line[5:-1].strip()
285
286 - def trace_array_min_index(self, line):
287 self.data.comments['trace_array_min_index'] = int(line[22:-1].strip())
288
289 - def trace_array_max_index(self, line):
290 self.data.comments['trace_array_max_index'] = int(line[22:-1].strip())
291
292 - def trim(self, line):
293 first, last, prob = line[5:-1].split() 294 self.data.comments['trim'] = (int(first), int(last), float(prob))
295
296 - def trace_peak_area_ratio(self, line):
297 self.data.comments['trace_peak_area_ratio'] = float(line[22:-1].strip())
298
299 - def chem(self, line):
300 self.data.comments['chem'] = line[5:-1].strip()
301
302 - def dye(self, line):
303 self.data.comments['dye'] = line[4:-1].strip()
304
305 - def read_dna(self, line):
306 base, quality, location = line.split() 307 self.data.sites.append((base, quality, location))
308 309 if __name__ == "__main__" : 310 print "Quick self test" 311 #Test the iterator, 312 handle = open("../../Tests/Phd/phd1") 313 records = parse(handle) 314 for record in records: 315 print record.file_name, len(record.seq) 316 handle.close() 317 print "Done" 318