Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
  19  much closer representation to the raw file contents than the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  ErrorFeatureParser    Catch errors caused during parsing. 
  25  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  26  RecordParser          Parse GenBank data into a Record object. 
  27   
  28  Exceptions: 
  29  ParserFailureError    Exception indicating a failure in the parser (ie. 
  30                        scanner or consumer) 
  31  LocationParserError   Exception indicating a problem with the spark based 
  32                        location parser. 
  33   
  34   
  35  17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  36  These are GenBank files that summarize the content of a project, and provide lists of 
  37  scaffold and contig files in the project. These will be in annotations['wgs'] and 
  38  annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  39  http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  40   
  41  http://is.gd/nNgk 
  42  for more details of this format, and an example. 
  43  Added by Ying Huang & Iddo Friedberg 
  44  """ 
  45  import cStringIO 
  46  import re 
  47   
  48  # other Biopython stuff 
  49  from Bio import SeqFeature 
  50  from Bio.ParserSupport import AbstractConsumer 
  51  from Bio import Entrez 
  52   
  53  # other Bio.GenBank stuff 
  54  from utils import FeatureValueCleaner 
  55  from Scanner import GenBankScanner 
  56   
#Constants used to parse GenBank header lines - header continuation lines
#are indented by GENBANK_INDENT spaces (the keyword occupies the left margin)
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

#Constants for parsing GenBank feature lines - feature keys start at column
#FEATURE_KEY_INDENT, qualifier lines at column FEATURE_QUALIFIER_INDENT
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
  66   
  67  #Regular expresions for location parsing 
  68  _solo_location = r"[<>]?\d+" 
  69  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  70  _between_location = r"\d+\^\d+" 
  71   
  72  _within_position = r"\(\d+\.\d+\)" 
  73  _re_within_position = re.compile(_within_position) 
  74  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  75                     % (_within_position,_within_position) 
  76  assert _re_within_position.match("(3.9)") 
  77  assert re.compile(_within_location).match("(3.9)..10") 
  78  assert re.compile(_within_location).match("26..(30.33)") 
  79  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  80   
  81  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  82  _re_oneof_position = re.compile(_oneof_position) 
  83  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  84                     % (_oneof_position,_oneof_position) 
  85  assert _re_oneof_position.match("one-of(6,9)") 
  86  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  87  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  88  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  89   
  90  assert not _re_oneof_position.match("one-of(3)") 
  91  assert _re_oneof_position.match("one-of(3,6)") 
  92  assert _re_oneof_position.match("one-of(3,6,9)") 
  93   
  94   
  95  _simple_location = r"\d+\.\.\d+" 
  96  _re_simple_location = re.compile(_simple_location) 
  97  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \ 
  98                                   % (_simple_location, _simple_location)) 
  99  _complex_location = r"([a-zA-z][a-zA-Z0-9]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \ 
 100                      % (_pair_location, _solo_location, _between_location, 
 101                         _within_location, _oneof_location) 
 102  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 103  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 104                                            % (_complex_location, _complex_location) 
 105  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \ 
 106                                   % (_possibly_complemented_complex_location, 
 107                                      _possibly_complemented_complex_location)) 
 108   
 109  assert _re_simple_location.match("104..160") 
 110  assert not _re_simple_location.match("<104..>160") 
 111  assert not _re_simple_location.match("104") 
 112  assert not _re_simple_location.match("<1") 
 113  assert not _re_simple_location.match(">99999") 
 114  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 115  assert not _re_simple_compound.match("bond(12,63)") 
 116  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 117  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 118  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 119  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 120  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 121  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 122  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 123  assert not _re_simple_compound.match("complement(1..69)") 
 124  assert not _re_simple_compound.match("(1..69)") 
 125  assert _re_complex_location.match("(3.9)..10") 
 126  assert _re_complex_location.match("26..(30.33)") 
 127  assert _re_complex_location.match("(13.19)..(20.28)") 
 128  assert _re_complex_location.match("41^42") #between 
 129  assert _re_complex_location.match("AL121804:41^42") 
 130  assert _re_complex_location.match("AL121804:41..610") 
 131  assert _re_complex_location.match("AL121804.2:41..610") 
 132  assert _re_complex_location.match("one-of(3,6)..101") 
 133  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 134  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 135  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 136   
def _pos(pos_str, offset=0):
    """Build a Position object from a GenBank position string (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> _pos("<5")
    BeforePosition(5)
    >>> _pos(">5")
    AfterPosition(5)
    >>> _pos("one-of(5,8,11)")
    OneOfPosition([ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> _pos("(8.10)")
    WithinPosition(8,2)
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        #e.g. "(8.10)" - somewhere between the two bounds
        low, high = pos_str[1:-1].split(".")
        return SeqFeature.WithinPosition(int(low) + offset,
                                         int(high) - int(low))
    if _re_oneof_position.match(pos_str):
        #e.g. "one-of(5,8,11)" - one exact position from a choice list
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(p) + offset)
                   for p in pos_str[7:-1].split(",")]
        return SeqFeature.OneOfPosition(choices)
    #Plain number, e.g. "5"
    return SeqFeature.ExactPosition(int(pos_str) + offset)
176
def _loc(loc_str, expected_seq_length):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000)
    FeatureLocation(ExactPosition(122),ExactPosition(456))
    >>> _loc("<123..>456", 1000)
    FeatureLocation(BeforePosition(122),AfterPosition(456))

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000)
    FeatureLocation(WithinPosition(8,1),WithinPosition(20,5))

    Zero length between feature,

    >>> _loc("123^124", 1000)
    FeatureLocation(ExactPosition(123),ExactPosition(123))

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000)
    FeatureLocation(ExactPosition(1000),ExactPosition(1000))

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123..456", 1000)
    FeatureLocation(ExactPosition(122),ExactPosition(456))
    >>> _loc("123^456", 1000)
    Traceback (most recent call last):
       ...
    ValueError: Invalid between location '123^456'
    """
    try:
        start, end = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" not in loc_str:
            #Single base, e.g. "123"
            start = end = loc_str
        else:
            #A between location like "67^68" (one based counting) is a
            #special case (note it has zero length). In python slice
            #notation this is 67:67, a zero length slice. See Bug 2622
            #Further more, on a circular genome of length N you can have
            #a location N^1 meaning the junction at the origin. See Bug 3098.
            #NOTE - We can imagine between locations like "2^4", but this
            #is just "3". Similarly, "2^5" is just "3..4"
            start, end = loc_str.split("^")
            if int(start) + 1 == int(end) \
            or (int(start) == expected_seq_length and end == "1"):
                pos = _pos(start)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(pos, pos)
    return SeqFeature.FeatureLocation(_pos(start, -1), _pos(end))
235
236 -def _split_compound_loc(compound_loc):
237 """Split a tricky compound location string (PRIVATE). 238 239 >>> list(_split_compound_loc("123..145")) 240 ['123..145'] 241 >>> list(_split_compound_loc("123..145,200..209")) 242 ['123..145', '200..209'] 243 >>> list(_split_compound_loc("one-of(200,203)..300")) 244 ['one-of(200,203)..300'] 245 >>> list(_split_compound_loc("complement(123..145),200..209")) 246 ['complement(123..145)', '200..209'] 247 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 248 ['123..145', 'one-of(200,203)..209'] 249 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 250 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 251 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 252 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 253 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 254 ['123..145', '200..one-of(209,211)', '300'] 255 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 256 ['123..145', '200..one-of(209,211)'] 257 """ 258 if "one-of(" in compound_loc: 259 #Hard case 260 while "," in compound_loc: 261 assert compound_loc[0] != "," 262 assert compound_loc[0:2] != ".." 
263 i = compound_loc.find(",") 264 part = compound_loc[:i] 265 compound_loc = compound_loc[i:] #includes the comma 266 while part.count("(") > part.count(")"): 267 assert "one-of(" in part, (part, compound_loc) 268 i = compound_loc.find(")") 269 part += compound_loc[:i+1] 270 compound_loc = compound_loc[i+1:] 271 if compound_loc.startswith(".."): 272 i = compound_loc.find(",") 273 if i==-1: 274 part += compound_loc 275 compound_loc = "" 276 else: 277 part += compound_loc[:i] 278 compound_loc = compound_loc[i:] #includes the comma 279 while part.count("(") > part.count(")"): 280 assert part.count("one-of(") == 2 281 i = compound_loc.find(")") 282 part += compound_loc[:i+1] 283 compound_loc = compound_loc[i+1:] 284 if compound_loc.startswith(","): 285 compound_loc = compound_loc[1:] 286 assert part 287 yield part 288 if compound_loc: 289 yield compound_loc 290 else: 291 #Easy case 292 for part in compound_loc.split(","): 293 yield part
294
class Iterator:
    """Iterator interface to move over a file of GenBank entries one at a time.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        #No parser - hand back the raw record text, reading line by
        #line up to and including the "//" record terminator
        raw_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                return None #Premature end of file?
            raw_lines.append(line)
            if line.rstrip() == "//":
                return "".join(raw_lines)

    def __iter__(self):
        return iter(self.next, None)
329
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser (scanner or consumer).
    """
    pass
334
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file.
    """
    pass
339
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """
    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - Amount of debugging information for the scanner to
        emit. The default of zero is silent (and fastest); setting this as
        high as two shows exactly where a parse fails.
        o use_fuzziness - Whether to use fuzzy representations for feature
        locations. The default is 1 (use fuzziness).
        o feature_cleaner - An object used to clean out the values of
        features; it must implement the clean_value method. GenBank.utils
        has a "standard" cleaner class, which is used by default.

        NOTE(review): the default cleaner instance is created once at class
        definition time and shared by every FeatureParser constructed
        without an explicit cleaner - fine while FeatureValueCleaner keeps
        no per-parse state; confirm before adding state to it.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle into a SeqRecord."""
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
370
class RecordParser:
    """Parse GenBank files into Record objects.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - Amount of debugging information for the scanner to
        emit. The default of zero is silent (and fastest); setting this as
        high as two shows exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
391
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # Special keys in GenBank records that we should remove spaces from.
    # For instance, \translation values are protein sequences where any
    # embedded spaces and newlines are just formatting artefacts.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a KEYWORDS string into a nice clean list."""
        #A missing value is given as "" or "."; a trailing period is noise
        if keyword_string in ("", "."):
            keywords = ""
        elif keyword_string.endswith('.'):
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        return [word.strip() for word in keywords.split(';')]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list."""
        #Newlines count as spaces here, and EMBL style accessions
        #are separated with ';'
        accessions = accession_string.replace("\n", " ").replace(";", " ")
        return [acc.strip() for acc in accessions.split() if acc.strip()]

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list."""
        if not taxonomy_string or taxonomy_string == ".":
            #Missing data, no taxonomy
            return []

        if taxonomy_string.endswith('.'):
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        #Entries are ';' separated, possibly wrapped over newlines
        entries = []
        for item in tax_info.split(';'):
            entries.extend(item.split("\n"))
        return [entry.strip() for entry in entries if entry != '']

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        #Split on whitespace and rejoin - simpler than looping over
        #string.whitespace doing replaces, and avoids importing the
        #string module. See Bug 2684.
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string."""
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces."""
        #NB split(" ") not split() - only plain spaces are collapsed
        return ' '.join(piece for piece in text.split(" ") if piece)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text."""
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a start and end range to python notation.

        In GenBank, starts and ends are defined in "biological" coordinates,
        where 1 is the first base and [i, j] means to include both i and j.

        In python, 0 is the first base and [i, j] means to include i, but
        not j.

        So only the start needs shifting, down by one.
        """
        return start - 1, end
503
504 -class _FeatureConsumer(_BaseGenBankConsumer):
505 """Create a SeqRecord object with Features to return. 506 507 Attributes: 508 o use_fuzziness - specify whether or not to parse with fuzziness in 509 feature locations. 510 o feature_cleaner - a class that will be used to provide specialized 511 cleaning-up of feature values. 512 """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Initialize the consumer with an empty SeqRecord to fill in.

        Arguments:
        o use_fuzziness - whether to parse with fuzziness in feature
        locations.
        o feature_cleaner - optional object used to clean up feature
        qualifier values (may be None for no cleaning).
        """
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ''          # e.g. "DNA" - used to guess strand default
        self._seq_data = []          # accumulated raw sequence chunks
        self._cur_reference = None   # the Reference currently being built
        self._cur_feature = None     # the SeqFeature currently being built
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None
        self._expected_size = None   # sequence length declared in the header
530
    def locus(self, locus_name):
        """Record the LOCUS name as the name of the sequence.
        """
        self.data.name = locus_name
535
    def size(self, content):
        """Record the declared sequence length."""
        self._expected_size = int(content)
539
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.

        (The parameter is unfortunately named ``type``, shadowing the
        builtin - kept as-is for backwards compatibility.)
        """
        self._seq_type = type
544
    def data_file_division(self, division):
        """Record the data file division in the annotations."""
        self.data.annotations['data_file_division'] = division
547
    def date(self, submit_date):
        """Record the date in the annotations."""
        self.data.annotations['date'] = submit_date
550
    def definition(self, definition):
        """Set the definition as the description of the sequence.

        Multiple lines (e.g. EMBL files with two DE lines) are joined
        with single spaces.
        """
        if self.data.description:
            #Append to any existing description
            #e.g. EMBL files with two DE lines.
            self.data.description += " " + definition
        else:
            self.data.description = definition
560
561 - def accession(self, acc_num):
562 """Set the accession number as the id of the sequence. 563 564 If we have multiple accession numbers, the first one passed is 565 used. 566 """ 567 new_acc_nums = self._split_accessions(acc_num) 568 569 #Also record them ALL in the annotations 570 try: 571 #On the off chance there was more than one accession line: 572 for acc in new_acc_nums: 573 #Prevent repeat entries 574 if acc not in self.data.annotations['accessions']: 575 self.data.annotations['accessions'].append(acc) 576 except KeyError: 577 self.data.annotations['accessions'] = new_acc_nums 578 579 # if we haven't set the id information yet, add the first acc num 580 if self.data.id is None: 581 if len(new_acc_nums) > 0: 582 #self.data.id = new_acc_nums[0] 583 #Use the FIRST accession as the ID, not the first on this line! 584 self.data.id = self.data.annotations['accessions'][0]
585
    def wgs(self, content):
        """Record the WGS line (split on "-") in the annotations."""
        self.data.annotations['wgs'] = content.split('-')
588
    def add_wgs_scafld(self, content):
        """Append a WGS_SCAFLD line (split on "-") to the annotations list."""
        self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
591
    def nid(self, content):
        """Record the NID line in the annotations."""
        self.data.annotations['nid'] = content
594
    def pid(self, content):
        """Record the PID line in the annotations."""
        self.data.annotations['pid'] = content
597
598 - def version(self, version_id):
599 #Want to use the versioned accession as the record.id 600 #This comes from the VERSION line in GenBank files, or the 601 #obsolete SV line in EMBL. For the new EMBL files we need 602 #both the version suffix from the ID line and the accession 603 #from the AC line. 604 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 605 self.accession(version_id.split(".")[0]) 606 self.version_suffix(version_id.split(".")[1]) 607 else: 608 #For backwards compatibility... 609 self.data.id = version_id
610
    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.
        PROJECT     GenomeProject:28471

        or:
        PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        #Normalise to the new prefix, then store each cross reference
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend([p for p in content.split() if p])
627 653
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        #e.g. GenBank line:
        #VERSION     U49845.1  GI:1293613
        #or the obsolete EMBL line:
        #SV   U49845.1
        #Scanner calls consumer.version("U49845.1")
        #which then calls consumer.version_suffix(1)
        #
        #e.g. EMBL new line:
        #ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        #Scanner calls consumer.version_suffix(1)
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
673
    def db_source(self, content):
        """Record the DBSOURCE line (trailing whitespace stripped)."""
        self.data.annotations['db_source'] = content.rstrip()
676
    def gi(self, content):
        """Record the GI number in the annotations."""
        self.data.annotations['gi'] = content
679
    def keywords(self, content):
        """Record the KEYWORDS as a clean list in the annotations."""
        self.data.annotations['keywords'] = self._split_keywords(content)
682
    def segment(self, content):
        """Record the SEGMENT line in the annotations."""
        self.data.annotations['segment'] = content
685
686 - def source(self, content):
687 #Note that some software (e.g. VectorNTI) may produce an empty 688 #source (rather than using a dot/period as might be expected). 689 if content == "": 690 source_info = "" 691 elif content[-1] == '.': 692 source_info = content[:-1] 693 else: 694 source_info = content 695 self.data.annotations['source'] = source_info
696
    def organism(self, content):
        """Record the ORGANISM line in the annotations."""
        self.data.annotations['organism'] = content
699
700 - def taxonomy(self, content):
701 """Records (another line of) the taxonomy lineage. 702 """ 703 lineage = self._split_taxonomy(content) 704 try: 705 self.data.annotations['taxonomy'].extend(lineage) 706 except KeyError: 707 self.data.annotations['taxonomy'] = lineage
708
    def reference_num(self, content):
        """Signal the beginning of a new reference object.

        (The reference number itself is not recorded.)
        """
        # if we have a current reference that hasn't been added to
        # the list of references, add it.
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
        else:
            # this is the first REFERENCE - create the list
            self.data.annotations['references'] = []

        self._cur_reference = SeqFeature.Reference()
720
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)

        Raises ValueError if the parenthesised text matches none of
        these forms.
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
            ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations
761
762 - def _split_reference_locations(self, location_string):
763 """Get reference locations out of a string of reference information 764 765 The passed string should be of the form: 766 767 1 to 20; 20 to 100 768 769 This splits the information out and returns a list of location objects 770 based on the reference locations. 771 """ 772 # split possibly multiple locations using the ';' 773 all_base_info = location_string.split(';') 774 775 new_locations = [] 776 for base_info in all_base_info: 777 start, end = base_info.split('to') 778 new_start, new_end = \ 779 self._convert_to_python_numbers(int(start.strip()), 780 int(end.strip())) 781 this_location = SeqFeature.FeatureLocation(new_start, new_end) 782 new_locations.append(this_location) 783 return new_locations
784
    def authors(self, content):
        """Record (another line of) the reference's AUTHORS, space joined.

        NOTE(review): unlike title(), this assumes a REFERENCE line has
        already created self._cur_reference - an AUTHORS line without one
        would raise AttributeError.
        """
        if self._cur_reference.authors:
            self._cur_reference.authors += ' ' + content
        else:
            self._cur_reference.authors = content
790
    def consrtm(self, content):
        """Record (another line of) the reference's CONSRTM, space joined."""
        if self._cur_reference.consrtm:
            self._cur_reference.consrtm += ' ' + content
        else:
            self._cur_reference.consrtm = content
796
    def title(self, content):
        """Record (another line of) the reference's TITLE, space joined.

        Unlike the other reference handlers, this copes with (and warns
        about) a TITLE line appearing before any REFERENCE line.
        """
        if self._cur_reference is None:
            import warnings
            warnings.warn("GenBank TITLE line without REFERENCE line.")
        elif self._cur_reference.title:
            self._cur_reference.title += ' ' + content
        else:
            self._cur_reference.title = content
805
    def journal(self, content):
        """Record (another line of) the reference's JOURNAL, space joined."""
        if self._cur_reference.journal:
            self._cur_reference.journal += ' ' + content
        else:
            self._cur_reference.journal = content
811
    def medline_id(self, content):
        """Record the MEDLINE id on the current reference."""
        self._cur_reference.medline_id = content
814
    def pubmed_id(self, content):
        """Record the PUBMED id on the current reference."""
        self._cur_reference.pubmed_id = content
817
    def remark(self, content):
        """Deal with a reference comment (REMARK), space joined across lines."""
        if self._cur_reference.comment:
            self._cur_reference.comment += ' ' + content
        else:
            self._cur_reference.comment = content
824
    def comment(self, content):
        """Record the COMMENT lines (content is a list of strings).

        Multiple COMMENT blocks accumulate, separated by newlines.
        """
        try:
            self.data.annotations['comment'] += "\n" + "\n".join(content)
        except KeyError:
            self.data.annotations['comment'] = "\n".join(content)
830
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURES line.
        """
        self.start_feature_table()
835
    def start_feature_table(self):
        """Indicate we've got to the start of the feature table.
        """
        # make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
            self._cur_reference = None
843
    def _add_feature(self):
        """Utility function to add a feature to the SeqRecord.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)
859
    def feature_key(self, content):
        """Start a new feature of the given type (finishing any previous one)."""
        # if we already have a feature, add it on
        self._add_feature()

        # start a new feature
        self._cur_feature = SeqFeature.SeqFeature()
        self._cur_feature.type = content

        # assume positive strand to start with if we have DNA or cDNA
        # (labelled as mRNA). The complement in the location will
        # change this later if something is on the reverse strand
        if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0:
            self._cur_feature.strand = 1
873
    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.

        Sets location (and possibly sub_features, strand, ref and
        location_operator) on self._cur_feature.  Raises
        LocationParserError when no pattern matches the cleaned line.

        Note: GenBank coordinates are one-based and inclusive; the int(s)-1
        adjustments below convert to Python's zero-based half-open form.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            # slice [8:...] skips the literal "replace(" prefix (8 chars)
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        #Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            # slice [11:-1] strips "complement(" (11 chars) and ")"
            location_line = location_line[11:-1]
            cur_feature.strand = -1
            #And continue...

        #Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            #e.g. "123..456"
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                              int(e))
            return
        if _re_simple_compound.match(location_line):
            #e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            #we can split on the comma because these are simple locations
            for part in location_line[i+1:-1].split(","):
                s, e = part.split("..")
                f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                     int(e)),
                        location_operator=cur_feature.location_operator,
                        strand=cur_feature.strand, type=cur_feature.type)
                cur_feature.sub_features.append(f)
            # parent location spans from the start of the first part to the
            # end of the last part
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s,e)
            return

        #Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            #e.g. "AL121804.2:41..610"
            if ":" in location_line:
                cur_feature.ref, location_line = location_line.split(":")
            cur_feature.location = _loc(location_line, self._expected_size)
            return
        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            #Can't split on the comma because of positions like one-of(1,2,3)
            for part in _split_compound_loc(location_line[i+1:-1]):
                if part.startswith("complement("):
                    assert part[-1]==")"
                    part = part[11:-1]
                    # a complement() inside an already-complemented line
                    # would be a malformed (double complement) location
                    assert cur_feature.strand != -1, "Double complement?"
                    strand = -1
                else:
                    strand = cur_feature.strand
                if ":" in part:
                    ref, part = part.split(":")
                else:
                    ref = None
                try:
                    loc = _loc(part, self._expected_size)
                except ValueError, err:
                    # dump the offending line/part for debugging before
                    # re-raising (legacy behaviour)
                    print location_line
                    print part
                    raise err
                f = SeqFeature.SeqFeature(location=loc, ref=ref,
                        location_operator=cur_feature.location_operator,
                        strand=strand, type=cur_feature.type)
                cur_feature.sub_features.append(f)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            strands = set(sf.strand for sf in cur_feature.sub_features)
            if len(strands)==1:
                cur_feature.strand = cur_feature.sub_features[0].strand
            else:
                cur_feature.strand = None # i.e. mixed strands
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s,e)
            return
        #Not recognised
        raise LocationParserError(location_line)
976
977 - def _add_qualifier(self):
978 """Add a qualifier to the current feature without loss of info. 979 980 If there are multiple qualifier keys with the same name we 981 would lose some info in the dictionary, so we append a unique 982 number to the end of the name in case of conflicts. 983 """ 984 # if we've got a key from before, add it to the dictionary of 985 # qualifiers 986 if self._cur_qualifier_key: 987 key = self._cur_qualifier_key 988 value = "".join(self._cur_qualifier_value) 989 if self._feature_cleaner is not None: 990 value = self._feature_cleaner.clean_value(key, value) 991 # if the qualifier name exists, append the value 992 if key in self._cur_feature.qualifiers: 993 self._cur_feature.qualifiers[key].append(value) 994 # otherwise start a new list of the key with its values 995 else: 996 self._cur_feature.qualifiers[key] = [value]
997
998 - def feature_qualifier_name(self, content_list):
999 """When we get a qualifier key, use it as a dictionary key. 1000 1001 We receive a list of keys, since you can have valueless keys such as 1002 /pseudo which would be passed in with the next key (since no other 1003 tags separate them in the file) 1004 """ 1005 for content in content_list: 1006 # add a qualifier if we've got one 1007 self._add_qualifier() 1008 1009 # assume the / and = have been removed, and it has been trimmed 1010 assert '/' not in content and '=' not in content \ 1011 and content == content.strip(), content 1012 1013 self._cur_qualifier_key = content 1014 self._cur_qualifier_value = []
1015
1016 - def feature_qualifier_description(self, content):
1017 # get rid of the quotes surrounding the qualifier if we've got 'em 1018 qual_value = content.replace('"', '') 1019 1020 self._cur_qualifier_value.append(qual_value)
1021
1022 - def contig_location(self, content):
1023 """Deal with CONTIG information.""" 1024 #Historically this was stored as a SeqFeature object, but it was 1025 #stored under record.annotations["contig"] and not under 1026 #record.features with the other SeqFeature objects. 1027 # 1028 #The CONTIG location line can include additional tokens like 1029 #Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1030 #location lines, so storing it using SeqFeature based location 1031 #objects is difficult. 1032 # 1033 #We now store this a string, which means for BioSQL we are now in 1034 #much better agreement with how BioPerl records the CONTIG line 1035 #in the database. 1036 # 1037 #NOTE - This code assumes the scanner will return all the CONTIG 1038 #lines already combined into one long string! 1039 self.data.annotations["contig"] = content
1040
    def origin_name(self, content):
        # Any name on the ORIGIN line is deliberately discarded; only the
        # sequence lines that follow it are of interest to this consumer.
        pass
1043
    def base_count(self, content):
        # BASE COUNT data is deliberately ignored by this consumer.
        pass
1046
    def base_number(self, content):
        # The leading position number on each sequence line is deliberately
        # ignored; the sequence text itself is handled by sequence().
        pass
1049
1050 - def sequence(self, content):
1051 """Add up sequence information as we get it. 1052 1053 To try and make things speedier, this puts all of the strings 1054 into a list of strings, and then uses string.join later to put 1055 them together. Supposedly, this is a big time savings 1056 """ 1057 assert ' ' not in content 1058 self._seq_data.append(content.upper())
1059
1060 - def record_end(self, content):
1061 """Clean up when we've finished the record. 1062 """ 1063 from Bio import Alphabet 1064 from Bio.Alphabet import IUPAC 1065 from Bio.Seq import Seq, UnknownSeq 1066 1067 #Try and append the version number to the accession for the full id 1068 if self.data.id is None: 1069 assert 'accessions' not in self.data.annotations, \ 1070 self.data.annotations['accessions'] 1071 self.data.id = self.data.name #Good fall back? 1072 elif self.data.id.count('.') == 0: 1073 try: 1074 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1075 except KeyError: 1076 pass 1077 1078 # add the last feature in the table which hasn't been added yet 1079 self._add_feature() 1080 1081 # add the sequence information 1082 # first, determine the alphabet 1083 # we default to an generic alphabet if we don't have a 1084 # seq type or have strange sequence information. 1085 seq_alphabet = Alphabet.generic_alphabet 1086 1087 # now set the sequence 1088 sequence = "".join(self._seq_data) 1089 1090 if self._expected_size is not None \ 1091 and len(sequence) != 0 \ 1092 and self._expected_size != len(sequence): 1093 import warnings 1094 warnings.warn("Expected sequence length %i, found %i (%s)." \ 1095 % (self._expected_size, len(sequence), self.data.id)) 1096 1097 if self._seq_type: 1098 # mRNA is really also DNA, since it is actually cDNA 1099 if self._seq_type.find('DNA') != -1 or \ 1100 self._seq_type.find('mRNA') != -1: 1101 seq_alphabet = IUPAC.ambiguous_dna 1102 # are there ever really RNA sequences in GenBank? 1103 elif self._seq_type.find('RNA') != -1: 1104 #Even for data which was from RNA, the sequence string 1105 #is usually given as DNA (T not U). Bug 2408 1106 if "T" in sequence and "U" not in sequence: 1107 seq_alphabet = IUPAC.ambiguous_dna 1108 else: 1109 seq_alphabet = IUPAC.ambiguous_rna 1110 elif self._seq_type.upper().find('PROTEIN') != -1: 1111 seq_alphabet = IUPAC.protein # or extended protein? 
1112 # work around ugly GenBank records which have circular or 1113 # linear but no indication of sequence type 1114 elif self._seq_type in ["circular", "linear"]: 1115 pass 1116 # we have a bug if we get here 1117 else: 1118 raise ValueError("Could not determine alphabet for seq_type %s" 1119 % self._seq_type) 1120 1121 if not sequence and self.__expected_size: 1122 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1123 else: 1124 self.data.seq = Seq(sequence, seq_alphabet)
1125
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.

    This consumer builds a Record.Record, keeping a representation close
    to the raw flat-file contents (in contrast to the SeqRecord built by
    the feature consumer).
    """
    def __init__(self):
        """Set up a fresh Record and the transient parsing state."""
        _BaseGenBankConsumer.__init__(self)
        # local import of the sibling Record module (implicit relative
        # import, Python 2 style)
        import Record
        self.data = Record.Record()

        # accumulated sequence chunks, joined at record_end
        self._seq_data = []
        # reference / feature / qualifier currently being built (if any)
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1138
1139 - def wgs(self, content):
1140 self.data.wgs = content.split('-')
1141
1142 - def add_wgs_scafld(self, content):
1143 self.data.wgs_scafld.append(content.split('-'))
1144
    def locus(self, content):
        """Store the locus name from the LOCUS line verbatim."""
        self.data.locus = content
1147
    def size(self, content):
        """Store the sequence size field (kept as the raw string)."""
        self.data.size = content
1150
    def residue_type(self, content):
        """Store the residue type field from the LOCUS line."""
        self.data.residue_type = content
1153
    def data_file_division(self, content):
        """Store the data file division code."""
        self.data.data_file_division = content
1156
    def date(self, content):
        """Store the date field verbatim."""
        self.data.date = content
1159
    def definition(self, content):
        """Store the DEFINITION line text."""
        self.data.definition = content
1162
1163 - def accession(self, content):
1164 for acc in self._split_accessions(content): 1165 if acc not in self.data.accession: 1166 self.data.accession.append(acc)
1167
    def nid(self, content):
        """Store the NID field."""
        self.data.nid = content
1170
    def pid(self, content):
        """Store the PID field."""
        self.data.pid = content
1173
    def version(self, content):
        """Store the VERSION field."""
        self.data.version = content
1176
    def db_source(self, content):
        """Store the DBSOURCE line, with trailing whitespace removed."""
        self.data.db_source = content.rstrip()
1179
    def gi(self, content):
        """Store the GI number field."""
        self.data.gi = content
1182
    def keywords(self, content):
        """Split the KEYWORDS line (shared helper) and store the list."""
        self.data.keywords = self._split_keywords(content)
1185
1186 - def project(self, content):
1187 self.data.projects.extend([p for p in content.split() if p])
1188 1191
    def segment(self, content):
        """Store the SEGMENT field."""
        self.data.segment = content
1194
    def source(self, content):
        """Store the SOURCE line text."""
        self.data.source = content
1197
    def organism(self, content):
        """Store the ORGANISM line text."""
        self.data.organism = content
1200
    def taxonomy(self, content):
        """Split the taxonomy lineage string (shared helper) and store the list."""
        self.data.taxonomy = self._split_taxonomy(content)
1203
1204 - def reference_num(self, content):
1205 """Grab the reference number and signal the start of a new reference. 1206 """ 1207 # check if we have a reference to add 1208 if self._cur_reference is not None: 1209 self.data.references.append(self._cur_reference) 1210 1211 import Record 1212 self._cur_reference = Record.Reference() 1213 self._cur_reference.number = content
1214
    def reference_bases(self, content):
        """Store the base range shown on the REFERENCE line for the current reference."""
        self._cur_reference.bases = content
1217
    def authors(self, content):
        """Store the AUTHORS text on the current reference."""
        self._cur_reference.authors = content
1220
    def consrtm(self, content):
        """Store the CONSRTM (consortium) text on the current reference."""
        self._cur_reference.consrtm = content
1223
1224 - def title(self, content):
1225 if self._cur_reference is None: 1226 import warnings 1227 warnings.warn("GenBank TITLE line without REFERENCE line.") 1228 return 1229 self._cur_reference.title = content
1230
    def journal(self, content):
        """Store the JOURNAL text on the current reference."""
        self._cur_reference.journal = content
1233
    def medline_id(self, content):
        """Store the MEDLINE id on the current reference."""
        self._cur_reference.medline_id = content
1236
    def pubmed_id(self, content):
        """Store the PUBMED id on the current reference."""
        self._cur_reference.pubmed_id = content
1239
    def remark(self, content):
        """Store the REMARK text on the current reference."""
        self._cur_reference.remark = content
1242
1243 - def comment(self, content):
1244 self.data.comment += "\n".join(content)
1245
    def primary_ref_line(self,content):
        """Collect one data line from the PRIMARY table."""
        self.data.primary.append(content)
1249
    def primary(self,content):
        # The PRIMARY header line itself carries nothing we keep; the data
        # rows are handled by primary_ref_line().
        pass
1252
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        # the content of the FEATURES header line itself is not needed
        self.start_feature_table()
1257
1258 - def start_feature_table(self):
1259 """Signal the start of the feature table. 1260 """ 1261 # we need to add on the last reference 1262 if self._cur_reference is not None: 1263 self.data.references.append(self._cur_reference)
1264
1265 - def feature_key(self, content):
1266 """Grab the key of the feature and signal the start of a new feature. 1267 """ 1268 # first add on feature information if we've got any 1269 self._add_feature() 1270 1271 import Record 1272 self._cur_feature = Record.Feature() 1273 self._cur_feature.key = content
1274
1275 - def _add_feature(self):
1276 """Utility function to add a feature to the Record. 1277 1278 This does all of the appropriate checking to make sure we haven't 1279 left any info behind, and that we are only adding info if it 1280 exists. 1281 """ 1282 if self._cur_feature is not None: 1283 # if we have a left over qualifier, add it to the qualifiers 1284 # on the current feature 1285 if self._cur_qualifier is not None: 1286 self._cur_feature.qualifiers.append(self._cur_qualifier) 1287 1288 self._cur_qualifier = None 1289 self.data.features.append(self._cur_feature)
1290
    def location(self, content):
        """Store the feature location string, whitespace stripped by the shared cleaner."""
        self._cur_feature.location = self._clean_location(content)
1293
1294 - def feature_qualifier_name(self, content_list):
1295 """Deal with qualifier names 1296 1297 We receive a list of keys, since you can have valueless keys such as 1298 /pseudo which would be passed in with the next key (since no other 1299 tags separate them in the file) 1300 """ 1301 import Record 1302 for content in content_list: 1303 # the record parser keeps the /s -- add them if we don't have 'em 1304 if content.find("/") != 0: 1305 content = "/%s" % content 1306 # add on a qualifier if we've got one 1307 if self._cur_qualifier is not None: 1308 self._cur_feature.qualifiers.append(self._cur_qualifier) 1309 1310 self._cur_qualifier = Record.Qualifier() 1311 self._cur_qualifier.key = content
1312
    def feature_qualifier_description(self, content):
        """Attach the value text to the qualifier started by feature_qualifier_name."""
        # if we have info then the qualifier key should have a ='s
        if self._cur_qualifier.key.find("=") == -1:
            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        # (remove_space_keys is presumably defined on the base consumer
        # class -- not visible here; TODO confirm)
        for remove_space_key in self.__class__.remove_space_keys:
            if self._cur_qualifier.key.find(remove_space_key) >= 0:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)
1324
    def base_count(self, content):
        """Store the BASE COUNT line content."""
        self.data.base_counts = content
1327
    def origin_name(self, content):
        """Store any name text present on the ORIGIN line."""
        self.data.origin = content
1330
    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        # stored as a cleaned string (whitespace removed), not parsed
        self.data.contig = self._clean_location(content)
1335
1336 - def sequence(self, content):
1337 """Add sequence information to a list of sequence strings. 1338 1339 This removes spaces in the data and uppercases the sequence, and 1340 then adds it to a list of sequences. Later on we'll join this 1341 list together to make the final sequence. This is faster than 1342 adding on the new string every time. 1343 """ 1344 assert ' ' not in content 1345 self._seq_data.append(content.upper())
1346
1347 - def record_end(self, content):
1348 """Signal the end of the record and do any necessary clean-up. 1349 """ 1350 # add together all of the sequence parts to create the 1351 # final sequence string 1352 self.data.sequence = "".join(self._seq_data) 1353 # add on the last feature 1354 self._add_feature()
1355 1356
1357 -def _test():
1358 """Run the Bio.GenBank module's doctests.""" 1359 print "Runing doctests..." 1360 import doctest 1361 doctest.testmod() 1362 print "Done"
1363 1364 if __name__ == "__main__": 1365 _test() 1366