Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  """Hold GenBank data in a straightforward format. 
  2   
  3  classes: 
  4  o Record - All of the information in a GenBank record. 
  5  o Reference - Hold reference data for a record. 
  6  o Feature - Hold the information in a Feature Table. 
  7  o Qualifier - Qualifiers on a Feature. 
  8  """ 
  9  # local stuff 
 10  import Bio.GenBank 
 11   
12 -def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
13 """Write a line of GenBank info that can wrap over multiple lines. 14 15 This takes a line of information which can potentially wrap over 16 multiple lines, and breaks it up with carriage returns and 17 indentation so it fits properly into a GenBank record. 18 19 Arguments: 20 21 o information - The string holding the information we want 22 wrapped in GenBank method. 23 24 o indent - The indentation on the lines we are writing. 25 26 o wrap_space - Whether or not to wrap only on spaces in the 27 information. 28 29 o split_char - A specific character to split the lines on. By default 30 spaces are used. 31 """ 32 info_length = Record.GB_LINE_LENGTH - indent 33 34 if wrap_space: 35 info_parts = information.split(split_char) 36 else: 37 cur_pos = 0 38 info_parts = [] 39 while cur_pos < len(information): 40 info_parts.append(information[cur_pos: cur_pos + info_length]) 41 cur_pos += info_length 42 43 # first get the information string split up by line 44 output_parts = [] 45 cur_part = "" 46 for info_part in info_parts: 47 if len(cur_part) + 1 + len(info_part) > info_length: 48 if cur_part: 49 if split_char != " ": 50 cur_part += split_char 51 output_parts.append(cur_part) 52 cur_part = info_part 53 else: 54 if cur_part == "": 55 cur_part = info_part 56 else: 57 cur_part += split_char + info_part 58 59 # add the last bit of information to the output 60 if cur_part: 61 output_parts.append(cur_part) 62 63 # now format the information string for return 64 output_info = output_parts[0] + "\n" 65 for output_part in output_parts[1:]: 66 output_info += " " * indent + output_part + "\n" 67 68 return output_info
69
70 -def _indent_genbank(information, indent):
71 """Write out information with the specified indent. 72 73 Unlike _wrapped_genbank, this function makes no attempt to wrap 74 lines -- it assumes that the information already has newlines in the 75 appropriate places, and will add the specified indent to the start of 76 each line. 77 """ 78 # split the info into lines based on line breaks 79 info_parts = information.split("\n") 80 81 # the first line will have no indent 82 output_info = info_parts[0] + "\n" 83 for info_part in info_parts[1:]: 84 output_info += " " * indent + info_part + "\n" 85 86 return output_info
87
class Record:
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written after the DEFINITION keyword.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o projects - A list of the genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - A list of the genome sequencing project number(s) and other
    links (will replace the project information in 2009).
    o primary - A list, initialised empty and not written out by __str__
    (presumably PRIMARY block information -- confirm against the parser).
    """
    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # left-justified keyword columns, padded out to the text column
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # right-justified position number at the start of each sequence line
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        # sections are collected in the order they appear in a flat file
        parts = [self._locus_line(),
                 self._definition_line(),
                 self._accession_line(),
                 self._version_line(),
                 self._project_line(),
                 self._dblink_line(),
                 self._nid_line(),
                 self._pid_line(),
                 self._keywords_line(),
                 self._db_source_line(),
                 self._segment_line(),
                 self._source_line(),
                 self._organism_line()]
        for reference in self.references:
            parts.append(str(reference))
        parts.append(self._comment_line())
        parts.append(self._features_line())
        for feature in self.features:
            parts.append(str(feature))
        parts.append(self._base_count_line())
        parts.append(self._origin_line())
        parts.append(self._sequence_line())
        parts.append(self._contig_line())
        # record terminator
        parts.append("//")
        return "".join(parts)

    def _locus_line(self):
        """Provide the output string for the LOCUS line.
        """
        output = "LOCUS"
        output += " " * 7  # 6-12 spaces
        output += "%-9s" % self.locus
        output += " "  # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10  # spaces for circular
        else:
            output += " " * 3  # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10  # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7  # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line.

        Always written, even when the definition is empty.
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line.

        Returns an empty string when there are no accession numbers.
        """
        if not self.accession:
            return ""
        # space separated; rstrip drops any trailing blank left by an
        # empty final entry
        acc_info = " ".join(["%s" % acc for acc in self.accession]).rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line.

        Returns an empty string when no version is set.  The GI tag is
        written whenever the line is written, whether or not gi is set.
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        output += " GI:"
        output += "%s\n" % self.gi
        return output

    def _project_line(self):
        """Output for the PROJECT line (empty string if there are none).
        """
        output = ""
        if self.projects:
            output = Record.BASE_FORMAT % "PROJECT"
            output += "%s\n" % " ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK line(s) (empty string if there are none).

        NOTE(review): __str__ calls this method, but its definition was not
        present in the listing this class was taken from.  This body is a
        reconstruction: one entry of self.dblinks per output line, with
        continuation lines indented to the base text column.  Confirm
        against the upstream implementation.
        """
        output = ""
        if self.dblinks:
            output = Record.BASE_FORMAT % "DBLINK"
            output += _indent_genbank("\n".join(self.dblinks),
                                      Record.GB_BASE_INDENT)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written; with no keywords it holds a lone period.
        """
        # keywords are separated by '; ' and the list is closed by a period
        keyword_info = "; ".join(["%s" % kw for kw in self.keywords]) + "."
        output = Record.BASE_FORMAT % "KEYWORDS"
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.
        """
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line.
        """
        output = ""
        if self.segment:
            output += Record.BASE_FORMAT % "SEGMENT"
            output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from.

        Always written, even when the source is empty.
        """
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.

        The organism name goes on the ORGANISM line itself; the taxonomy
        follows on indented lines, '; ' separated and closed by a period.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        output += "%s\n" % self.organism
        # the first taxonomy line needs its indent written by hand, since
        # _wrapped_genbank only indents continuation lines
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join(["%s" % tax for tax in self.taxonomy]) + "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines.
        """
        output = ""
        if self.comment:
            output += Record.BASE_FORMAT % "COMMENT"
            output += _indent_genbank(self.comment,
                                      Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line.

        Only the table header is produced here, and only when there is at
        least one feature; the features themselves are written by __str__.
        """
        output = ""
        if self.features:
            output += Record.BASE_FEATURE_FORMAT % "FEATURES"
            output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.
        """
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        # deal with the standard case, with a normal BASE COUNT line
        # like: 474 a 356 c 428 g 364 t
        if len(count_parts) % 2 == 0:
            for pos in range(0, len(count_parts), 2):
                output += "%7s %s" % (count_parts[pos], count_parts[pos + 1])
        # deal with ugly BASE COUNT lines like:
        # 1311257 a2224835 c2190093 g1309889 t
        # by just outputting the raw information
        else:
            output += self.base_counts
        return output + "\n"

    def _origin_line(self):
        """Output for the ORIGIN line
        """
        # only output the ORIGIN line if we have a sequence
        if not self.sequence:
            return ""
        output = Record.BASE_FORMAT % "ORIGIN"
        if self.origin:
            output += _wrapped_genbank(self.origin,
                                       Record.GB_BASE_INDENT)
        else:
            output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        Each line starts with the 1-based position of its first residue and
        holds up to six space-separated groups of ten lower-cased residues.
        """
        pieces = []
        seq_length = len(self.sequence)
        for line_start in range(0, seq_length, 60):
            pieces.append(Record.SEQUENCE_FORMAT % str(line_start + 1))
            # stop at the end of the sequence so that no empty group (and
            # hence no trailing space) is written on the last line
            line_end = min(line_start + 60, seq_length)
            for start_pos in range(line_start, line_end, 10):
                seq_section = self.sequence[start_pos:start_pos + 10]
                pieces.append(" %s" % seq_section.lower())
            pieces.append("\n")
        return "".join(pieces)

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq.
        """
        output = ""
        if self.contig:
            output += Record.BASE_FORMAT % "CONTIG"
            # locations are comma separated, so wrap on commas
            output += _wrapped_genbank(self.contig,
                                       Record.GB_BASE_INDENT,
                                       split_char=',')
        return output
474
class Reference:
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """

    def __init__(self):
        # every field starts out as an empty string
        for field in ("number", "bases", "authors", "consrtm", "title",
                      "journal", "medline_id", "pubmed_id", "remark"):
            setattr(self, field, '')

    def __str__(self):
        """Return the whole reference block in GenBank flat file layout."""
        sections = [self._reference_line(),
                    self._authors_line(),
                    self._consrtm_line(),
                    self._title_line(),
                    self._journal_line(),
                    self._medline_line(),
                    self._pubmed_line(),
                    self._remark_line()]
        return "".join(sections)

    def _wrapped_entry(self, label, text):
        """Return an indented sub-keyword line with wrapped text.

        Gives an empty string when there is no text to write.
        """
        if not text:
            return ""
        return (Record.INTERNAL_FORMAT % label
                + _wrapped_genbank(text, Record.GB_BASE_INDENT))

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written; the number (and the bases, when
        present) follow it only if a number is set.
        """
        line = Record.BASE_FORMAT % "REFERENCE"
        if self.number:
            if self.bases:
                line += "%-3s" % self.number
                line += "%s" % self.bases
            else:
                line += "%s" % self.number
        return line + "\n"

    def _authors_line(self):
        """Output for AUTHORS information.
        """
        return self._wrapped_entry("AUTHORS", self.authors)

    def _consrtm_line(self):
        """Output for CONSRTM information.
        """
        return self._wrapped_entry("CONSRTM", self.consrtm)

    def _title_line(self):
        """Output for TITLE information.
        """
        return self._wrapped_entry("TITLE", self.title)

    def _journal_line(self):
        """Output for JOURNAL information.
        """
        return self._wrapped_entry("JOURNAL", self.journal)

    def _medline_line(self):
        """Output for MEDLINE information.
        """
        if not self.medline_id:
            return ""
        # written as is; identifiers are not wrapped
        return Record.INTERNAL_FORMAT % "MEDLINE" + (self.medline_id + "\n")

    def _pubmed_line(self):
        """Output for PUBMED information.
        """
        if not self.pubmed_id:
            return ""
        # PUBMED uses the three-space variant of the sub-keyword format
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + \
               (self.pubmed_id + "\n")

    def _remark_line(self):
        """Output for REMARK information.
        """
        return self._wrapped_entry("REMARK", self.remark)
588
class Feature:
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def _wraps_on_spaces(self, qualifier_key):
        """Return 1 if a qualifier with this key may be wrapped on spaces.

        Gives 0 when the key contains any of the entries listed in
        Bio.GenBank._BaseGenBankConsumer.remove_space_keys.
        """
        for no_space_key in \
                Bio.GenBank._BaseGenBankConsumer.remove_space_keys:
            if no_space_key in qualifier_key:
                return 0
        return 1

    def __str__(self):
        """Return the feature and its qualifiers in feature table layout."""
        # key column followed by the location, which wraps on commas
        pieces = [Record.INTERNAL_FEATURE_FORMAT % self.key,
                  _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                                   split_char=',')]
        for qualifier in self.qualifiers:
            # each qualifier starts on its own line in the qualifier column
            pieces.append(" " * Record.GB_FEATURE_INDENT)
            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           self._wraps_on_spaces(
                                               qualifier.key)))
        return "".join(pieces)
619
class Qualifier:
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        # both parts start out empty
        self.key = self.value = ''
630