Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See U{ http://phyloxml.org/ } for the official specification. 
   9   
  10  See also Han and Zmasek (2009) doi:10.1186/1471-2105-10-356 
  11  """ 
  12  __docformat__ = "epytext en" 
  13   
  14  import re 
  15  import warnings 
  16   
  17  from Bio import Alphabet 
  18  from Bio.Align import MultipleSeqAlignment 
  19  from Bio.Seq import Seq 
  20  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  21  from Bio.SeqRecord import SeqRecord 
  22  import Bio 
  23   
  24  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(Warning):
    """Warning issued when data does not comply with the phyloXML spec."""
30
31 32 -def _check_str(text, testfunc):
33 """Check a string using testfunc, and warn if there's no match.""" 34 if text is not None and not testfunc(text): 35 warnings.warn("String %s doesn't match the given regexp" % text, 36 PhyloXMLWarning, stacklevel=2)
37
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common base class for all PhyloXML objects."""
43
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds an arbitrary number of Phylogeny elements, possibly followed by
    elements from other namespaces.

    @param attributes: (XML namespace definitions)
    @param phylogenies: list of phylogenetic trees
    @param other: list of arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        self.attributes = attributes
        # A false argument (None or an empty list) becomes a fresh list.
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Get a phylogeny by index or name."""
        # Integers and slices are positions in the list of phylogenies.
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        # Otherwise look for the first tree carrying the requested name.
        for tree in self.phylogenies:
            if tree.name == index:
                return tree
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate through the phylogenetic trees in this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Number of phylogenetic trees in this object."""
        return len(self.phylogenies)

    def __str__(self):
        """Show the class name and the contained phylogenies."""
        trees = ',\n'.join([str(tree) for tree in self.phylogenies])
        return '%s([%s])' % (self.__class__.__name__, trees)
83
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    Usually, an Other object will have either a 'value' or a non-empty list
    of 'children', but not both. This is not enforced here, though.

    @param tag: local tag for the XML node
    @param namespace: XML namespace for the node -- should not be the default
        phyloXML namespace.
    @param attributes: string attributes on the XML node
    @param value: text contained directly within this XML node
    @param children: list of child nodes, if any (also Other instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag = tag
        self.namespace = namespace
        # Normalize a missing mapping to an empty dict, the same way
        # 'children' is normalized, so users never have to special-case None.
        self.attributes = attributes or {}
        self.value = value
        self.children = children or []

    def __iter__(self):
        """Iterate through the children of this object (if any)."""
        return iter(self.children)
109
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    @param root: the root node/clade of this tree
    @param rooted: True if this tree is rooted
    @param rerootable: True if this tree is rerootable
    @param branch_length_unit: unit for branch_length values on clades
    @type type: str

    @param name: string identifier for this tree, not required to be unique
    @param id: unique identifier for this tree (type Id)
    @param description: plain-text description
    @param date: date for the root node of this tree (type Date)
    @param confidences: list of Confidence objects for this tree
    @param clade_relations: list of CladeRelation objects
    @param sequence_relations: list of SequenceRelation objects
    @param properties: list of Property objects
    @param other: list of non-phyloXML elements (type Other)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None):
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        # A false collection argument is replaced by a fresh empty list.
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual Phylogeny constructor parameters;
        each one is assigned as an attribute on the new object.
        """
        tree_id = None
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        phy = cls(
                root=Clade.from_clade(tree.root),
                rooted=tree.rooted,
                name=tree.name,
                id=tree_id)
        # Assign through setattr rather than __dict__.update: an instance
        # dict entry cannot override a property, so e.g. 'confidence' would
        # otherwise be stored but never seen.
        for attr, value in kwargs.items():
            setattr(phy, attr, value)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is converted with Clade.from_clade, and the keyword
        arguments are forwarded to that clade's to_phylogeny method.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the BaseTree method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        The keyword arguments become the container's 'attributes'.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree."""
        def is_aligned_seq(elem):
            # A Sequence may have no mol_seq at all (it defaults to None),
            # so check for it before reading is_aligned.
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq is not None
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = seqs.next()
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        # The first sequence determines the alphabet of the alignment.
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidence values and raises
        AttributeError when there is more than one.

        See also: Clade.confidence, Clade.taxonomy
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in a Confidence object. Raises ValueError
        for any other type, or if several confidence values already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both 'color' and 'width' elements should be interpreted by client code as
    applying to the whole clade, including all descendants, unless overwritten
    in sub-clades. This module doesn't automatically assign these attributes
    to sub-clades to achieve this cascade -- and neither should you.

    @param branch_length: parent branch length of this clade
    @param id_source: link other elements to a clade (on the xml-level)

    @param name: short string label for this clade
    @param confidences: list of Confidence objects, used to indicate the
        support for a clade/parent branch.
    @param width: branch width for this clade (including branch from parent)
    @param color: color used for graphical display of this clade
    @param node_id: unique identifier for the root node of this clade
    @param taxonomies: list of Taxonomy objects
    @param sequences: list of Sequence objects
    @param events: describe such events as gene-duplications at the root
        node/parent branch of this clade
    @param binary_characters: a BinaryCharacters object
    @param distributions: list of Distribution objects
    @param date: a date for the root node of this clade (type Date)
    @param references: list of Reference objects
    @param properties: list of Property objects
    @param clades: list of sub-clades (type Clade)
    @param other: list of non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        # Goes through the 'color' property, which validates/converts it.
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        # A false collection argument is replaced by a fresh empty list.
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Keyword arguments are the usual PhyloXML Clade constructor parameters;
        each one is assigned as an attribute on the new clade.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        # Convert the sub-clades recursively.
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        # Assign through setattr rather than __dict__.update so that the
        # 'color', 'confidence' and 'taxonomy' properties are honored instead
        # of being shadowed by a hidden instance-dict entry.
        for attr, value in kwargs.items():
            setattr(new_clade, attr, value)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        Keyword arguments are assigned as attributes on the new Phylogeny.
        """
        phy = Phylogeny(root=self, date=self.date)
        # setattr (not __dict__.update) so Phylogeny properties are honored.
        for attr, value in kwargs.items():
            setattr(phy, attr, value)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence value, or None if there is none.

        Raises AttributeError when more than one value is present.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in a Confidence object. Raises ValueError
        for any other type, or if several confidence values already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Point the user at this class's own list attribute.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy is present.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy.

        Raises ValueError if value is not a Taxonomy instance, or if several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # Point the user at this class's own list attribute.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)

    # Syntax sugar for setting the branch color
    def _get_color(self):
        return self._color

    def _set_color(self, arg):
        """Accept None, a BranchColor, a color name, '#rrggbb' or a triplet.

        Raises ValueError for anything else.
        """
        if arg is None or isinstance(arg, BranchColor):
            self._color = arg
        elif isinstance(arg, basestring):
            if arg in BranchColor.color_names:
                # Known color name
                self._color = BranchColor.from_name(arg)
            elif arg.startswith('#') and len(arg) == 7:
                # HTML-style hex string
                self._color = BranchColor.from_hex(arg)
            else:
                raise ValueError("invalid color string %s" % arg)
        elif hasattr(arg, '__iter__') and len(arg) == 3:
            # RGB triplet
            self._color = BranchColor(*arg)
        else:
            # Wrap in a 1-tuple: a bare tuple operand would be unpacked by
            # the % operator and raise TypeError instead of ValueError.
            raise ValueError("invalid color value %s" % (arg,))

    color = property(_get_color, _set_color, doc="Branch color.")
395
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: In 'UniProtKB:P17304', the Accession instance attribute 'value' is
    'P17304' and the 'source' attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        self.source = source
        self.value = value

    def __str__(self):
        """Show the accession in 'source:value' form."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
412
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute (some
    examples of acceptable values for the ref attribute: 'GO:0008270',
    'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1').

    @type ref: str
    @param source: plain-text source for this annotation
    @param evidence: describe evidence as free text (e.g. 'experimental')
    @type type: str

    @param desc: free text description
    @param confidence: state the type and value of support (type Confidence)
    @param properties: list of typed and referenced annotations from external
        resources
    @type uri: Uri
    """
    # Expected 'prefix:local' shape of the ref attribute.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A non-matching ref only triggers a warning; it is stored anyway.
        _check_str(ref, self.re_ref.match)
        # Attributes
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        # Child nodes
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        # Collection
        self.properties = properties or []
450
class BinaryCharacters(PhyloElement):
    """The names and/or counts of binary characters present, gained, and lost
    at the root of a clade.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        # Attributes
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Collections; a false argument becomes a fresh empty list.
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
471
class BranchColor(PhyloElement):
    """Indicates the color of a clade when rendered graphically.

    The color should be interpreted by client code (e.g. visualization
    programs) as applying to the whole clade, unless overwritten by the
    color(s) of sub-clades.

    Color values must be integers from 0 to 255.
    """

    # Mapping of recognized color names to (red, green, blue) triplets.
    color_names = {
            'red':      (255,   0,   0),
            'r':        (255,   0,   0),
            'yellow':   (255, 255,   0),
            'y':        (255, 255,   0),
            'green':    (  0, 128,   0),
            'g':        (  0, 128,   0),
            'cyan':     (  0, 255, 255),
            'c':        (  0, 255, 255),
            'blue':     (  0,   0, 255),
            'b':        (  0,   0, 255),
            'magenta':  (255,   0, 255),
            'm':        (255,   0, 255),
            'black':    (  0,   0,   0),
            'k':        (  0,   0,   0),
            'white':    (255, 255, 255),
            'w':        (255, 255, 255),
            # Names standardized in HTML/CSS spec
            # http://w3schools.com/html/html_colornames.asp
            'maroon':   (128,   0,   0),
            'olive':    (128, 128,   0),
            'lime':     (  0, 255,   0),
            'aqua':     (  0, 255, 255),
            'teal':     (  0, 128, 128),
            'navy':     (  0,   0, 128),
            'fuchsia':  (255,   0, 255),
            'purple':   (128,   0, 128),
            'silver':   (192, 192, 192),
            'gray':     (128, 128, 128),
            # More definitions from matplotlib/gcolor2
            'grey':     (128, 128, 128),
            'pink':     (255, 192, 203),
            'salmon':   (250, 128, 114),
            'orange':   (255, 165,   0),
            'gold':     (255, 215,   0),
            'tan':      (210, 180, 140),
            'brown':    (165,  42,  42),
            }

    def __init__(self, red, green, blue):
        for channel in (red, green, blue):
            assert (isinstance(channel, int) and
                    0 <= channel <= 255
                    ), "Color values must be integers between 0 and 255."
        self.red = red
        self.green = green
        self.blue = blue

    @classmethod
    def from_hex(cls, hexstr):
        """Construct a BranchColor object from a hexadecimal string.

        The string format is the same style used in HTML and CSS, such as
        '#FF8000' for an RGB value of (255, 128, 0).
        """
        assert (isinstance(hexstr, basestring) and
                hexstr.startswith('#') and
                len(hexstr) == 7
                ), "need a 24-bit hexadecimal string, e.g. #000000"
        # Two hex digits per channel, in red, green, blue order.
        pairs = (hexstr[1:3], hexstr[3:5], hexstr[5:])
        channels = [int('0x' + pair, 16) for pair in pairs]
        return cls(*channels)

    @classmethod
    def from_name(cls, colorname):
        """Construct a BranchColor object by the color's name."""
        triplet = cls.color_names[colorname]
        return cls(*triplet)

    def to_hex(self):
        """Return a 24-bit hexadecimal RGB representation of this color.

        The returned string is suitable for use in HTML/CSS, as a color
        parameter in matplotlib, and perhaps other situations.

        Example:

            >>> bc = BranchColor(12, 200, 100)
            >>> bc.to_hex()
            '#0cc864'
        """
        # Pack the three channels into one 24-bit number, then zero-pad it
        # to six lowercase hex digits.
        packed = (self.red * (16**4)
                  + self.green * (16**2)
                  + self.blue)
        return '#%06x' % packed

    def to_rgb(self):
        """Return a tuple of RGB values (0 to 255) representing this color.

        Example:

            >>> bc = BranchColor(255, 165, 0)
            >>> bc.to_rgb()
            (255, 165, 0)
        """
        return (self.red, self.green, self.blue)

    def __repr__(self):
        """Preserve the standard RGB order when representing this object."""
        fields = (self.__class__.__name__, self.red, self.green, self.blue)
        return u'%s(red=%d, green=%d, blue=%d)' % fields

    def __str__(self):
        """Show the color's RGB values."""
        return "(%d, %d, %d)" % self.to_rgb()
588
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    @type id_ref_0: str
    @type id_ref_1: str
    @type distance: str
    @type type: str

    @type confidence: Confidence
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        # Required values
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        # Optional values
        self.distance = distance
        self.confidence = confidence
609
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the 'type' attribute is 'bootstrap').

    @type value: float
    @type type: str
    """

    def __init__(self, value, type='unknown'):
        self.type = type
        self.value = value

    def __float__(self):
        """Convert the stored value with float()."""
        return float(self.value)

    def __int__(self):
        """Convert the stored value with int()."""
        return int(self.value)
629
class Date(PhyloElement):
    """A date associated with a clade/node.

    Its value can be numerical by using the 'value' element and/or free text
    with the 'desc' element (e.g. 'Silurian'). If a numerical value is used,
    it is recommended to employ the 'unit' attribute.

    @param unit: type of numerical value (e.g. 'mya' for 'million years ago')

    @type value: float
    @param desc: plain-text description of the date
    @param minimum: lower bound on the date value
    @param maximum: upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Show the human-readable date, falling back to the class name."""
        has_numeric_date = self.unit and self.value is not None
        if has_numeric_date:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
660
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    The location can be described either by free text in the 'desc' element
    and/or by the coordinates of one or more 'Points' (similar to the 'Point'
    element in Google's KML format) or by 'Polygons'.
    """

    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # A false collection argument is replaced by a fresh empty list.
        self.points = points or []
        self.polygons = polygons or []
675
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    @param length: total length of the protein sequence (type int)
    @param domains: list of ProteinDomain objects
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Normalize a missing list to an empty one, like the other collection
        # attributes in this module, so 'domains' can always be iterated.
        self.domains = domains or []
686
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.
    """
    # Values accepted for 'type'; anything else only triggers a warning.
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """List the (name, value) pairs of attributes that are not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """List the names of attributes that are not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """List the attribute values that are not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Number of attributes that are not None."""
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under key.

        Raises KeyError if key is not an instance attribute, or if its value
        is None.
        """
        # Look only at instance attributes, the same data keys() reports, so
        # method or class-attribute names (e.g. 'keys', 'ok_type') are not
        # mistaken for stored events.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        """Store val as the attribute named key."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named key back to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of attributes that are not None."""
        return iter(self.keys())

    def __contains__(self, key):
        """True if key is an instance attribute whose value is not None."""
        # Restricted to instance attributes for consistency with keys().
        return self.__dict__.get(key) is not None
738
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.
    """

    def __init__(self, value, provider=None):
        self.value = value
        self.provider = provider

    def __str__(self):
        """Show 'provider:value', or just the value if no provider is set."""
        if self.provider is not None:
            return '%s:%s' % (self.provider, self.value)
        # Format rather than return the raw attribute: __str__ must return a
        # string, and 'value' is not guaranteed to be one (e.g. an integer).
        return '%s' % (self.value,)
754
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    @param value: the sequence, as a string
    @param is_aligned: True if mol_seq is aligned (usu. meaning gaps are
        introduced and all aligned seqs are the same length)
    """
    # Characters expected in a sequence string.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A non-matching value only triggers a warning; it is stored anyway.
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Show the stored sequence."""
        return self.value
772
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    @param geodetic_datum: indicate the geodetic datum (also called 'map
        datum'). For example, Google's KML uses 'WGS84'. (required)
    @param lat: latitude
    @param long: longitude
    @param alt: altitude
    @param alt_unit: unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat = lat
        self.long = long
        # Optional altitude and its unit
        self.alt = alt
        self.alt_unit = alt_unit
792
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    @param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        # A false argument (None or an empty list) becomes a fresh list.
        self.points = points or []

    def __str__(self):
        """Show the class name and the contained points."""
        vertices = ',\n'.join([str(point) for point in self.points])
        return '%s([%s])' % (self.__class__.__name__, vertices)
805
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to 'Phylogeny', 'Clade', and 'Annotation' objects.

    @param ref: reference to an external resource, e.g. "NOAA:depth"

    @param unit: the unit of the property, e.g. "METRIC:m" (optional)

    @param datatype: indicates the type of a property and is limited to
        xsd-datatypes (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer',
        'xsd:decimal', 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').

    @param applies_to: indicates the item to which a property applies to (e.g.
        'node' for the parent node of a clade, 'parent_branch' for the parent
        branch of a clade, or just 'clade').

    @param id_ref: allows to attach a property specifically to one element
        (on the xml-level). (optional)

    @type value: str
    """
    # Expected 'prefix:local' shape, used for both 'ref' and 'unit'.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set([
        'phylogeny', 'clade', 'node', 'annotation', 'parent_branch', 'other',
        ])
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
        ])

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Each check only warns on a mismatch; the values are stored anyway.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        # Required values
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        # Optional values
        self.unit = unit
        self.id_ref = id_ref
854
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    @param start: start of the domain on the sequence, using 0-based indexing
    @type start: non-negative integer
    @param end: end of the domain on the sequence
    @type end: non-negative integer
    @param confidence: can be used to store e.g. E-values. (type float)
    @param id: unique identifier/name
    """
    # TODO: the docstring above states 0-based indexing; confirm whether
    # 'start' as given to this class counts from 1, not 0

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a new domain from a SeqFeature.

        The feature's id becomes the domain's value, its location gives the
        start and end, and its 'confidence' qualifier (if any) is copied.
        """
        # Build through cls so that subclasses get instances of themselves.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning start..end with this domain's value
        as its id and the confidence stored as a qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
892
class Reference(PhyloElement):
    """Literature reference for a clade.

    It is recommended to use the 'doi' attribute instead of the free text
    'desc' element whenever possible.
    """
    # Expected 'prefix/suffix' shape of a DOI.
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # A non-matching DOI only triggers a warning; it is stored anyway.
        _check_str(doi, self.re_doi.match)
        self.desc = desc
        self.doi = doi
906
907 908 -class Sequence(PhyloElement):
909 """A molecular sequence (Protein, DNA, RNA) associated with a node. 910 911 One intended use for 'id_ref' is to link a sequence to a taxonomy (via the 912 taxonomy's 'id_source') in case of multiple sequences and taxonomies per 913 node. 914 915 @param type: type of sequence ('dna', 'rna', or 'protein'). 916 @type id_ref: str 917 @type id_source: str 918 919 @param symbol: short symbol of the sequence, e.g. 'ACTM' (max. 10 chars) 920 @type accession: Accession 921 @param name: full name of the sequence, e.g. 'muscle Actin' 922 @param location: location of a sequence on a genome/chromosome. 923 @type mol_seq: MolSeq 924 @type uri: Uri 925 @param annotations: list of Annotation objects 926 @param domain_architecture: protein domains on this sequence (type 927 DomainArchitecture) 928 @param other: list of non-phyloXML elements (type Other) 929 """ 930 alphabets = {'dna': Alphabet.generic_dna, 931 'rna': Alphabet.generic_rna, 932 'protein': Alphabet.generic_protein} 933 re_symbol = re.compile(r'\S{1,10}') 934
def __init__(self,
             # Attributes
             type=None, id_ref=None, id_source=None,
             # Child nodes
             symbol=None, accession=None, name=None, location=None,
             mol_seq=None, uri=None, domain_architecture=None,
             # Collections
             annotations=None, other=None):
    # Both checks only warn on a mismatch; the values are stored anyway.
    _check_str(type, self.alphabets.__contains__)
    _check_str(symbol, self.re_symbol.match)
    # Attributes
    self.type = type
    self.id_ref = id_ref
    self.id_source = id_source
    # Child nodes
    self.symbol = symbol
    self.accession = accession
    self.name = name
    self.location = location
    self.mol_seq = mol_seq
    self.uri = uri
    self.domain_architecture = domain_architecture
    # Collections; a false argument becomes a fresh empty list.
    self.annotations = annotations or []
    self.other = other or []
958 959 @classmethod
960 - def from_seqrecord(cls, record, is_aligned=None):
961 """Create a new PhyloXML Sequence from a SeqRecord object.""" 962 if is_aligned == None: 963 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped) 964 params = { 965 'accession': Accession(record.id, ''), 966 'symbol': record.name, 967 'name': record.description, 968 'mol_seq': MolSeq(str(record.seq), is_aligned), 969 } 970 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 971 params['type'] = 'dna' 972 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 973 params['type'] = 'rna' 974 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 975 params['type'] = 'protein' 976 977 # Unpack record.annotations 978 for key in ('id_ref', 'id_source', 'location'): 979 if key in record.annotations: 980 params[key] = record.annotations[key] 981 if isinstance(record.annotations.get('uri'), dict): 982 params['uri'] = Uri(**record.annotations['uri']) 983 # Build a Sequence.annotation object 984 if record.annotations.get('annotations'): 985 params['annotations'] = [] 986 for annot in record.annotations['annotations']: 987 ann_args = {} 988 for key in ('ref', 'source', 'evidence', 'type', 'desc'): 989 if key in annot: 990 ann_args[key] = annot[key] 991 if isinstance(annot.get('confidence'), list): 992 ann_args['confidence'] = Confidence( 993 *annot['confidence']) 994 if isinstance(annot.get('properties'), list): 995 ann_args['properties'] = [Property(**prop) 996 for prop in annot['properties'] 997 if isinstance(prop, dict)] 998 params['annotations'].append(Annotation(**ann_args)) 999 1000 # Unpack record.features 1001 if record.features: 1002 params['domain_architecture'] = DomainArchitecture( 1003 length=len(record.seq), 1004 domains=[ProteinDomain.from_seqfeature(feat) 1005 for feat in record.features]) 1006 1007 return Sequence(**params)
1008
1009 - def to_seqrecord(self):
1010 """Create a SeqRecord object from this Sequence instance. 1011 1012 The seqrecord.annotations dictionary is packed like so:: 1013 1014 { # Sequence attributes with no SeqRecord equivalent: 1015 'id_ref': self.id_ref, 1016 'id_source': self.id_source, 1017 'location': self.location, 1018 'uri': { 'value': self.uri.value, 1019 'desc': self.uri.desc, 1020 'type': self.uri.type }, 1021 # Sequence.annotations attribute (list of Annotations) 1022 'annotations': [{ 'ref': ann.ref, 1023 'source': ann.source, 1024 'evidence': ann.evidence, 1025 'type': ann.type, 1026 'confidence': [ ann.confidence.value, 1027 ann.confidence.type ], 1028 'properties': [{ 'value': prop.value, 1029 'ref': prop.ref, 1030 'applies_to': prop.applies_to, 1031 'datatype': prop.datatype, 1032 'unit': prop.unit, 1033 'id_ref': prop.id_ref } 1034 for prop in ann.properties], 1035 } for ann in self.annotations], 1036 } 1037 """ 1038 def clean_dict(dct): 1039 """Remove None-valued items from a dictionary.""" 1040 return dict((key, val) for key, val in dct.iteritems() 1041 if val is not None)
1042 1043 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()), 1044 **clean_dict({ 1045 'id': str(self.accession), 1046 'name': self.symbol, 1047 'description': self.name, 1048 # 'dbxrefs': None, 1049 })) 1050 if self.domain_architecture: 1051 seqrec.features = [dom.to_seqfeature() 1052 for dom in self.domain_architecture.domains] 1053 # Sequence attributes with no SeqRecord equivalent 1054 seqrec.annotations = clean_dict({ 1055 'id_ref': self.id_ref, 1056 'id_source': self.id_source, 1057 'location': self.location, 1058 'uri': self.uri and clean_dict({ 1059 'value': self.uri.value, 1060 'desc': self.uri.desc, 1061 'type': self.uri.type, 1062 }), 1063 'annotations': self.annotations and [ 1064 clean_dict({ 1065 'ref': ann.ref, 1066 'source': ann.source, 1067 'evidence': ann.evidence, 1068 'type': ann.type, 1069 'confidence': ann.confidence and [ 1070 ann.confidence.value, 1071 ann.confidence.type], 1072 'properties': [clean_dict({ 1073 'value': prop.value, 1074 'ref': prop.ref, 1075 'applies_to': prop.applies_to, 1076 'datatype': prop.datatype, 1077 'unit': prop.unit, 1078 'id_ref': prop.id_ref }) 1079 for prop in ann.properties], 1080 }) for ann in self.annotations], 1081 }) 1082 return seqrec
1083
1084 - def get_alphabet(self):
1085 alph = self.alphabets.get(self.type, Alphabet.generic_alphabet) 1086 if self.mol_seq and self.mol_seq.is_aligned: 1087 return Alphabet.Gapped(alph) 1088 return alph
1089
class SequenceRelation(PhyloElement):
    """Typed relationship between a pair of sequences.

    A typical use is recording an orthology between two sequences, in which
    case the 'type' attribute is 'orthology'.

    @param type: kind of relationship; values outside 'ok_type' trigger a
        warning
    @param id_ref_0: reference identifier of the first sequence
    @param id_ref_1: reference identifier of the second sequence
    @param distance: distance between the two sequences (type float)

    @type confidence: Confidence
    """
    # Relationship types accepted without a warning.
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        """Store the relation, warning if 'type' is not a known value."""
        # _check_str only warns; the type is stored unchanged either way.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
1115
class Taxonomy(PhyloElement):
    """Taxonomic information attached to a clade.

    @param id_source: link other elements to a taxonomy (on the XML level)

    @param id: unique identifier of a taxon, e.g. Id('6500',
        provider='ncbi_taxonomy') for the California sea hare
    @param code: store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA'
        for the California sea hare 'Aplysia californica' (restricted string)
    @param scientific_name: the standard scientific name for this organism,
        e.g. 'Aplysia californica' for the California sea hare
    @param authority: keep the authority, such as 'J. G. Cooper, 1863',
        associated with the 'scientific_name'
    @param common_names: list of common names for this organism
    @param synonyms: ???
    @param rank: taxonomic rank (restricted string)
    @type uri: Uri
    @param other: list of non-phyloXML elements (type Other)
    """
    # Pattern an organism code is checked against (warning only).
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
    # Rank names accepted without a warning.
    ok_rank = set([
        'domain',
        'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass',
        'superlegion', 'legion', 'sublegion', 'infralegion',
        'supercohort', 'cohort', 'subcohort', 'infracohort',
        'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily',
        'supertribe', 'tribe', 'subtribe', 'infratribe',
        'genus', 'subgenus',
        'superspecies', 'species', 'subspecies',
        'variety', 'subvariety',
        'form', 'subform',
        'cultivar',
        'unknown', 'other',
    ])

    def __init__(self,
            # Attributes
            id_source=None,
            # Child nodes
            id=None, code=None, scientific_name=None, authority=None,
            rank=None, uri=None,
            # Collections
            common_names=None, synonyms=None, other=None,
            ):
        """Store the given values, warning about a non-compliant code or rank."""
        # Both checks only warn; the values are stored unchanged either way.
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        # Attribute
        self.id_source = id_source
        # Child nodes
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        # Collections: a falsy argument becomes a fresh empty list
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return the first identifying attribute that is set.

        Preference order is code, scientific name, rank, then id (converted
        with str); the class name is used when none of them is set.
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1181
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this holds a URL -- for instance a link to an image on a website,
    where 'type' might be 'image' and 'desc' might be 'image of a California
    sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the URI value, or repr(self) if the value is empty."""
        if not self.value:
            return repr(self)
        return self.value
1199