1
2
3
4
5
6 """Classes corresponding to phyloXML elements.
7
8 See U{ http://phyloxml.org/ } for the official specification.
9
10 See also Han and Zmasek (2009) doi:10.1186/1471-2105-10-356
11 """
12 __docformat__ = "epytext en"
13
14 import re
15 import warnings
16
17 from Bio import Alphabet
18 from Bio.Align import MultipleSeqAlignment
19 from Bio.Seq import Seq
20 from Bio.SeqFeature import SeqFeature, FeatureLocation
21 from Bio.SeqRecord import SeqRecord
22 import Bio
23
24 from Bio.Phylo import BaseTree
28 """Warning for non-compliance with the phyloXML specification."""
29 pass
30
33 """Check a string using testfunc, and warn if there's no match."""
34 if text is not None and not testfunc(text):
35 warnings.warn("String %s doesn't match the given regexp" % text,
36 PhyloXMLWarning, stacklevel=2)
37
42 """Base class for all PhyloXML objects."""
43
46 """Root node of the PhyloXML document.
47
48 Contains an arbitrary number of Phylogeny elements, possibly followed by
49 elements from other namespaces.
50
51 @param attributes: (XML namespace definitions)
52 @param phylogenies: list of phylogenetic trees
53 @param other: list of arbitrary non-phyloXML elements, if any
54 """
55 - def __init__(self, attributes, phylogenies=None, other=None):
56 self.attributes = attributes
57 self.phylogenies = phylogenies or []
58 self.other = other or []
59
61 """Get a phylogeny by index or name."""
62 if isinstance(index, int) or isinstance(index, slice):
63 return self.phylogenies[index]
64 if not isinstance(index, basestring):
65 raise KeyError("can't use %s as an index" % type(index))
66 for tree in self.phylogenies:
67 if tree.name == index:
68 return tree
69 else:
70 raise KeyError("no phylogeny found with name " + repr(index))
71
73 """Iterate through the phylogenetic trees in this object."""
74 return iter(self.phylogenies)
75
77 """Number of phylogenetic trees in this object."""
78 return len(self.phylogenies)
79
81 return '%s([%s])' % (self.__class__.__name__,
82 ',\n'.join(map(str, self.phylogenies)))
83
84
85 -class Other(PhyloElement):
86 """Container for non-phyloXML elements in the tree.
87
88 Usually, an Other object will have either a 'value' or a non-empty list
89 of 'children', but not both. This is not enforced here, though.
90
91 @param tag: local tag for the XML node
92 @param namespace: XML namespace for the node -- should not be the default
93 phyloXML namespace.
94 @param attributes: string attributes on the XML node
95 @param value: text contained directly within this XML node
96 @param children: list of child nodes, if any (also Other instances)
97 """
98 - def __init__(self, tag, namespace=None, attributes=None, value=None,
99 children=None):
100 self.tag = tag
101 self.namespace = namespace
102 self.attributes = attributes
103 self.value = value
104 self.children = children or []
105
107 """Iterate through the children of this object (if any)."""
108 return iter(self.children)
109
110
111 -class Phylogeny(PhyloElement, BaseTree.Tree):
112 """A phylogenetic tree.
113
114 @param root: the root node/clade of this tree
115 @param rooted: True if this tree is rooted
116 @param rerootable: True if this tree is rerootable
117 @param branch_length_unit: unit for branch_length values on clades
118 @type type: str
119
120 @param name: string identifier for this tree, not required to be unique
121 @param id: unique identifier for this tree (type Id)
122 @param description: plain-text description
123 @param date: date for the root node of this tree (type Date)
124 @param confidences: list of Confidence objects for this tree
125 @param clade_relations: list of CladeRelation objects
126 @param sequence_relations: list of SequenceRelation objects
127 @param properties: list of Property objects
128 @param other: list of non-phyloXML elements (type Other)
129 """
130 - def __init__(self, root=None, rooted=True,
131 rerootable=None, branch_length_unit=None, type=None,
132
133 name=None, id=None, description=None, date=None,
134
135 confidences=None, clade_relations=None, sequence_relations=None,
136 properties=None, other=None,
137 ):
138 assert isinstance(rooted, bool)
139 self.root = root
140 self.rooted = rooted
141 self.rerootable = rerootable
142 self.branch_length_unit = branch_length_unit
143 self.type = type
144 self.name = name
145 self.id = id
146 self.description = description
147 self.date = date
148 self.confidences = confidences or []
149 self.clade_relations = clade_relations or []
150 self.sequence_relations = sequence_relations or []
151 self.properties = properties or []
152 self.other = other or []
153
154 @classmethod
156 """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).
157
158 Keyword arguments are the usual Phylogeny constructor parameters.
159 """
160 phy = cls(
161 root=Clade.from_clade(tree.root),
162 rooted=tree.rooted,
163 name=tree.name,
164 id=(tree.id is not None) and Id(str(tree.id)) or None)
165 phy.__dict__.update(kwargs)
166 return phy
167
168 @classmethod
170 """Create a new Phylogeny given a Newick or BaseTree Clade object.
171
172 Keyword arguments are the usual PhyloXML Clade constructor parameters.
173 """
174 return Clade.from_clade(clade).to_phylogeny(**kwargs)
175
177 """Return this tree, a PhyloXML-compatible Phylogeny object.
178
179 Overrides the BaseTree method.
180 """
181 return self
182
184 """Create a new Phyloxml object containing just this phylogeny."""
185 return Phyloxml(kwargs, phylogenies=[self])
186
188 """Construct an alignment from the aligned sequences in this tree."""
189 def is_aligned_seq(elem):
190 if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
191 return True
192 return False
193 seqs = self._filter_search(is_aligned_seq, 'preorder', True)
194 try:
195 first_seq = seqs.next()
196 except StopIteration:
197
198 return MultipleSeqAlignment([])
199 msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
200 first_seq.get_alphabet())
201 msa.extend(seq.to_seqrecord() for seq in seqs)
202 return msa
203
204
206 """Equivalent to self.confidences[0] if there is only 1 value.
207
208 See also: Clade.confidence, Clade.taxonomy
209 """
210 if len(self.confidences) == 0:
211 return None
212 if len(self.confidences) > 1:
213 raise AttributeError("more than 1 confidence value available; "
214 "use Phylogeny.confidences")
215 return self.confidences[0]
216
218 if value is None:
219
220 self.confidences = []
221 return
222 if isinstance(value, float) or isinstance(value, int):
223 value = Confidence(value)
224 elif not isinstance(value, Confidence):
225 raise ValueError("value must be a number or Confidence instance")
226 if len(self.confidences) == 0:
227 self.confidences.append(value)
228 elif len(self.confidences) == 1:
229 self.confidences[0] = value
230 else:
231 raise ValueError("multiple confidence values already exist; "
232 "use Phylogeny.confidences instead")
233
235 self.confidences = []
236
237 confidence = property(_get_confidence, _set_confidence, _del_confidence)
238
239
240 -class Clade(PhyloElement, BaseTree.Clade):
241 """Describes a branch of the current phylogenetic tree.
242
243 Used recursively, describes the topology of a phylogenetic tree.
244
245 Both 'color' and 'width' elements should be interpreted by client code as
246 applying to the whole clade, including all descendants, unless overwritten
247 in sub-clades. This module doesn't automatically assign these attributes to
248 sub-clades to achieve this cascade -- and neither should you.
249
250 @param branch_length: parent branch length of this clade
251 @param id_source: link other elements to a clade (on the xml-level)
252
253 @param name: short string label for this clade
254 @param confidences: list of Confidence objects, used to indicate the
255 support for a clade/parent branch.
256 @param width: branch width for this clade (including branch from parent)
257 @param color: color used for graphical display of this clade
258 @param node_id: unique identifier for the root node of this clade
259 @param taxonomies: list of Taxonomy objects
260 @param sequences: list of Sequence objects
261 @param events: describe such events as gene-duplications at the root
262 node/parent branch of this clade
263 @param binary_characters: a BinaryCharacters object
264 @param distributions: list of Distribution objects
265 @param date: a date for the root node of this clade (type Date)
266 @param references: list of Reference objects
267 @param properties: list of Property objects
268 @param clades: list of sub-clades (type Clade)
269 @param other: list of non-phyloXML objects
270 """
271 - def __init__(self,
272
273 branch_length=None, id_source=None,
274
275 name=None, width=None, color=None, node_id=None, events=None,
276 binary_characters=None, date=None,
277
278 confidences=None, taxonomies=None, sequences=None,
279 distributions=None, references=None, properties=None, clades=None,
280 other=None,
281 ):
299
300 @classmethod
312
314 """Create a new phylogeny containing just this clade."""
315 phy = Phylogeny(root=self, date=self.date)
316 phy.__dict__.update(kwargs)
317 return phy
318
319
320
322 if len(self.confidences) == 0:
323 return None
324 if len(self.confidences) > 1:
325 raise AttributeError("more than 1 confidence value available; "
326 "use Clade.confidences")
327 return self.confidences[0]
328
330 if value is None:
331
332 self.confidences = []
333 return
334 if isinstance(value, float) or isinstance(value, int):
335 value = Confidence(value)
336 elif not isinstance(value, Confidence):
337 raise ValueError("value must be a number or Confidence instance")
338 if len(self.confidences) == 0:
339 self.confidences.append(value)
340 elif len(self.confidences) == 1:
341 self.confidences[0] = value
342 else:
343 raise ValueError("multiple confidence values already exist; "
344 "use Phylogeny.confidences instead")
345
347 self.confidences = []
348
349 confidence = property(_get_confidence, _set_confidence, _del_confidence)
350
352 if len(self.taxonomies) == 0:
353 return None
354 if len(self.taxonomies) > 1:
355 raise AttributeError("more than 1 taxonomy value available; "
356 "use Clade.taxonomies")
357 return self.taxonomies[0]
358
360 if not isinstance(value, Taxonomy):
361 raise ValueError("assigned value must be a Taxonomy instance")
362 if len(self.taxonomies) == 0:
363 self.taxonomies.append(value)
364 elif len(self.taxonomies) == 1:
365 self.taxonomies[0] = value
366 else:
367 raise ValueError("multiple taxonomy values already exist; "
368 "use Phylogeny.taxonomies instead")
369
370 taxonomy = property(_get_taxonomy, _set_taxonomy)
371
372
375
377 if arg is None or isinstance(arg, BranchColor):
378 self._color = arg
379 elif isinstance(arg, basestring):
380 if arg in BranchColor.color_names:
381
382 self._color = BranchColor.from_name(arg)
383 elif arg.startswith('#') and len(arg) == 7:
384
385 self._color = BranchColor.from_hex(arg)
386 else:
387 raise ValueError("invalid color string %s" % arg)
388 elif hasattr(arg, '__iter__') and len(arg) == 3:
389
390 self._color = BranchColor(*arg)
391 else:
392 raise ValueError("invalid color value %s" % arg)
393
394 color = property(_get_color, _set_color, doc="Branch color.")
395
400 """Captures the local part in a sequence identifier.
401
402 Example: In 'UniProtKB:P17304', the Accession instance attribute 'value' is
403 'P17304' and the 'source' attribute is 'UniProtKB'.
404 """
408
410 """Show the class name and an identifying attribute."""
411 return '%s:%s' % (self.source, self.value)
412
415 """The annotation of a molecular sequence.
416
417 It is recommended to annotate by using the optional 'ref' attribute (some
418 examples of acceptable values for the ref attribute: 'GO:0008270',
419 'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1').
420
421 @type ref: str
422 @param source: plain-text source for this annotation
423 @param evidence: describe evidence as free text (e.g. 'experimental')
424 @type type: str
425
426 @param desc: free text description
427 @param confidence: state the type and value of support (type Confidence)
428 @param properties: list of typed and referenced annotations from external
429 resources
430 @type uri: Uri
431 """
432 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
433
434 - def __init__(self,
435
436 ref=None, source=None, evidence=None, type=None,
437
438 desc=None, confidence=None, uri=None,
439
440 properties=None):
450
453 """The names and/or counts of binary characters present, gained, and lost
454 at the root of a clade.
455 """
456 - def __init__(self,
457
458 type=None, gained_count=None, lost_count=None, present_count=None,
459 absent_count=None,
460
461 gained=None, lost=None, present=None, absent=None):
462 self.type=type
463 self.gained_count=gained_count
464 self.lost_count=lost_count
465 self.present_count=present_count
466 self.absent_count=absent_count
467 self.gained=gained or []
468 self.lost=lost or []
469 self.present=present or []
470 self.absent=absent or []
471
474 """Indicates the color of a clade when rendered graphically.
475
476 The color should be interpreted by client code (e.g. visualization
477 programs) as applying to the whole clade, unless overwritten by the
478 color(s) of sub-clades.
479
480 Color values must be integers from 0 to 255.
481 """
482
483 color_names = {
484 'red': (255, 0, 0),
485 'r': (255, 0, 0),
486 'yellow': (255, 255, 0),
487 'y': (255, 255, 0),
488 'green': ( 0, 128, 0),
489 'g': ( 0, 128, 0),
490 'cyan': ( 0, 255, 255),
491 'c': ( 0, 255, 255),
492 'blue': ( 0, 0, 255),
493 'b': ( 0, 0, 255),
494 'magenta': (255, 0, 255),
495 'm': (255, 0, 255),
496 'black': ( 0, 0, 0),
497 'k': ( 0, 0, 0),
498 'white': (255, 255, 255),
499 'w': (255, 255, 255),
500
501
502 'maroon': (128, 0, 0),
503 'olive': (128, 128, 0),
504 'lime': ( 0, 255, 0),
505 'aqua': ( 0, 255, 255),
506 'teal': ( 0, 128, 128),
507 'navy': ( 0, 0, 128),
508 'fuchsia': (255, 0, 255),
509 'purple': (128, 0, 128),
510 'silver': (192, 192, 192),
511 'gray': (128, 128, 128),
512
513 'grey': (128, 128, 128),
514 'pink': (255, 192, 203),
515 'salmon': (250, 128, 114),
516 'orange': (255, 165, 0),
517 'gold': (255, 215, 0),
518 'tan': (210, 180, 140),
519 'brown': (165, 42, 42),
520 }
521
530
531 @classmethod
533 """Construct a BranchColor object from a hexadecimal string.
534
535 The string format is the same style used in HTML and CSS, such as
536 '#FF8000' for an RGB value of (255, 128, 0).
537 """
538 assert (isinstance(hexstr, basestring) and
539 hexstr.startswith('#') and
540 len(hexstr) == 7
541 ), "need a 24-bit hexadecimal string, e.g. #000000"
542 def unpack(cc):
543 return int('0x'+cc, base=16)
544 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:]
545 return cls(*map(unpack, RGB))
546
547 @classmethod
549 """Construct a BranchColor object by the color's name."""
550 return cls(*cls.color_names[colorname])
551
553 """Return a 24-bit hexadecimal RGB representation of this color.
554
555 The returned string is suitable for use in HTML/CSS, as a color
556 parameter in matplotlib, and perhaps other situations.
557
558 Example:
559
560 >>> bc = BranchColor(12, 200, 100)
561 >>> bc.to_hex()
562 '#0cc864'
563 """
564 return '#' + hex(
565 self.red * (16**4)
566 + self.green * (16**2)
567 + self.blue)[2:].zfill(6)
568
570 """Return a tuple of RGB values (0 to 255) representing this color.
571
572 Example:
573
574 >>> bc = BranchColor(255, 165, 0)
575 >>> bc.to_rgb()
576 (255, 165, 0)
577 """
578 return (self.red, self.green, self.blue)
579
581 """Preserve the standard RGB order when representing this object."""
582 return (u'%s(red=%d, green=%d, blue=%d)'
583 % (self.__class__.__name__, self.red, self.green, self.blue))
584
586 """Show the color's RGB values."""
587 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
588
591 """Expresses a typed relationship between two clades.
592
593 For example, this could be used to describe multiple parents of a clade.
594
595 @type id_ref_0: str
596 @type id_ref_1: str
597 @type distance: str
598 @type type: str
599
600 @type confidence: Confidence
601 """
602 - def __init__(self, type, id_ref_0, id_ref_1,
603 distance=None, confidence=None):
609
612 """A general purpose confidence element.
613
614 For example, this can be used to express the bootstrap support value of a
615 clade (in which case the 'type' attribute is 'bootstrap').
616
617 @type value: float
618 @type type: str
619 """
620 - def __init__(self, value, type='unknown'):
623
625 return float(self.value)
626
628 return int(self.value)
629
630
631 -class Date(PhyloElement):
632 """A date associated with a clade/node.
633
634 Its value can be numerical by using the 'value' element and/or free text
635 with the 'desc' element (e.g. 'Silurian'). If a numerical value is used, it
636 is recommended to employ the 'unit' attribute.
637
638 @param unit: type of numerical value (e.g. 'mya' for 'million years ago')
639
640 @type value: float
641 @param desc: plain-text description of the date
642 @param minimum: lower bound on the date value
643 @param maximum: upper bound on the date value
644 """
645 - def __init__(self, value=None, unit=None, desc=None,
646 minimum=None, maximum=None):
652
654 """Show the class name and the human-readable date."""
655 if self.unit and self.value is not None:
656 return '%s %s' % (self.value, self.unit)
657 if self.desc is not None:
658 return self.desc
659 return self.__class__.__name__
660
663 """Geographic distribution of the items of a clade (species, sequences).
664
665 Intended for phylogeographic applications.
666
667 The location can be described either by free text in the 'desc' element
668 and/or by the coordinates of one or more 'Points' (similar to the 'Point'
669 element in Google's KML format) or by 'Polygons'.
670 """
671 - def __init__(self, desc=None, points=None, polygons=None):
672 self.desc = desc
673 self.points = points or []
674 self.polygons = polygons or []
675
676
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    @param length: total length of the protein sequence (type int)
    @param domains: list of ProteinDomain objects; when omitted or empty, a
        new empty list is stored
    """
    def __init__(self, length=None, domains=None):
        self.length = length
        # Always expose a list (never None) so that code iterating over
        # 'domains' works for an architecture created without any domains.
        self.domains = domains or []
686
687
688 -class Events(PhyloElement):
689 """Events at the root node of a clade (e.g. one gene duplication).
690
691 All attributes are set to None by default, but this object can also be
692 treated as a dictionary, in which case None values are treated as missing
693 keys and deleting a key resets that attribute's value back to None.
694 """
695 ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
696 'mixed', 'unassigned'))
697
698 - def __init__(self, type=None, duplications=None, speciations=None,
699 losses=None, confidence=None):
706
708 return [(k, v) for k, v in self.__dict__.iteritems() if v is not None]
709
711 return [k for k, v in self.__dict__.iteritems() if v is not None]
712
714 return [v for v in self.__dict__.itervalues() if v is not None]
715
718
720 if not hasattr(self, key):
721 raise KeyError(key)
722 val = getattr(self, key)
723 if val is None:
724 raise KeyError("%s has not been set in this object" % repr(key))
725 return val
726
728 setattr(self, key, val)
729
731 setattr(self, key, None)
732
734 return iter(self.keys())
735
737 return (hasattr(self, key) and getattr(self, key) is not None)
738
739
740 -class Id(PhyloElement):
741 """A general-purpose identifier element.
742
743 Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
744 along with the value itself.
745 """
746 - def __init__(self, value, provider=None):
747 self.value = value
748 self.provider = provider
749
751 if self.provider is not None:
752 return '%s:%s' % (self.provider, self.value)
753 return self.value
754
755
756 -class MolSeq(PhyloElement):
757 """Store a molecular sequence.
758
759 @param value: the sequence, as a string
760 @param is_aligned: True if mol_seq is aligned (usu. meaning gaps are
761 introduced and all aligned seqs are the same length)
762 """
763 re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')
764
765 - def __init__(self, value, is_aligned=None):
769
772
773
class Point(PhyloElement):
    """A geographic point: coordinates plus an optional altitude.

    Used by element 'Distribution'.

    @param geodetic_datum: the geodetic datum (also called 'map datum') the
        coordinates refer to, e.g. 'WGS84' as used by Google's KML
        (required)
    @param lat: latitude
    @param long: longitude
    @param alt: altitude (optional)
    @param alt_unit: unit for the altitude, e.g. 'meter' (optional)
    """
    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        # All values are stored exactly as passed in.
        self.geodetic_datum = geodetic_datum
        self.lat, self.long = lat, long
        self.alt, self.alt_unit = alt, alt_unit
792
795 """A polygon defined by a list of 'Points' (used by element 'Distribution').
796
797 @param points: list of 3 or more points representing vertices.
798 """
800 self.points = points or []
801
803 return '%s([%s])' % (self.__class__.__name__,
804 ',\n'.join(map(str, self.points)))
805
808 """A typed and referenced property from an external resource.
809
810 Can be attached to 'Phylogeny', 'Clade', and 'Annotation' objects.
811
812 @param ref: reference to an external resource, e.g. "NOAA:depth"
813
814 @param unit: the unit of the property, e.g. "METRIC:m" (optional)
815
816 @param datatype: indicates the type of a property and is limited to
817 xsd-datatypes (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer',
818 'xsd:decimal', 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
819
820 @param applies_to: indicates the item to which a property applies (e.g.
821 'node' for the parent node of a clade, 'parent_branch' for the parent
822 branch of a clade, or just 'clade').
823
824 @param id_ref: allows a property to be attached specifically to one element
825 (on the xml-level). (optional)
826
827 @type value: str
828 """
829 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
830 ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
831 'parent_branch', 'other'))
832 ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
833 'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
834 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
835 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
836 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
837 'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
838 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
839 'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
840 'xsd:positiveInteger'))
841
842 - def __init__(self, value, ref, applies_to, datatype,
843 unit=None, id_ref=None):
854
855
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    @param value: label of the domain; used as the SeqFeature id by
        to_seqfeature and filled from the feature id by from_seqfeature
    @param start: start of the domain on the sequence, using 0-based indexing
    @type start: non-negative integer
    @param end: end of the domain on the sequence
    @type end: non-negative integer
    @param confidence: can be used to store e.g. E-values. (type float)
    @param id: unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's id becomes 'value', the location's nofuzzy_start and
        nofuzzy_end become 'start' and 'end', and the 'confidence' qualifier
        (None if absent) becomes 'confidence'.
        """
        # Build through 'cls' rather than naming ProteinDomain, so calling
        # this on a subclass returns an instance of that subclass.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning start..end, with 'value' as its id.

        The domain's confidence is copied into the feature's 'confidence'
        qualifier whenever the attribute exists on this object. __init__
        always sets it, so a confidence of None is copied as None.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
892
895 """Literature reference for a clade.
896
897 It is recommended to use the 'doi' attribute instead of the free text
898 'desc' element whenever possible.
899 """
900 re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')
901
902 - def __init__(self, doi=None, desc=None):
906
909 """A molecular sequence (Protein, DNA, RNA) associated with a node.
910
911 One intended use for 'id_ref' is to link a sequence to a taxonomy (via the
912 taxonomy's 'id_source') in case of multiple sequences and taxonomies per
913 node.
914
915 @param type: type of sequence ('dna', 'rna', or 'protein').
916 @type id_ref: str
917 @type id_source: str
918
919 @param symbol: short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
920 @type accession: Accession
921 @param name: full name of the sequence, e.g. 'muscle Actin'
922 @param location: location of a sequence on a genome/chromosome.
923 @type mol_seq: MolSeq
924 @type uri: Uri
925 @param annotations: list of Annotation objects
926 @param domain_architecture: protein domains on this sequence (type
927 DomainArchitecture)
928 @param other: list of non-phyloXML elements (type Other)
929 """
930 alphabets = {'dna': Alphabet.generic_dna,
931 'rna': Alphabet.generic_rna,
932 'protein': Alphabet.generic_protein}
933 re_symbol = re.compile(r'\S{1,10}')
934
935 - def __init__(self,
936
937 type=None, id_ref=None, id_source=None,
938
939 symbol=None, accession=None, name=None, location=None,
940 mol_seq=None, uri=None, domain_architecture=None,
941
942 annotations=None, other=None,
943 ):
958
959 @classmethod
961 """Create a new PhyloXML Sequence from a SeqRecord object."""
962 if is_aligned == None:
963 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
964 params = {
965 'accession': Accession(record.id, ''),
966 'symbol': record.name,
967 'name': record.description,
968 'mol_seq': MolSeq(str(record.seq), is_aligned),
969 }
970 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
971 params['type'] = 'dna'
972 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
973 params['type'] = 'rna'
974 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
975 params['type'] = 'protein'
976
977
978 for key in ('id_ref', 'id_source', 'location'):
979 if key in record.annotations:
980 params[key] = record.annotations[key]
981 if isinstance(record.annotations.get('uri'), dict):
982 params['uri'] = Uri(**record.annotations['uri'])
983
984 if record.annotations.get('annotations'):
985 params['annotations'] = []
986 for annot in record.annotations['annotations']:
987 ann_args = {}
988 for key in ('ref', 'source', 'evidence', 'type', 'desc'):
989 if key in annot:
990 ann_args[key] = annot[key]
991 if isinstance(annot.get('confidence'), list):
992 ann_args['confidence'] = Confidence(
993 *annot['confidence'])
994 if isinstance(annot.get('properties'), list):
995 ann_args['properties'] = [Property(**prop)
996 for prop in annot['properties']
997 if isinstance(prop, dict)]
998 params['annotations'].append(Annotation(**ann_args))
999
1000
1001 if record.features:
1002 params['domain_architecture'] = DomainArchitecture(
1003 length=len(record.seq),
1004 domains=[ProteinDomain.from_seqfeature(feat)
1005 for feat in record.features])
1006
1007 return Sequence(**params)
1008
1010 """Create a SeqRecord object from this Sequence instance.
1011
1012 The seqrecord.annotations dictionary is packed like so::
1013
1014 { # Sequence attributes with no SeqRecord equivalent:
1015 'id_ref': self.id_ref,
1016 'id_source': self.id_source,
1017 'location': self.location,
1018 'uri': { 'value': self.uri.value,
1019 'desc': self.uri.desc,
1020 'type': self.uri.type },
1021 # Sequence.annotations attribute (list of Annotations)
1022 'annotations': [{ 'ref': ann.ref,
1023 'source': ann.source,
1024 'evidence': ann.evidence,
1025 'type': ann.type,
1026 'confidence': [ ann.confidence.value,
1027 ann.confidence.type ],
1028 'properties': [{ 'value': prop.value,
1029 'ref': prop.ref,
1030 'applies_to': prop.applies_to,
1031 'datatype': prop.datatype,
1032 'unit': prop.unit,
1033 'id_ref': prop.id_ref }
1034 for prop in ann.properties],
1035 } for ann in self.annotations],
1036 }
1037 """
1038 def clean_dict(dct):
1039 """Remove None-valued items from a dictionary."""
1040 return dict((key, val) for key, val in dct.iteritems()
1041 if val is not None)
1042
1043 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
1044 **clean_dict({
1045 'id': str(self.accession),
1046 'name': self.symbol,
1047 'description': self.name,
1048
1049 }))
1050 if self.domain_architecture:
1051 seqrec.features = [dom.to_seqfeature()
1052 for dom in self.domain_architecture.domains]
1053
1054 seqrec.annotations = clean_dict({
1055 'id_ref': self.id_ref,
1056 'id_source': self.id_source,
1057 'location': self.location,
1058 'uri': self.uri and clean_dict({
1059 'value': self.uri.value,
1060 'desc': self.uri.desc,
1061 'type': self.uri.type,
1062 }),
1063 'annotations': self.annotations and [
1064 clean_dict({
1065 'ref': ann.ref,
1066 'source': ann.source,
1067 'evidence': ann.evidence,
1068 'type': ann.type,
1069 'confidence': ann.confidence and [
1070 ann.confidence.value,
1071 ann.confidence.type],
1072 'properties': [clean_dict({
1073 'value': prop.value,
1074 'ref': prop.ref,
1075 'applies_to': prop.applies_to,
1076 'datatype': prop.datatype,
1077 'unit': prop.unit,
1078 'id_ref': prop.id_ref })
1079 for prop in ann.properties],
1080 }) for ann in self.annotations],
1081 })
1082 return seqrec
1083
1089
1092 """Express a typed relationship between two sequences.
1093
1094 For example, this could be used to describe an orthology (in which case
1095 attribute 'type' is 'orthology').
1096
1097 @param id_ref_0: first sequence reference identifier
1098 @param id_ref_1: second sequence reference identifier
1099 @param distance: distance between the two sequences (type float)
1100 @param type: describe the type of relationship
1101
1102 @type confidence: Confidence
1103 """
1104 ok_type = set(('orthology', 'one_to_one_orthology', 'super_orthology',
1105 'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other'))
1106
1107 - def __init__(self, type, id_ref_0, id_ref_1,
1108 distance=None, confidence=None):
1115
1118 """Describe taxonomic information for a clade.
1119
1120 @param id_source: link other elements to a taxonomy (on the XML level)
1121
1122 @param id: unique identifier of a taxon, e.g. Id('6500',
1123 provider='ncbi_taxonomy') for the California sea hare
1124 @param code: store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA'
1125 for the California sea hare 'Aplysia californica' (restricted string)
1126 @param scientific_name: the standard scientific name for this organism,
1127 e.g. 'Aplysia californica' for the California sea hare
1128 @param authority: keep the authority, such as 'J. G. Cooper, 1863',
1129 associated with the 'scientific_name'
1130 @param common_names: list of common names for this organism
1131 @param synonyms: ???
1132 @param rank: taxonomic rank (restricted string)
1133 @type uri: Uri
1134 @param other: list of non-phyloXML elements (type Other)
1135 """
1136 re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
1137 ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
1138 'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
1139 'superdivision', 'division', 'subdivision', 'infradivision',
1140 'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
1141 'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
1142 'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
1143 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
1144 'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
1145 'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
1146 'unknown', 'other'))
1147
1148 - def __init__(self,
1149
1150 id_source=None,
1151
1152 id=None, code=None, scientific_name=None, authority=None,
1153 rank=None, uri=None,
1154
1155 common_names=None, synonyms=None, other=None,
1156 ):
1169
1171 """Show the class name and an identifying attribute."""
1172 if self.code is not None:
1173 return self.code
1174 if self.scientific_name is not None:
1175 return self.scientific_name
1176 if self.rank is not None:
1177 return self.rank
1178 if self.id is not None:
1179 return str(self.id)
1180 return self.__class__.__name__
1181
1182
1183 -class Uri(PhyloElement):
1184 """A uniform resource identifier.
1185
1186 In general, this is expected to be an URL (for example, to link to an image
1187 on a website, in which case the 'type' attribute might be 'image' and 'desc'
1188 might be 'image of a California sea hare').
1189 """
1190 - def __init__(self, value, desc=None, type=None):
1194
1196 if self.value:
1197 return self.value
1198 return repr(self)
1199