Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

  1  # Copyright 2000-2002 by Andrew Dalke. 
  2  # Revisions copyright 2007-2010 by Peter Cock. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Alphabets used in Seq objects etc to declare sequence type and letters. 
  9   
 10  This is used by sequences which contain a finite number of similar words. 
 11  """ 
 12   
class Alphabet:
    """Base class for the alphabets used to declare sequence type and letters.

    Class attributes:
     - size    - length of a single "word"; None means no fixed size.
     - letters - the permitted letters; None means no fixed alphabet.
    """

    size = None     # default to no fixed size for words
    letters = None  # default to no fixed alphabet
                    # In general, a list-like object.  However,
                    # assuming letters are single characters, use a
                    # string.  This is expected for use with Seq like
                    # objects.

    def __repr__(self):
        """Return the class name followed by "()", e.g. "Alphabet()"."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes."""
        return isinstance(other, self.__class__)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE).

        The result is one of the module level generic alphabet objects,
        chosen from the class of self.  The tests run from the most
        specific class to the least specific one, so the order matters.
        """
        #TODO - remove this method by dealing with things in subclasses?
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        elif isinstance(self, DNAAlphabet):
            return generic_dna
        elif isinstance(self, RNAAlphabet):
            # This used to test NucleotideAlphabet, which mapped every
            # non-DNA nucleotide alphabet to RNA and made the next
            # branch unreachable.
            return generic_rna
        elif isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        elif isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        else:
            return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already upper
        case, otherwise falls back on the generic case-less alphabet.
        The comparison calls letters.upper(), so letters must be a string.
        """
        if not self.letters or self.letters == self.letters.upper():
            #Easy case, no letters or already upper case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already lower
        case, otherwise falls back on the generic case-less alphabet.
        The comparison calls letters.lower(), so letters must be a string.
        """
        if not self.letters or self.letters == self.letters.lower():
            #Easy case, no letters or already lower case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()


generic_alphabet = Alphabet()
class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are exactly one letter long."""

    size = 1
    letters = None   # string of all letters in the alphabet


# Shared generic instance, with no fixed set of letters.
single_letter_alphabet = SingleLetterAlphabet()

########### Protein
class ProteinAlphabet(SingleLetterAlphabet):
    """Single letter alphabet marking a sequence as protein.

    Adds nothing to SingleLetterAlphabet; it exists so that the
    alphabet type can be identified through isinstance checks.
    """


# Shared generic instance, with no fixed set of letters.
generic_protein = ProteinAlphabet()

########### DNA
class NucleotideAlphabet(SingleLetterAlphabet):
    """Single letter alphabet marking a sequence as nucleotide.

    This is the common parent of the DNA and RNA alphabet classes.
    """


# Shared generic instance, with no fixed set of letters.
generic_nucleotide = NucleotideAlphabet()
class DNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet marking a sequence as DNA."""


# Shared generic instance, with no fixed set of letters.
generic_dna = DNAAlphabet()


########### RNA
class RNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet marking a sequence as RNA."""


# Shared generic instance, with no fixed set of letters.
generic_rna = RNAAlphabet()



########### Other per-sequence encodings
class SecondaryStructure(SingleLetterAlphabet):
    """Single letter alphabet restricted to the letters H, S, T and C."""

    letters = "HSTC"
107
class ThreeLetterProtein(Alphabet):
    """Alphabet of three letter words, held as a list of strings.

    Unlike the single letter alphabets, letters here is a list rather
    than a string.
    """

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
        "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
        ]
115 116 ###### Non per-sequence modifications 117 118 # (These are Decorator classes) 119
class AlphabetEncoder:
    """Decorator which wraps an alphabet and adds extra letters to it.

    Attributes:
     - alphabet    - the wrapped alphabet object.
     - new_letters - the letters added by this encoder.
     - letters     - the wrapped alphabet's letters followed by
                     new_letters, or None if the wrapped alphabet
                     defines no letters.

    Any other (non double-underscore) attribute is looked up on the
    wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet, extending its letters with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate unknown attributes to the wrapped alphabet.

        Raises AttributeError for double-underscore names, and for
        'alphabet' itself when it has not been set yet.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # Only reached when normal lookup failed, i.e. the instance
            # has no wrapped alphabet yet (__init__ not run).  Reading
            # self.alphabet below would re-enter this method for ever.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return e.g. "AlphabetEncoder(Alphabet(), 'x')"."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False."""
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())
151 152
class Gapped(AlphabetEncoder):
    """Alphabet decorator which adds a gap character to an alphabet."""

    def __init__(self, alphabet, gap_char = "-"):
        """Wrap alphabet, adding gap_char as its single new letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The gap characters must match, and then the
        wrapped alphabets are compared via their own contains method.
        This fails if the other alphabet does not have a gap character!
        """
        if not (other.gap_char == self.gap_char):
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return Gapped(upper_base, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return Gapped(lower_base, self.gap_char.lower())
175 176
class HasStopCodon(AlphabetEncoder):
    """Alphabet decorator which adds a stop symbol to an alphabet."""

    def __init__(self, alphabet, stop_symbol = "*"):
        """Wrap alphabet, adding stop_symbol as its single new letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Order by wrapped alphabet first, then by stop symbol."""
        base_order = cmp(self.alphabet, other.alphabet)
        if base_order != 0:
            return base_order
        return cmp(self.stop_symbol, other.stop_symbol)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols must match, and then the
        wrapped alphabets are compared via their own contains method.
        This fails if the other alphabet does not have a stop symbol!
        """
        if not (other.stop_symbol == self.stop_symbol):
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return HasStopCodon(upper_base, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return HasStopCodon(lower_base, self.stop_symbol.lower())
205 206
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Peels off AlphabetEncoder layers one at a time until something which
    is not an encoder is reached; that object must be an Alphabet.
    """
    innermost = alphabet
    while isinstance(innermost, AlphabetEncoder):
        innermost = innermost.alphabet
    assert isinstance(innermost, Alphabet), \
           "Invalid alphabet found, %s" % repr(innermost)
    return innermost
215
216 -def _ungap(alphabet):
217 """Returns the alphabet without any gap encoder (PRIVATE).""" 218 #TODO - Handle via method of the objects? 219 if not hasattr(alphabet, "gap_char"): 220 return alphabet 221 elif isinstance(alphabet, Gapped): 222 return alphabet.alphabet 223 elif isinstance(alphabet, HasStopCodon): 224 return HasStopCodon(_ungap(alphabet.alphabet), stop_symbol=alphabet.stop_symbol) 225 elif isinstance(alphabet, AlphabetEncoder): 226 return AlphabetEncoder(_ungap(alphabet.alphabet), letters=alphabet.letters) 227 else: 228 raise NotImplementedError
229
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    Each alphabet is first reduced to its base alphabet, so any
    AlphabetEncoder information (e.g. Gapped alphabets) is thrown away.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!"""
    consensus = None
    for wrapped in alphabets:
        base = _get_base_alphabet(wrapped)
        if consensus is None:
            # First alphabet seen
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same object, or base is an instance of the consensus class
            continue
        if isinstance(consensus, base.__class__):
            # The consensus is an instance of base's class; switch to base
            consensus = base
            continue
        if isinstance(base, NucleotideAlphabet) \
        and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
            continue
        if isinstance(base, SingleLetterAlphabet) \
        and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
            continue
        #We have a major mis-match... take the easy way out!
        return generic_alphabet
    if consensus is None:
        #Given NO alphabets!
        return generic_alphabet
    return consensus
263
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders. This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present

    The alphabets argument is looped over twice (once to find the base
    alphabet and once here), so pass a list rather than a one-shot
    iterator.
    """
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    extra_letters = ""
    for candidate in alphabets:
        #Gaps... all alphabets which have a gap character must agree
        if hasattr(candidate, "gap_char"):
            if gap is None:
                gap = candidate.gap_char
            elif gap != candidate.gap_char:
                raise ValueError("More than one gap character present")
        #Stops... all alphabets which have a stop symbol must agree
        if hasattr(candidate, "stop_symbol"):
            if stop is None:
                stop = candidate.stop_symbol
            elif stop != candidate.stop_symbol:
                raise ValueError("More than one stop symbol present")
        #New letters... kept in first seen order, skipping repeats and
        #the gap and stop characters found so far
        if hasattr(candidate, "new_letters"):
            for letter in candidate.new_letters:
                if letter in extra_letters or letter == gap or letter == stop:
                    continue
                extra_letters += letter

    # Rebuild the encoder layers around the base alphabet, innermost first
    consensus = base
    if extra_letters:
        consensus = AlphabetEncoder(consensus, extra_letters)
    if gap:
        consensus = Gapped(consensus, gap_char=gap)
    if stop:
        consensus = HasStopCodon(consensus, stop_symbol=stop)
    return consensus
339
340 -def _check_type_compatible(alphabets):
341 """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE). 342 343 >>> _check_type_compatible([generic_dna, generic_nucleotide]) 344 True 345 >>> _check_type_compatible([generic_dna, generic_rna]) 346 False 347 >>> _check_type_compatible([generic_dna, generic_protein]) 348 False 349 >>> _check_type_compatible([single_letter_alphabet, generic_protein]) 350 True 351 352 This relies on the Alphabet subclassing hierarchy. It does not 353 check things like gap characters or stop symbols.""" 354 dna, rna, nucl, protein = False, False, False, False 355 for alpha in alphabets: 356 a = _get_base_alphabet(alpha) 357 if isinstance(a, DNAAlphabet): 358 dna = True 359 nucl = True 360 if rna or protein : return False 361 elif isinstance(a, RNAAlphabet): 362 rna = True 363 nucl = True 364 if dna or protein : return False 365 elif isinstance(a, NucleotideAlphabet): 366 nucl = True 367 if protein : return False 368 elif isinstance(a, ProteinAlphabet): 369 protein = True 370 if nucl : return False 371 return True
372
373 -def _verify_alphabet(sequence):
374 """Check all letters in sequence are in the alphabet (PRIVATE). 375 376 >>> from Bio.Seq import Seq 377 >>> from Bio.Alphabet import IUPAC 378 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 379 ... IUPAC.protein) 380 >>> _verify_alphabet(my_seq) 381 True 382 383 This example has an X, which is not in the IUPAC protein alphabet 384 (you should be using the IUPAC extended protein alphabet): 385 386 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX", 387 ... IUPAC.protein) 388 >>> _verify_alphabet(bad_seq) 389 False 390 391 This replaces Bio.utils.verify_alphabet() since we are deprecating 392 that. Potentially this could be added to the Alphabet object, and 393 I would like it to be an option when creating a Seq object... but 394 that might slow things down. 395 """ 396 letters = sequence.alphabet.letters 397 if not letters: 398 raise ValueError("Alphabet does not define letters.") 399 for letter in sequence: 400 if letter not in letters: 401 return False 402 return True
403