Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio import Alphabet 
 20  from Bio.Seq import Seq, UnknownSeq 
 21  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 22  from Bio import SeqFeature 
 23   
24 -class DBSeq(Seq): # This implements the biopython Seq interface
25 - def __init__(self, primary_id, adaptor, alphabet, start, length):
26 """Create a new DBSeq object referring to a BioSQL entry. 27 28 You wouldn't normally create a DBSeq object yourself, this is done 29 for you when retreiving a DBSeqRecord object from the database. 30 """ 31 self.primary_id = primary_id 32 self.adaptor = adaptor 33 self.alphabet = alphabet 34 self._length = length 35 self.start = start
36
37 - def __len__(self):
38 return self._length
39
40 - def __getitem__(self, index) : # Seq API requirement
41 #Note since Python 2.0, __getslice__ is deprecated 42 #and __getitem__ is used instead. 43 #See http://docs.python.org/ref/sequence-methods.html 44 if isinstance(index, int): 45 #Return a single letter as a string 46 i = index 47 if i < 0: 48 if -i > self._length: 49 raise IndexError(i) 50 i = i + self._length 51 elif i >= self._length: 52 raise IndexError(i) 53 return self.adaptor.get_subseq_as_string(self.primary_id, 54 self.start + i, 55 self.start + i + 1) 56 if not isinstance(index, slice): 57 raise ValueError("Unexpected index type") 58 59 #Return the (sub)sequence as another DBSeq or Seq object 60 #(see the Seq obect's __getitem__ method) 61 if index.start is None: 62 i=0 63 else: 64 i = index.start 65 if i < 0: 66 #Map to equavilent positive index 67 if -i > self._length: 68 raise IndexError(i) 69 i = i + self._length 70 elif i >= self._length: 71 #Trivial case, should return empty string! 72 i = self._length 73 74 if index.stop is None: 75 j = self._length 76 else: 77 j = index.stop 78 if j < 0: 79 #Map to equavilent positive index 80 if -j > self._length: 81 raise IndexError(j) 82 j = j + self._length 83 elif j >= self._length: 84 j = self._length 85 86 if i >= j: 87 #Trivial case, empty string. 88 return Seq("", self.alphabet) 89 elif index.step is None or index.step == 1: 90 #Easy case - can return a DBSeq with the start and end adjusted 91 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 92 self.start + i, j - i) 93 else: 94 #Tricky. Will have to create a Seq object because of the stride 95 full = self.adaptor.get_subseq_as_string(self.primary_id, 96 self.start + i, 97 self.start + j) 98 return Seq(full[::index.step], self.alphabet) 99
100 - def tostring(self):
101 """Returns the full sequence as a python string. 102 103 Although not formally deprecated, you are now encouraged to use 104 str(my_seq) instead of my_seq.tostring().""" 105 return self.adaptor.get_subseq_as_string(self.primary_id, 106 self.start, 107 self.start + self._length)
108 - def __str__(self):
109 """Returns the full sequence as a python string.""" 110 return self.adaptor.get_subseq_as_string(self.primary_id, 111 self.start, 112 self.start + self._length)
113 114 data = property(tostring, doc="Sequence as string (DEPRECATED)") 115
116 - def toseq(self):
117 """Returns the full sequence as a Seq object.""" 118 #Note - the method name copies that of the MutableSeq object 119 return Seq(str(self), self.alphabet)
120
121 - def __add__(self, other):
122 #Let the Seq object deal with the alphabet issues etc 123 return self.toseq() + other
124
125 - def __radd__(self, other):
126 #Let the Seq object deal with the alphabet issues etc 127 return other + self.toseq()
128 129
130 -def _retrieve_seq(adaptor, primary_id):
131 #The database schema ensures there will be only one matching 132 #row in the table. 133 134 #If an UnknownSeq was recorded, seq will be NULL, 135 #but length will be populated. This means length(seq) 136 #will return None. 137 seqs = adaptor.execute_and_fetchall( 138 "SELECT alphabet, length, length(seq) FROM biosequence" \ 139 " WHERE bioentry_id = %s", (primary_id,)) 140 if not seqs : return 141 assert len(seqs) == 1 142 moltype, given_length, length = seqs[0] 143 144 try: 145 length = int(length) 146 given_length = int(length) 147 assert length == given_length 148 have_seq = True 149 except TypeError: 150 assert length is None 151 seqs = adaptor.execute_and_fetchall( 152 "SELECT alphabet, length, seq FROM biosequence" \ 153 " WHERE bioentry_id = %s", (primary_id,)) 154 assert len(seqs) == 1 155 moltype, given_length, seq = seqs[0] 156 assert seq is None or seq=="" 157 length = int(given_length) 158 have_seq = False 159 del seq 160 del given_length 161 162 moltype = moltype.lower() #might be upper case in database 163 #We have no way of knowing if these sequences will use IUPAC 164 #alphabets, and we certainly can't assume they are unambiguous! 165 if moltype == "dna": 166 alphabet = Alphabet.generic_dna 167 elif moltype == "rna": 168 alphabet = Alphabet.generic_rna 169 elif moltype == "protein": 170 alphabet = Alphabet.generic_protein 171 elif moltype == "unknown": 172 #This is used in BioSQL/Loader.py and would happen 173 #for any generic or nucleotide alphabets. 174 alphabet = Alphabet.single_letter_alphabet 175 else: 176 raise AssertionError("Unknown moltype: %s" % moltype) 177 178 if have_seq: 179 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 180 else: 181 return UnknownSeq(length, alphabet)
182
183 -def _retrieve_dbxrefs(adaptor, primary_id):
184 """Retrieve the database cross references for the sequence.""" 185 _dbxrefs = [] 186 dbxrefs = adaptor.execute_and_fetchall( 187 "SELECT dbname, accession, version" \ 188 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \ 189 " WHERE bioentry_id = %s" \ 190 " ORDER BY rank", (primary_id,)) 191 for dbname, accession, version in dbxrefs: 192 if version and version != "0": 193 v = "%s.%s" % (accession, version) 194 else: 195 v = accession 196 _dbxrefs.append("%s:%s" % (dbname, v)) 197 return _dbxrefs
198
199 -def _retrieve_features(adaptor, primary_id):
200 sql = "SELECT seqfeature_id, type.name, rank" \ 201 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 202 " WHERE bioentry_id = %s" \ 203 " ORDER BY rank" 204 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 205 seq_feature_list = [] 206 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 207 # Get qualifiers [except for db_xref which is stored separately] 208 qvs = adaptor.execute_and_fetchall( 209 "SELECT name, value" \ 210 " FROM seqfeature_qualifier_value join term using (term_id)" \ 211 " WHERE seqfeature_id = %s" \ 212 " ORDER BY rank", (seqfeature_id,)) 213 qualifiers = {} 214 for qv_name, qv_value in qvs: 215 qualifiers.setdefault(qv_name, []).append(qv_value) 216 # Get db_xrefs [special case of qualifiers] 217 qvs = adaptor.execute_and_fetchall( 218 "SELECT dbxref.dbname, dbxref.accession" \ 219 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \ 220 " WHERE seqfeature_dbxref.seqfeature_id = %s" \ 221 " ORDER BY rank", (seqfeature_id,)) 222 for qv_name, qv_value in qvs: 223 value = "%s:%s" % (qv_name, qv_value) 224 qualifiers.setdefault("db_xref", []).append(value) 225 # Get locations 226 results = adaptor.execute_and_fetchall( 227 "SELECT location_id, start_pos, end_pos, strand" \ 228 " FROM location" \ 229 " WHERE seqfeature_id = %s" \ 230 " ORDER BY rank", (seqfeature_id,)) 231 locations = [] 232 # convert to Python standard form 233 # Convert strand = 0 to strand = None 234 # re: comment in Loader.py: 235 # Biopython uses None when we don't know strand information but 236 # BioSQL requires something (non null) and sets this as zero 237 # So we'll use the strand or 0 if Biopython spits out None 238 for location_id, start, end, strand in results: 239 if start: 240 start -= 1 241 if strand == 0: 242 strand = None 243 if strand not in (+1, -1, None): 244 raise ValueError("Invalid strand %s found in database for " \ 245 "seqfeature_id %s" % (strand, seqfeature_id)) 246 if end < start: 247 import warnings 248 warnings.warn("Inverted location start/end (%i and %i) for " \ 249 "seqfeature_id %s" % (start, end, seqfeature_id)) 250 locations.append( (location_id, start, end, strand) ) 251 # Get possible remote reference information 252 remote_results = adaptor.execute_and_fetchall( 253 "SELECT location_id, dbname, accession, version" \ 254 " FROM location join dbxref using (dbxref_id)" \ 255 " WHERE seqfeature_id = %s", (seqfeature_id,)) 256 lookup = {} 257 for location_id, dbname, accession, version in remote_results: 258 if version and version != "0": 259 v = "%s.%s" % (accession, version) 260 else: 261 v = accession 262 # subfeature remote location db_ref are stored as a empty string when 263 # not present 264 if dbname == "": 265 dbname = None 266 lookup[location_id] = (dbname, v) 267 268 feature = SeqFeature.SeqFeature(type = seqfeature_type) 269 feature._seqfeature_id = seqfeature_id #Store the key as a private property 270 feature.qualifiers = qualifiers 271 if len(locations) == 0: 272 pass 273 elif len(locations) == 1: 274 location_id, start, end, strand = locations[0] 275 #See Bug 2677, we currently don't record the location_operator 276 #For consistency with older versions Biopython, default to "". 277 feature.location_operator = \ 278 _retrieve_location_qualifier_value(adaptor, location_id) 279 dbname, version = lookup.get(location_id, (None, None)) 280 feature.location = SeqFeature.FeatureLocation(start, end) 281 feature.strand = strand 282 feature.ref_db = dbname 283 feature.ref = version 284 else: 285 assert feature.sub_features == [] 286 for location in locations: 287 location_id, start, end, strand = location 288 dbname, version = lookup.get(location_id, (None, None)) 289 subfeature = SeqFeature.SeqFeature() 290 subfeature.type = seqfeature_type 291 subfeature.location_operator = \ 292 _retrieve_location_qualifier_value(adaptor, location_id) 293 #TODO - See Bug 2677 - we don't yet record location_operator, 294 #so for consistency with older versions of Biopython default 295 #to assuming its a join. 296 if not subfeature.location_operator: 297 subfeature.location_operator="join" 298 subfeature.location = SeqFeature.FeatureLocation(start, end) 299 subfeature.strand = strand 300 subfeature.ref_db = dbname 301 subfeature.ref = version 302 feature.sub_features.append(subfeature) 303 # Assuming that the feature loc.op is the same as the sub_feature 304 # loc.op: 305 feature.location_operator = \ 306 feature.sub_features[0].location_operator 307 # Locations are in order, but because of remote locations for 308 # sub-features they are not necessarily in numerical order: 309 start = locations[0][1] 310 end = locations[-1][2] 311 feature.location = SeqFeature.FeatureLocation(start, end) 312 # To get the parent strand (as done when parsing GenBank files), 313 # need to consider evil mixed strand examples like this, 314 # join(complement(69611..69724),139856..140087,140625..140650) 315 strands = set(sf.strand for sf in feature.sub_features) 316 if len(strands)==1: 317 feature.strand = feature.sub_features[0].strand 318 else: 319 feature.strand = None # i.e. mixed strands 320 321 seq_feature_list.append(feature) 322 323 return seq_feature_list
324
325 -def _retrieve_location_qualifier_value(adaptor, location_id):
326 value = adaptor.execute_and_fetch_col0( 327 "SELECT value FROM location_qualifier_value" \ 328 " WHERE location_id = %s", (location_id,)) 329 try: 330 return value[0] 331 except IndexError: 332 return ""
333
334 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
335 annotations = {} 336 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 337 annotations.update(_retrieve_reference(adaptor, primary_id)) 338 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 339 annotations.update(_retrieve_comment(adaptor, primary_id)) 340 # Convert values into strings in cases of unicode from the database. 341 # BioSQL could eventually be expanded to be unicode aware. 342 str_anns = {} 343 for key, val in annotations.items(): 344 if isinstance(val, list): 345 val = [_make_unicode_into_string(x) for x in val] 346 elif isinstance(val, unicode): 347 val = str(val) 348 str_anns[key] = val 349 return str_anns
350
351 -def _make_unicode_into_string(text):
352 if isinstance(text, unicode): 353 return str(text) 354 else : 355 return text
356
357 -def _retrieve_qualifier_value(adaptor, primary_id):
358 qvs = adaptor.execute_and_fetchall( 359 "SELECT name, value" \ 360 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \ 361 " WHERE bioentry_id = %s" \ 362 " ORDER BY rank", (primary_id,)) 363 qualifiers = {} 364 for name, value in qvs: 365 if name == "keyword": name = "keywords" 366 #See handling of "date" in Loader.py 367 elif name == "date_changed": name = "date" 368 elif name == "secondary_accession": name = "accessions" 369 qualifiers.setdefault(name, []).append(value) 370 return qualifiers
371
372 -def _retrieve_reference(adaptor, primary_id):
373 # XXX dbxref_qualifier_value 374 375 refs = adaptor.execute_and_fetchall( 376 "SELECT start_pos, end_pos, " \ 377 " location, title, authors," \ 378 " dbname, accession" \ 379 " FROM bioentry_reference" \ 380 " JOIN reference USING (reference_id)" \ 381 " LEFT JOIN dbxref USING (dbxref_id)" \ 382 " WHERE bioentry_id = %s" \ 383 " ORDER BY rank", (primary_id,)) 384 references = [] 385 for start, end, location, title, authors, dbname, accession in refs: 386 reference = SeqFeature.Reference() 387 #If the start/end are missing, reference.location is an empty list 388 if (start is not None) or (end is not None): 389 if start is not None: start -= 1 #python counting 390 reference.location = [SeqFeature.FeatureLocation(start, end)] 391 #Don't replace the default "" with None. 392 if authors : reference.authors = authors 393 if title : reference.title = title 394 reference.journal = location 395 if dbname == 'PUBMED': 396 reference.pubmed_id = accession 397 elif dbname == 'MEDLINE': 398 reference.medline_id = accession 399 references.append(reference) 400 if references: 401 return {'references': references} 402 else: 403 return {}
404
405 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
406 a = {} 407 common_names = adaptor.execute_and_fetch_col0( 408 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 409 " AND name_class = 'genbank common name'", (taxon_id,)) 410 if common_names: 411 a['source'] = common_names[0] 412 scientific_names = adaptor.execute_and_fetch_col0( 413 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 414 " AND name_class = 'scientific name'", (taxon_id,)) 415 if scientific_names: 416 a['organism'] = scientific_names[0] 417 ncbi_taxids = adaptor.execute_and_fetch_col0( 418 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 419 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 420 a['ncbi_taxid'] = ncbi_taxids[0] 421 422 #Old code used the left/right values in the taxon table to get the 423 #taxonomy lineage in one SQL command. This was actually very slow, 424 #and would fail if the (optional) left/right values were missing. 425 # 426 #The following code is based on a contribution from Eric Gibert, and 427 #relies on the taxon table's parent_taxon_id field only (ignoring the 428 #optional left/right values). This means that it has to make a 429 #separate SQL query for each entry in the lineage, but it does still 430 #appear to be *much* faster. See Bug 2494. 431 taxonomy = [] 432 while taxon_id: 433 name, rank, parent_taxon_id = adaptor.execute_one( 434 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \ 435 " FROM taxon, taxon_name" \ 436 " WHERE taxon.taxon_id=taxon_name.taxon_id" \ 437 " AND taxon_name.name_class='scientific name'" \ 438 " AND taxon.taxon_id = %s", (taxon_id,)) 439 if taxon_id == parent_taxon_id: 440 # If the taxon table has been populated by the BioSQL script 441 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 442 # Personally, I would have used a NULL parent_taxon_id here. 443 break 444 if rank != "no rank": 445 #For consistency with older versions of Biopython, we are only 446 #interested in taxonomy entries with a stated rank. 447 #Add this to the start of the lineage list. 448 taxonomy.insert(0, name) 449 taxon_id = parent_taxon_id 450 451 if taxonomy: 452 a['taxonomy'] = taxonomy 453 return a
454
455 -def _retrieve_comment(adaptor, primary_id):
456 qvs = adaptor.execute_and_fetchall( 457 "SELECT comment_text FROM comment" \ 458 " WHERE bioentry_id=%s" \ 459 " ORDER BY rank", (primary_id,)) 460 comments = [comm[0] for comm in qvs] 461 #Don't want to add an empty list... 462 if comments: 463 return {"comment": comments} 464 else: 465 return {}
466
467 -class DBSeqRecord(SeqRecord):
468 """BioSQL equivalent of the biopython SeqRecord object. 469 """ 470
471 - def __init__(self, adaptor, primary_id):
472 self._adaptor = adaptor 473 self._primary_id = primary_id 474 475 (self._biodatabase_id, self._taxon_id, self.name, 476 accession, version, self._identifier, 477 self._division, self.description) = self._adaptor.execute_one( 478 "SELECT biodatabase_id, taxon_id, name, accession, version," \ 479 " identifier, division, description" \ 480 " FROM bioentry" \ 481 " WHERE bioentry_id = %s", (self._primary_id,)) 482 if version and version != "0": 483 self.id = "%s.%s" % (accession, version) 484 else: 485 self.id = accession 486 #We don't yet record any per-letter-annotations in the 487 #BioSQL database, but we should set this property up 488 #for completeness (and the __str__ method). 489 try: 490 length = len(self.seq) 491 except: 492 #Could be no sequence in the database! 493 length = 0 494 self._per_letter_annotations = _RestrictedDict(length=length)
495
496 - def __get_seq(self):
497 if not hasattr(self, "_seq"): 498 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 499 return self._seq
500 - def __set_seq(self, seq): self._seq = seq
501 - def __del_seq(self): del self._seq
502 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 503
504 - def __get_dbxrefs(self):
505 if not hasattr(self,"_dbxrefs"): 506 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 507 return self._dbxrefs
508 - def __set_dbxrefs(self, dbxrefs): self._dbxrefs = dbxrefs
509 - def __del_dbxrefs(self): del self._dbxrefs
510 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 511 "Database cross references") 512
513 - def __get_features(self):
514 if not hasattr(self, "_features"): 515 self._features = _retrieve_features(self._adaptor, 516 self._primary_id) 517 return self._features
518 - def __set_features(self, features): self._features = features
519 - def __del_features(self): del self._features
520 features = property(__get_features, __set_features, __del_features, 521 "Features") 522
523 - def __get_annotations(self):
524 if not hasattr(self, "_annotations"): 525 self._annotations = _retrieve_annotations(self._adaptor, 526 self._primary_id, 527 self._taxon_id) 528 if self._identifier: 529 self._annotations["gi"] = self._identifier 530 if self._division: 531 self._annotations["data_file_division"] = self._division 532 return self._annotations
533 - def __set_annotations(self, annotations): self._annotations = annotations
534 - def __del_annotations(self): del self._annotations
535 annotations = property(__get_annotations, __set_annotations, 536 __del_annotations, "Annotations")
537