1
2
3
4
5
6
7
8
9
10
11 """Load biopython objects into a BioSQL database for persistent storage.
12
13 This code makes it possible to store biopython objects in a relational
14 database and then retrieve them back. You shouldn't use any of the
15 classes in this module directly. Rather, call the load() method on
16 a database object.
17 """
18
19 from time import gmtime, strftime
20
21
22 from Bio import Alphabet
23 from Bio.SeqUtils.CheckSum import crc64
24 from Bio import Entrez
25
27 """Object used to load SeqRecord objects into a BioSQL database."""
29 """Initialize with connection information for the database.
30
31 Creating a DatabaseLoader object is normally handled via the
32 BioSeqDatabase DBServer object, for example:
33
34 from BioSQL import BioSeqDatabase
35 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
36 passwd = "biosql", host = "localhost", db="test_biosql")
37 try :
38 db = server["test"]
39 except KeyError :
40 db = server.new_database("test", description="For testing GBrowse")
41 """
42 self.adaptor = adaptor
43 self.dbid = dbid
44 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
45
47 """Load a Biopython SeqRecord into the database.
48 """
49 bioentry_id = self._load_bioentry_table(record)
50 self._load_bioentry_date(record, bioentry_id)
51 self._load_biosequence(record, bioentry_id)
52 self._load_comment(record, bioentry_id)
53 self._load_dbxrefs(record, bioentry_id)
54 references = record.annotations.get('references', ())
55 for reference, rank in zip(references, range(len(references))):
56 self._load_reference(reference, rank, bioentry_id)
57 self._load_annotations(record, bioentry_id)
58 for seq_feature_num in range(len(record.features)):
59 seq_feature = record.features[seq_feature_num]
60 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
61
63 """Returns the identifier for the named ontology (PRIVATE).
64
65 This looks through the onotology table for a the given entry name.
66 If it is not found, a row is added for this ontology (using the
67 definition if supplied). In either case, the id corresponding to
68 the provided name is returned, so that you can reference it in
69 another table.
70 """
71 oids = self.adaptor.execute_and_fetch_col0(
72 "SELECT ontology_id FROM ontology WHERE name = %s",
73 (name,))
74 if oids:
75 return oids[0]
76 self.adaptor.execute(
77 "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
78 (name, definition))
79 return self.adaptor.last_id("ontology")
80
81
82 - def _get_term_id(self,
83 name,
84 ontology_id=None,
85 definition=None,
86 identifier=None):
87 """Get the id that corresponds to a term (PRIVATE).
88
89 This looks through the term table for a the given term. If it
90 is not found, a new id corresponding to this term is created.
91 In either case, the id corresponding to that term is returned, so
92 that you can reference it in another table.
93
94 The ontology_id should be used to disambiguate the term.
95 """
96
97
98 sql = r"SELECT term_id FROM term " \
99 r"WHERE name = %s"
100 fields = [name]
101 if ontology_id:
102 sql += ' AND ontology_id = %s'
103 fields.append(ontology_id)
104 id_results = self.adaptor.execute_and_fetchall(sql, fields)
105
106 if len(id_results) > 1:
107 raise ValueError("Multiple term ids for %s: %r" %
108 (name, id_results))
109 elif len(id_results) == 1:
110 return id_results[0][0]
111 else:
112 sql = r"INSERT INTO term (name, definition," \
113 r" identifier, ontology_id)" \
114 r" VALUES (%s, %s, %s, %s)"
115 self.adaptor.execute(sql, (name, definition,
116 identifier, ontology_id))
117 return self.adaptor.last_id("term")
118
120 """Insert a dbxref and return its id."""
121
122 self.adaptor.execute(
123 "INSERT INTO dbxref(dbname, accession, version)" \
124 " VALUES (%s, %s, %s)", (dbname, accession, version))
125 return self.adaptor.last_id("dbxref")
126
128 """Get the taxon id for this record (PRIVATE).
129
130 record - a SeqRecord object
131
132 This searches the taxon/taxon_name tables using the
133 NCBI taxon ID, scientific name and common name to find
134 the matching taxon table entry's id.
135
136 If the species isn't in the taxon table, and we have at
137 least the NCBI taxon ID, scientific name or common name,
138 at least a minimal stub entry is created in the table.
139
140 Returns the taxon id (database key for the taxon table,
141 not an NCBI taxon ID), or None if the taxonomy information
142 is missing.
143
144 See also the BioSQL script load_ncbi_taxonomy.pl which
145 will populate and update the taxon/taxon_name tables
146 with the latest information from the NCBI.
147 """
148
149
150 ncbi_taxon_id = None
151 if "ncbi_taxid" in record.annotations :
152
153 if isinstance(record.annotations["ncbi_taxid"],list) :
154 if len(record.annotations["ncbi_taxid"])==1 :
155 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
156 else :
157 ncbi_taxon_id = record.annotations["ncbi_taxid"]
158 if not ncbi_taxon_id:
159
160 for f in record.features:
161 if f.type == 'source':
162 quals = getattr(f, 'qualifiers', {})
163 if "db_xref" in quals:
164 for db_xref in f.qualifiers["db_xref"]:
165 if db_xref.startswith("taxon:"):
166 ncbi_taxon_id = int(db_xref[6:])
167 break
168 if ncbi_taxon_id: break
169
170 try :
171 scientific_name = record.annotations["organism"][:255]
172 except KeyError :
173 scientific_name = None
174 try :
175 common_name = record.annotations["source"][:255]
176 except KeyError :
177 common_name = None
178
179
180
181
182
183 if ncbi_taxon_id:
184
185
186
187 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id,
188 scientific_name,
189 common_name)
190
191 if not common_name and not scientific_name :
192
193
194
195 return None
196
197
198
199 if scientific_name:
200 taxa = self.adaptor.execute_and_fetch_col0(
201 "SELECT taxon_id FROM taxon_name" \
202 " WHERE name_class = 'scientific name' AND name = %s",
203 (scientific_name,))
204 if taxa:
205
206 return taxa[0]
207
208
209 if common_name:
210 taxa = self.adaptor.execute_and_fetch_col0(
211 "SELECT DISTINCT taxon_id FROM taxon_name" \
212 " WHERE name = %s",
213 (common_name,))
214
215
216 if len(taxa) > 1:
217 raise ValueError("Taxa: %d species have name %r" % (
218 len(taxa),
219 common_name))
220 if taxa:
221
222 return taxa[0]
223
224
225
226
227
228
229
230
231
232
233
234
235
236 lineage = []
237 for c in record.annotations.get("taxonomy", []):
238 lineage.append([None, None, c])
239 if lineage:
240 lineage[-1][1] = "genus"
241 lineage.append([None, "species", record.annotations["organism"]])
242
243 if "subspecies" in record.annotations:
244 lineage.append([None, "subspecies",
245 record.annotations["subspecies"]])
246 if "variant" in record.annotations:
247 lineage.append([None, "varietas",
248 record.annotations["variant"]])
249 lineage[-1][0] = ncbi_taxon_id
250
251 left_value = self.adaptor.execute_one(
252 "SELECT MAX(left_value) FROM taxon")[0]
253 if not left_value:
254 left_value = 0
255 left_value += 1
256
257
258
259
260
261 right_start_value = self.adaptor.execute_one(
262 "SELECT MAX(right_value) FROM taxon")[0]
263 if not right_start_value:
264 right_start_value = 0
265 right_value = right_start_value + 2 * len(lineage) - 1
266
267 parent_taxon_id = None
268 for taxon in lineage:
269 self.adaptor.execute(
270 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
271 " left_value, right_value)" \
272 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
273 taxon[0],
274 taxon[1],
275 left_value,
276 right_value))
277 taxon_id = self.adaptor.last_id("taxon")
278 self.adaptor.execute(
279 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
280 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
281
282
283 left_value += 1
284 right_value -= 1
285 parent_taxon_id = taxon_id
286 if common_name:
287 self.adaptor.execute(
288 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
289 "VALUES (%s, %s, 'common name')", (
290 taxon_id, common_name))
291
292 return taxon_id
293
295 """Map Entrez name terms to those used in taxdump (PRIVATE).
296
297 We need to make this conversion to match the taxon_name.name_class
298 values used by the BioSQL load_ncbi_taxonomy.pl script.
299
300 e.g.
301 "ScientificName" -> "scientific name",
302 "EquivalentName" -> "equivalent name",
303 "Synonym" -> "synonym",
304 """
305
306
307
308
309
310
311
312
313
314 def add_space(letter) :
315 if letter.isupper() :
316 return " "+letter.lower()
317 else :
318 return letter
319 answer = "".join([add_space(letter) for letter in entrez_name]).strip()
320 assert answer == answer.lower()
321 return answer
322
326 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE).
327
328 ncbi_taxon_id - string containing an NCBI taxon id
329 scientific_name - string, used if a stub entry is recorded
330 common_name - string, used if a stub entry is recorded
331
332 This searches the taxon table using ONLY the NCBI taxon ID
333 to find the matching taxon table entry's ID (database key).
334
335 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
336 flag is true, Biopython will attempt to go online using Bio.Entrez
337 to fetch the official NCBI lineage, recursing up the tree until an
338 existing entry is found in the database or the full lineage has been
339 fetched.
340
341 Otherwise the NCBI taxon ID, scientific name and common name are
342 recorded as a minimal stub entry in the taxon and taxon_name tables.
343 Any partial information about the lineage from the SeqRecord is NOT
344 recorded. This should mean that (re)running the BioSQL script
345 load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
346
347 Returns the taxon id (database key for the taxon table, not
348 an NCBI taxon ID).
349 """
350 assert ncbi_taxon_id
351
352 taxon_id = self.adaptor.execute_and_fetch_col0(
353 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
354 (ncbi_taxon_id,))
355 if taxon_id:
356
357 return taxon_id[0]
358
359
360
361
362 parent_taxon_id = None
363 rank = "species"
364 genetic_code = None
365 mito_genetic_code = None
366 species_names = []
367 if scientific_name :
368 species_names.append(("scientific name", scientific_name))
369 if common_name :
370 species_names.append(("common name", common_name))
371
372 if self.fetch_NCBI_taxonomy :
373
374 handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML")
375 taxonomic_record = Entrez.read(handle)
376 if len(taxonomic_record) == 1:
377 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \
378 "%s versus %s" % (taxonomic_record[0]["TaxId"],
379 ncbi_taxon_id)
380 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \
381 taxonomic_record[0]["LineageEx"])
382 rank = taxonomic_record[0]["Rank"]
383 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"]
384 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"]
385 species_names = [("scientific name",
386 taxonomic_record[0]["ScientificName"])]
387 try :
388 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems():
389 name_class = self._fix_name_class(name_class)
390 if not isinstance(names, list) :
391
392
393 names = [names]
394 for name in names :
395
396 if isinstance(name, basestring) :
397 species_names.append((name_class, name))
398 except KeyError :
399
400
401 pass
402 else :
403 pass
404
405
406
407
408
409
410
411
412
413 self.adaptor.execute(
414 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
415 " genetic_code, mito_genetic_code, left_value, right_value)" \
416 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id,
417 ncbi_taxon_id,
418 rank,
419 genetic_code,
420 mito_genetic_code,
421 None,
422 None))
423 taxon_id = self.adaptor.last_id("taxon")
424
425
426 for name_class, name in species_names :
427 self.adaptor.execute(
428 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
429 " VALUES (%s, %s, %s)", (taxon_id,
430 name[:255],
431 name_class))
432 return taxon_id
433
435 """This is recursive! (PRIVATE).
436
437 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
438
439 First dictionary in list is the taxonomy root, highest would be the species.
440 Each dictionary includes:
441 - TaxID (string, NCBI taxon id)
442 - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
443 - ScientificName (string)
444 (and that is all at the time of writing)
445
446 This method will record all the lineage given, returning the the taxon id
447 (database key, not NCBI taxon id) of the final entry (the species).
448 """
449 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"]
450
451
452 taxon_id = self.adaptor.execute_and_fetch_col0(
453 "SELECT taxon_id FROM taxon" \
454 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id)
455 if taxon_id:
456
457
458 if isinstance(taxon_id, list) :
459 assert len(taxon_id)==1
460 return taxon_id[0]
461 else :
462 return taxon_id
463
464
465 if len(taxonomic_lineage) > 1 :
466
467 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
468 assert isinstance(parent_taxon_id, int) or isinstance(parent_taxon_id, long), repr(parent_taxon_id)
469 else :
470 parent_taxon_id = None
471
472
473 rank = taxonomic_lineage[-1].get("Rank", None)
474 self.adaptor.execute(
475 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\
476 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank))
477 taxon_id = self.adaptor.last_id("taxon")
478 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id)
479
480 scientific_name = taxonomic_lineage[-1].get("ScientificName", None)
481 if scientific_name :
482 self.adaptor.execute(
483 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
484 " VALUES (%s, %s, 'scientific name')", (taxon_id,
485 scientific_name[:255]))
486 return taxon_id
487
488
489 - def _load_bioentry_table(self, record):
490 """Fill the bioentry table with sequence information (PRIVATE).
491
492 record - SeqRecord object to add to the database.
493 """
494
495
496 if record.id.count(".") == 1:
497
498 accession, version = record.id.split('.')
499 try :
500 version = int(version)
501 except ValueError :
502 accession = record.id
503 version = 0
504 else:
505 accession = record.id
506 version = 0
507
508 if "accessions" in record.annotations \
509 and isinstance(record.annotations["accessions"],list) \
510 and record.annotations["accessions"] :
511
512 accession = record.annotations["accessions"][0]
513
514
515
516
517 taxon_id = self._get_taxon_id(record)
518
519 if "gi" in record.annotations :
520 identifier = record.annotations["gi"]
521 else :
522 identifier = record.id
523
524
525 description = getattr(record, 'description', None)
526 division = record.annotations.get("data_file_division", None)
527
528 sql = """
529 INSERT INTO bioentry (
530 biodatabase_id,
531 taxon_id,
532 name,
533 accession,
534 identifier,
535 division,
536 description,
537 version)
538 VALUES (
539 %s,
540 %s,
541 %s,
542 %s,
543 %s,
544 %s,
545 %s,
546 %s)"""
547
548
549 self.adaptor.execute(sql, (self.dbid,
550 taxon_id,
551 record.name,
552 accession,
553 identifier,
554 division,
555 description,
556 version))
557
558 bioentry_id = self.adaptor.last_id('bioentry')
559
560 return bioentry_id
561
562 - def _load_bioentry_date(self, record, bioentry_id):
563 """Add the effective date of the entry into the database.
564
565 record - a SeqRecord object with an annotated date
566 bioentry_id - corresponding database identifier
567 """
568
569
570 date = record.annotations.get("date",
571 strftime("%d-%b-%Y", gmtime()).upper())
572 annotation_tags_id = self._get_ontology_id("Annotation Tags")
573 date_id = self._get_term_id("date_changed", annotation_tags_id)
574 sql = r"INSERT INTO bioentry_qualifier_value" \
575 r" (bioentry_id, term_id, value, rank)" \
576 r" VALUES (%s, %s, %s, 1)"
577 self.adaptor.execute(sql, (bioentry_id, date_id, date))
578
580 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE).
581
582 record - a SeqRecord object with a seq property
583 bioentry_id - corresponding database identifier
584 """
585
586 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
587 alphabet = "dna"
588 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
589 alphabet = "rna"
590 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
591 alphabet = "protein"
592 else:
593 alphabet = "unknown"
594
595 sql = r"INSERT INTO biosequence (bioentry_id, version, " \
596 r"length, seq, alphabet) " \
597 r"VALUES (%s, 0, %s, %s, %s)"
598 self.adaptor.execute(sql, (bioentry_id,
599 len(record.seq.data),
600 record.seq.data,
601 alphabet))
602
618
620 """Record a SeqRecord's misc annotations in the database (PRIVATE).
621
622 The annotation strings are recorded in the bioentry_qualifier_value
623 table, except for special cases like the reference, comment and
624 taxonomy which are handled with their own tables.
625
626 record - a SeqRecord object with an annotations dictionary
627 bioentry_id - corresponding database identifier
628 """
629 mono_sql = "INSERT INTO bioentry_qualifier_value" \
630 "(bioentry_id, term_id, value)" \
631 " VALUES (%s, %s, %s)"
632 many_sql = "INSERT INTO bioentry_qualifier_value" \
633 "(bioentry_id, term_id, value, rank)" \
634 " VALUES (%s, %s, %s, %s)"
635 tag_ontology_id = self._get_ontology_id('Annotation Tags')
636 for key, value in record.annotations.iteritems() :
637 if key in ["references", "comment", "ncbi_taxid"] :
638
639 continue
640 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
641 if isinstance(value, list) :
642 rank = 0
643 for entry in value :
644 if isinstance(entry, str) or isinstance(entry, int):
645
646 rank += 1
647 self.adaptor.execute(many_sql, \
648 (bioentry_id, term_id, str(entry), rank))
649 else :
650 pass
651
652
653 elif isinstance(value, str) or isinstance(value, int):
654
655 self.adaptor.execute(mono_sql, \
656 (bioentry_id, term_id, str(value)))
657 else :
658 pass
659
660
661
662
664 """Record a SeqRecord's annotated references in the database (PRIVATE).
665
666 record - a SeqRecord object with annotated references
667 bioentry_id - corresponding database identifier
668 """
669
670 refs = None
671 if reference.medline_id:
672 refs = self.adaptor.execute_and_fetch_col0(
673 "SELECT reference_id" \
674 " FROM reference JOIN dbxref USING (dbxref_id)" \
675 " WHERE dbname = 'MEDLINE' AND accession = %s",
676 (reference.medline_id,))
677 if not refs and reference.pubmed_id:
678 refs = self.adaptor.execute_and_fetch_col0(
679 "SELECT reference_id" \
680 " FROM reference JOIN dbxref USING (dbxref_id)" \
681 " WHERE dbname = 'PUBMED' AND accession = %s",
682 (reference.pubmed_id,))
683 if not refs:
684 s = []
685 for f in reference.authors, reference.title, reference.journal:
686 s.append(f or "<undef>")
687 crc = crc64("".join(s))
688 refs = self.adaptor.execute_and_fetch_col0(
689 "SELECT reference_id FROM reference" \
690 r" WHERE crc = %s", (crc,))
691 if not refs:
692 if reference.medline_id:
693 dbxref_id = self._add_dbxref("MEDLINE",
694 reference.medline_id, 0)
695 elif reference.pubmed_id:
696 dbxref_id = self._add_dbxref("PUBMED",
697 reference.pubmed_id, 0)
698 else:
699 dbxref_id = None
700 authors = reference.authors or None
701 title = reference.title or None
702
703
704 journal = reference.journal or ""
705 self.adaptor.execute(
706 "INSERT INTO reference (dbxref_id, location," \
707 " title, authors, crc)" \
708 " VALUES (%s, %s, %s, %s, %s)",
709 (dbxref_id, journal, title,
710 authors, crc))
711 reference_id = self.adaptor.last_id("reference")
712 else:
713 reference_id = refs[0]
714
715 if reference.location:
716 start = 1 + int(str(reference.location[0].start))
717 end = int(str(reference.location[0].end))
718 else:
719 start = None
720 end = None
721
722 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
723 " start_pos, end_pos, rank)" \
724 " VALUES (%s, %s, %s, %s, %s)"
725 self.adaptor.execute(sql, (bioentry_id, reference_id,
726 start, end, rank + 1))
727
735
737 """Load the first tables of a seqfeature and returns the id (PRIVATE).
738
739 This loads the "key" of the seqfeature (ie. CDS, gene) and
740 the basic seqfeature table itself.
741 """
742 ontology_id = self._get_ontology_id('SeqFeature Keys')
743 seqfeature_key_id = self._get_term_id(feature_type,
744 ontology_id = ontology_id)
745
746
747 source_cat_id = self._get_ontology_id('SeqFeature Sources')
748 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
749 ontology_id = source_cat_id)
750
751 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
752 r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
753 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
754 source_term_id, feature_rank + 1))
755 seqfeature_id = self.adaptor.last_id('seqfeature')
756
757 return seqfeature_id
758
760 """Load all of the locations for a SeqFeature into tables (PRIVATE).
761
762 This adds the locations related to the SeqFeature into the
763 seqfeature_location table. Fuzzies are not handled right now.
764 For a simple location, ie (1..2), we have a single table row
765 with seq_start = 1, seq_end = 2, location_rank = 1.
766
767 For split locations, ie (1..2, 3..4, 5..6) we would have three
768 row tables with:
769 start = 1, end = 2, rank = 1
770 start = 3, end = 4, rank = 2
771 start = 5, end = 6, rank = 3
772 """
773
774
775
776
777 if not feature.sub_features:
778 self._insert_seqfeature_location(feature, 1, seqfeature_id)
779 else:
780 for rank, cur_feature in enumerate(feature.sub_features):
781 self._insert_seqfeature_location(cur_feature,
782 rank + 1,
783 seqfeature_id)
784
786 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).
787
788 TODO - Add location_operators to location_qualifier_value.
789 """
790
791
792
793 start = feature.location.nofuzzy_start + 1
794 end = feature.location.nofuzzy_end
795
796
797
798
799 strand = feature.strand or 0
800
801
802
803 loc_term_id = None
804
805 if feature.ref:
806
807
808
809 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref)
810 else :
811 dbxref_id = None
812
813 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \
814 r"start_pos, end_pos, strand, rank) " \
815 r"VALUES (%s, %s, %s, %s, %s, %s, %s)"
816 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id,
817 start, end, strand, rank))
818
819 """
820 # See Bug 2677
821 # TODO - Record the location_operator (e.g. "join" or "order")
822 # using the location_qualifier_value table (which we and BioPerl
823 # have historically left empty).
824 # Note this will need an ontology term for the location qualifer
825 # (location_qualifier_value.term_id) for which oddly the schema
826 # does not allow NULL.
827 if feature.location_operator:
828 #e.g. "join" (common),
829 #or "order" (see Tests/GenBank/protein_refseq2.gb)
830 location_id = self.adaptor.last_id('location')
831 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
832 sql = r"INSERT INTO location_qualifier_value" \
833 r"(location_id, term_id, value)" \
834 r"VALUES (%s, %s, %s)"
835 self.adaptor.execute(sql, (location_id, loc_qual_term_id,
836 feature.location_operator))
837 """
838
840 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).
841
842 Qualifiers should be a dictionary of the form:
843 {key : [value1, value2]}
844 """
845 tag_ontology_id = self._get_ontology_id('Annotation Tags')
846 for qualifier_key in qualifiers.keys():
847
848
849
850
851 if qualifier_key != 'db_xref':
852 qualifier_key_id = self._get_term_id(qualifier_key,
853 ontology_id=tag_ontology_id)
854
855 entries = qualifiers[qualifier_key]
856 if not isinstance(entries, list) :
857
858
859 entries = [entries]
860 for qual_value_rank in range(len(entries)):
861 qualifier_value = entries[qual_value_rank]
862 sql = r"INSERT INTO seqfeature_qualifier_value "\
863 r" (seqfeature_id, term_id, rank, value) VALUES"\
864 r" (%s, %s, %s, %s)"
865 self.adaptor.execute(sql, (seqfeature_id,
866 qualifier_key_id,
867 qual_value_rank + 1,
868 qualifier_value))
869 else:
870
871
872
873
874 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
875 seqfeature_id)
876
877
879 """Add database crossreferences of a SeqFeature to the database (PRIVATE).
880
881 o dbxrefs List, dbxref data from the source file in the
882 format <database>:<accession>
883
884 o seqfeature_id Int, the identifier for the seqfeature in the
885 seqfeature table
886
887 Insert dbxref qualifier data for a seqfeature into the
888 seqfeature_dbxref and, if required, dbxref tables.
889 The dbxref_id qualifier/value sets go into the dbxref table
890 as dbname, accession, version tuples, with dbxref.dbxref_id
891 being automatically assigned, and into the seqfeature_dbxref
892 table as seqfeature_id, dbxref_id, and rank tuples
893 """
894
895
896
897
898 for rank, value in enumerate(dbxrefs):
899
900
901 try:
902 dbxref_data = value.replace(' ','').replace('\n','').split(':')
903 db = dbxref_data[0]
904 accessions = dbxref_data[1:]
905 except:
906 raise ValueError("Parsing of db_xref failed: '%s'" % value)
907
908
909 for accession in accessions:
910
911 dbxref_id = self._get_dbxref_id(db, accession)
912
913 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
914
916 """ _get_dbxref_id(self, db, accession) -> Int
917
918 o db String, the name of the external database containing
919 the accession number
920
921 o accession String, the accession of the dbxref data
922
923 Finds and returns the dbxref_id for the passed data. The method
924 attempts to find an existing record first, and inserts the data
925 if there is no record.
926 """
927
928 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
929 r'AND accession = %s'
930 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
931
932
933 if dbxref_id:
934 return dbxref_id[0]
935 return self._add_dbxref(db, accession, 0)
936
938 """ Check for a pre-existing seqfeature_dbxref entry with the passed
939 seqfeature_id and dbxref_id. If one does not exist, insert new
940 data
941
942 """
943
944 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
945 r"WHERE seqfeature_id = '%s' AND dbxref_id = '%s'"
946 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
947 dbxref_id))
948
949
950 if result:
951 return result
952 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
953
955 """ Insert a seqfeature_dbxref row and return the seqfeature_id and
956 dbxref_id
957 """
958 sql = r'INSERT INTO seqfeature_dbxref ' \
959 '(seqfeature_id, dbxref_id, rank) VALUES' \
960 r'(%s, %s, %s)'
961 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
962 return (seqfeature_id, dbxref_id)
963
987
988 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
989 """ Check for a pre-existing bioentry_dbxref entry with the passed
990 seqfeature_id and dbxref_id. If one does not exist, insert new
991 data
992
993 """
994
995 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
996 r"WHERE bioentry_id = '%s' AND dbxref_id = '%s'"
997 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
998 dbxref_id))
999
1000
1001 if result:
1002 return result
1003 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1004
1005 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1006 """ Insert a bioentry_dbxref row and return the seqfeature_id and
1007 dbxref_id
1008 """
1009 sql = r'INSERT INTO bioentry_dbxref ' \
1010 '(bioentry_id,dbxref_id,rank) VALUES ' \
1011 '(%s, %s, %s)'
1012 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
1013 return (bioentry_id, dbxref_id)
1014
1016 """Complement the Loader functionality by fully removing a database.
1017
1018 This probably isn't really useful for normal purposes, since you
1019 can just do a:
1020 DROP DATABASE db_name
1021 and then recreate the database. But, it's really useful for testing
1022 purposes.
1023
1024 YB: now use the cascaded deletions
1025 """
1027 """Initialize with a database id and adaptor connection.
1028 """
1029 self.adaptor = adaptor
1030 self.dbid = dbid
1031
1033 """Remove everything related to the given database id.
1034 """
1035 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
1036 self.adaptor.execute(sql, (self.dbid,))
1037 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
1038 self.adaptor.execute(sql, (self.dbid,))
1039