1
2
3
4
5
6
7
8
9
10
11 """Load biopython objects into a BioSQL database for persistent storage.
12
13 This code makes it possible to store biopython objects in a relational
14 database and then retrieve them back. You shouldn't use any of the
15 classes in this module directly. Rather, call the load() method on
16 a database object.
17 """
18
19 from time import gmtime, strftime
20
21
22 from Bio import Alphabet
23 from Bio.SeqUtils.CheckSum import crc64
24 from Bio import Entrez
25 from Bio.Seq import UnknownSeq
26
28 """Object used to load SeqRecord objects into a BioSQL database."""
29 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
30 """Initialize with connection information for the database.
31
32 Creating a DatabaseLoader object is normally handled via the
33 BioSeqDatabase DBServer object, for example:
34
35 from BioSQL import BioSeqDatabase
36 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
37 passwd = "biosql", host = "localhost", db="test_biosql")
38 try:
39 db = server["test"]
40 except KeyError:
41 db = server.new_database("test", description="For testing GBrowse")
42 """
43 self.adaptor = adaptor
44 self.dbid = dbid
45 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
46
48 """Load a Biopython SeqRecord into the database.
49 """
50 bioentry_id = self._load_bioentry_table(record)
51 self._load_bioentry_date(record, bioentry_id)
52 self._load_biosequence(record, bioentry_id)
53 self._load_comment(record, bioentry_id)
54 self._load_dbxrefs(record, bioentry_id)
55 references = record.annotations.get('references', ())
56 for reference, rank in zip(references, range(len(references))):
57 self._load_reference(reference, rank, bioentry_id)
58 self._load_annotations(record, bioentry_id)
59 for seq_feature_num in range(len(record.features)):
60 seq_feature = record.features[seq_feature_num]
61 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
62
64 """Returns the identifier for the named ontology (PRIVATE).
65
66 This looks through the onotology table for a the given entry name.
67 If it is not found, a row is added for this ontology (using the
68 definition if supplied). In either case, the id corresponding to
69 the provided name is returned, so that you can reference it in
70 another table.
71 """
72 oids = self.adaptor.execute_and_fetch_col0(
73 "SELECT ontology_id FROM ontology WHERE name = %s",
74 (name,))
75 if oids:
76 return oids[0]
77 self.adaptor.execute(
78 "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
79 (name, definition))
80 return self.adaptor.last_id("ontology")
81
82
83 - def _get_term_id(self,
84 name,
85 ontology_id=None,
86 definition=None,
87 identifier=None):
88 """Get the id that corresponds to a term (PRIVATE).
89
90 This looks through the term table for a the given term. If it
91 is not found, a new id corresponding to this term is created.
92 In either case, the id corresponding to that term is returned, so
93 that you can reference it in another table.
94
95 The ontology_id should be used to disambiguate the term.
96 """
97
98
99 sql = r"SELECT term_id FROM term " \
100 r"WHERE name = %s"
101 fields = [name]
102 if ontology_id:
103 sql += ' AND ontology_id = %s'
104 fields.append(ontology_id)
105 id_results = self.adaptor.execute_and_fetchall(sql, fields)
106
107 if len(id_results) > 1:
108 raise ValueError("Multiple term ids for %s: %r" %
109 (name, id_results))
110 elif len(id_results) == 1:
111 return id_results[0][0]
112 else:
113 sql = r"INSERT INTO term (name, definition," \
114 r" identifier, ontology_id)" \
115 r" VALUES (%s, %s, %s, %s)"
116 self.adaptor.execute(sql, (name, definition,
117 identifier, ontology_id))
118 return self.adaptor.last_id("term")
119
121 """Insert a dbxref and return its id."""
122
123 self.adaptor.execute(
124 "INSERT INTO dbxref(dbname, accession, version)" \
125 " VALUES (%s, %s, %s)", (dbname, accession, version))
126 return self.adaptor.last_id("dbxref")
127
129 """Get the taxon id for this record (PRIVATE).
130
131 record - a SeqRecord object
132
133 This searches the taxon/taxon_name tables using the
134 NCBI taxon ID, scientific name and common name to find
135 the matching taxon table entry's id.
136
137 If the species isn't in the taxon table, and we have at
138 least the NCBI taxon ID, scientific name or common name,
139 at least a minimal stub entry is created in the table.
140
141 Returns the taxon id (database key for the taxon table,
142 not an NCBI taxon ID), or None if the taxonomy information
143 is missing.
144
145 See also the BioSQL script load_ncbi_taxonomy.pl which
146 will populate and update the taxon/taxon_name tables
147 with the latest information from the NCBI.
148 """
149
150
151 ncbi_taxon_id = None
152 if "ncbi_taxid" in record.annotations:
153
154 if isinstance(record.annotations["ncbi_taxid"],list):
155 if len(record.annotations["ncbi_taxid"])==1:
156 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
157 else:
158 ncbi_taxon_id = record.annotations["ncbi_taxid"]
159 if not ncbi_taxon_id:
160
161 for f in record.features:
162 if f.type == 'source':
163 quals = getattr(f, 'qualifiers', {})
164 if "db_xref" in quals:
165 for db_xref in f.qualifiers["db_xref"]:
166 if db_xref.startswith("taxon:"):
167 ncbi_taxon_id = int(db_xref[6:])
168 break
169 if ncbi_taxon_id: break
170
171 try:
172 scientific_name = record.annotations["organism"][:255]
173 except KeyError:
174 scientific_name = None
175 try:
176 common_name = record.annotations["source"][:255]
177 except KeyError:
178 common_name = None
179
180
181
182
183
184 if ncbi_taxon_id:
185
186
187
188 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id,
189 scientific_name,
190 common_name)
191
192 if not common_name and not scientific_name:
193
194
195
196 return None
197
198
199
200 if scientific_name:
201 taxa = self.adaptor.execute_and_fetch_col0(
202 "SELECT taxon_id FROM taxon_name" \
203 " WHERE name_class = 'scientific name' AND name = %s",
204 (scientific_name,))
205 if taxa:
206
207 return taxa[0]
208
209
210 if common_name:
211 taxa = self.adaptor.execute_and_fetch_col0(
212 "SELECT DISTINCT taxon_id FROM taxon_name" \
213 " WHERE name = %s",
214 (common_name,))
215
216
217 if len(taxa) > 1:
218 raise ValueError("Taxa: %d species have name %r" % (
219 len(taxa),
220 common_name))
221 if taxa:
222
223 return taxa[0]
224
225
226
227
228
229
230
231
232
233
234
235
236
237 lineage = []
238 for c in record.annotations.get("taxonomy", []):
239 lineage.append([None, None, c])
240 if lineage:
241 lineage[-1][1] = "genus"
242 lineage.append([None, "species", record.annotations["organism"]])
243
244 if "subspecies" in record.annotations:
245 lineage.append([None, "subspecies",
246 record.annotations["subspecies"]])
247 if "variant" in record.annotations:
248 lineage.append([None, "varietas",
249 record.annotations["variant"]])
250 lineage[-1][0] = ncbi_taxon_id
251
252 left_value = self.adaptor.execute_one(
253 "SELECT MAX(left_value) FROM taxon")[0]
254 if not left_value:
255 left_value = 0
256 left_value += 1
257
258
259
260
261
262 right_start_value = self.adaptor.execute_one(
263 "SELECT MAX(right_value) FROM taxon")[0]
264 if not right_start_value:
265 right_start_value = 0
266 right_value = right_start_value + 2 * len(lineage) - 1
267
268 parent_taxon_id = None
269 for taxon in lineage:
270 self.adaptor.execute(
271 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
272 " left_value, right_value)" \
273 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
274 taxon[0],
275 taxon[1],
276 left_value,
277 right_value))
278 taxon_id = self.adaptor.last_id("taxon")
279 self.adaptor.execute(
280 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
281 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
282
283
284 left_value += 1
285 right_value -= 1
286 parent_taxon_id = taxon_id
287 if common_name:
288 self.adaptor.execute(
289 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
290 "VALUES (%s, %s, 'common name')", (
291 taxon_id, common_name))
292
293 return taxon_id
294
296 """Map Entrez name terms to those used in taxdump (PRIVATE).
297
298 We need to make this conversion to match the taxon_name.name_class
299 values used by the BioSQL load_ncbi_taxonomy.pl script.
300
301 e.g.
302 "ScientificName" -> "scientific name",
303 "EquivalentName" -> "equivalent name",
304 "Synonym" -> "synonym",
305 """
306
307
308
309
310
311
312
313
314
315 def add_space(letter):
316 if letter.isupper():
317 return " "+letter.lower()
318 else:
319 return letter
320 answer = "".join([add_space(letter) for letter in entrez_name]).strip()
321 assert answer == answer.lower()
322 return answer
323
327 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE).
328
329 ncbi_taxon_id - string containing an NCBI taxon id
330 scientific_name - string, used if a stub entry is recorded
331 common_name - string, used if a stub entry is recorded
332
333 This searches the taxon table using ONLY the NCBI taxon ID
334 to find the matching taxon table entry's ID (database key).
335
336 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
337 flag is true, Biopython will attempt to go online using Bio.Entrez
338 to fetch the official NCBI lineage, recursing up the tree until an
339 existing entry is found in the database or the full lineage has been
340 fetched.
341
342 Otherwise the NCBI taxon ID, scientific name and common name are
343 recorded as a minimal stub entry in the taxon and taxon_name tables.
344 Any partial information about the lineage from the SeqRecord is NOT
345 recorded. This should mean that (re)running the BioSQL script
346 load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
347
348 Returns the taxon id (database key for the taxon table, not
349 an NCBI taxon ID).
350 """
351 assert ncbi_taxon_id
352
353 taxon_id = self.adaptor.execute_and_fetch_col0(
354 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
355 (ncbi_taxon_id,))
356 if taxon_id:
357
358 return taxon_id[0]
359
360
361
362
363 parent_taxon_id = None
364 rank = "species"
365 genetic_code = None
366 mito_genetic_code = None
367 species_names = []
368 if scientific_name:
369 species_names.append(("scientific name", scientific_name))
370 if common_name:
371 species_names.append(("common name", common_name))
372
373 if self.fetch_NCBI_taxonomy:
374
375 handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML")
376 taxonomic_record = Entrez.read(handle)
377 if len(taxonomic_record) == 1:
378 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \
379 "%s versus %s" % (taxonomic_record[0]["TaxId"],
380 ncbi_taxon_id)
381 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \
382 taxonomic_record[0]["LineageEx"])
383 rank = taxonomic_record[0]["Rank"]
384 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"]
385 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"]
386 species_names = [("scientific name",
387 taxonomic_record[0]["ScientificName"])]
388 try:
389 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems():
390 name_class = self._fix_name_class(name_class)
391 if not isinstance(names, list):
392
393
394 names = [names]
395 for name in names:
396
397 if isinstance(name, basestring):
398 species_names.append((name_class, name))
399 except KeyError:
400
401
402 pass
403 else:
404 pass
405
406
407
408
409
410
411
412
413
414 self.adaptor.execute(
415 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
416 " genetic_code, mito_genetic_code, left_value, right_value)" \
417 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id,
418 ncbi_taxon_id,
419 rank,
420 genetic_code,
421 mito_genetic_code,
422 None,
423 None))
424 taxon_id = self.adaptor.last_id("taxon")
425
426
427 for name_class, name in species_names:
428 self.adaptor.execute(
429 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
430 " VALUES (%s, %s, %s)", (taxon_id,
431 name[:255],
432 name_class))
433 return taxon_id
434
436 """This is recursive! (PRIVATE).
437
438 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
439
440 First dictionary in list is the taxonomy root, highest would be the species.
441 Each dictionary includes:
442 - TaxID (string, NCBI taxon id)
443 - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
444 - ScientificName (string)
445 (and that is all at the time of writing)
446
447 This method will record all the lineage given, returning the the taxon id
448 (database key, not NCBI taxon id) of the final entry (the species).
449 """
450 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"]
451
452
453 taxon_id = self.adaptor.execute_and_fetch_col0(
454 "SELECT taxon_id FROM taxon" \
455 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id)
456 if taxon_id:
457
458
459 if isinstance(taxon_id, list):
460 assert len(taxon_id)==1
461 return taxon_id[0]
462 else:
463 return taxon_id
464
465
466 if len(taxonomic_lineage) > 1:
467
468 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
469 assert isinstance(parent_taxon_id, int) or isinstance(parent_taxon_id, long), repr(parent_taxon_id)
470 else:
471 parent_taxon_id = None
472
473
474 rank = taxonomic_lineage[-1].get("Rank", None)
475 self.adaptor.execute(
476 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\
477 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank))
478 taxon_id = self.adaptor.last_id("taxon")
479 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id)
480
481 scientific_name = taxonomic_lineage[-1].get("ScientificName", None)
482 if scientific_name:
483 self.adaptor.execute(
484 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
485 " VALUES (%s, %s, 'scientific name')", (taxon_id,
486 scientific_name[:255]))
487 return taxon_id
488
489
490 - def _load_bioentry_table(self, record):
491 """Fill the bioentry table with sequence information (PRIVATE).
492
493 record - SeqRecord object to add to the database.
494 """
495
496
497 if record.id.count(".") == 1:
498
499 accession, version = record.id.split('.')
500 try:
501 version = int(version)
502 except ValueError:
503 accession = record.id
504 version = 0
505 else:
506 accession = record.id
507 version = 0
508
509 if "accessions" in record.annotations \
510 and isinstance(record.annotations["accessions"],list) \
511 and record.annotations["accessions"]:
512
513 accession = record.annotations["accessions"][0]
514
515
516
517
518 taxon_id = self._get_taxon_id(record)
519
520 if "gi" in record.annotations:
521 identifier = record.annotations["gi"]
522 else:
523 identifier = record.id
524
525
526 description = getattr(record, 'description', None)
527 division = record.annotations.get("data_file_division", None)
528
529 sql = """
530 INSERT INTO bioentry (
531 biodatabase_id,
532 taxon_id,
533 name,
534 accession,
535 identifier,
536 division,
537 description,
538 version)
539 VALUES (
540 %s,
541 %s,
542 %s,
543 %s,
544 %s,
545 %s,
546 %s,
547 %s)"""
548
549
550 self.adaptor.execute(sql, (self.dbid,
551 taxon_id,
552 record.name,
553 accession,
554 identifier,
555 division,
556 description,
557 version))
558
559 bioentry_id = self.adaptor.last_id('bioentry')
560
561 return bioentry_id
562
563 - def _load_bioentry_date(self, record, bioentry_id):
564 """Add the effective date of the entry into the database.
565
566 record - a SeqRecord object with an annotated date
567 bioentry_id - corresponding database identifier
568 """
569
570
571 date = record.annotations.get("date",
572 strftime("%d-%b-%Y", gmtime()).upper())
573 if isinstance(date, list) : date = date[0]
574 annotation_tags_id = self._get_ontology_id("Annotation Tags")
575 date_id = self._get_term_id("date_changed", annotation_tags_id)
576 sql = r"INSERT INTO bioentry_qualifier_value" \
577 r" (bioentry_id, term_id, value, rank)" \
578 r" VALUES (%s, %s, %s, 1)"
579 self.adaptor.execute(sql, (bioentry_id, date_id, date))
580
582 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE).
583
584 record - a SeqRecord object with a seq property
585 bioentry_id - corresponding database identifier
586 """
587 if record.seq is None:
588
589
590 return
591
592
593 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
594 alphabet = "dna"
595 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
596 alphabet = "rna"
597 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
598 alphabet = "protein"
599 else:
600 alphabet = "unknown"
601
602 if isinstance(record.seq, UnknownSeq):
603 seq_str = None
604 else:
605 seq_str = str(record.seq)
606
607 sql = r"INSERT INTO biosequence (bioentry_id, version, " \
608 r"length, seq, alphabet) " \
609 r"VALUES (%s, 0, %s, %s, %s)"
610 self.adaptor.execute(sql, (bioentry_id,
611 len(record.seq),
612 seq_str,
613 alphabet))
614
635
637 """Record a SeqRecord's misc annotations in the database (PRIVATE).
638
639 The annotation strings are recorded in the bioentry_qualifier_value
640 table, except for special cases like the reference, comment and
641 taxonomy which are handled with their own tables.
642
643 record - a SeqRecord object with an annotations dictionary
644 bioentry_id - corresponding database identifier
645 """
646 mono_sql = "INSERT INTO bioentry_qualifier_value" \
647 "(bioentry_id, term_id, value)" \
648 " VALUES (%s, %s, %s)"
649 many_sql = "INSERT INTO bioentry_qualifier_value" \
650 "(bioentry_id, term_id, value, rank)" \
651 " VALUES (%s, %s, %s, %s)"
652 tag_ontology_id = self._get_ontology_id('Annotation Tags')
653 for key, value in record.annotations.iteritems():
654 if key in ["references", "comment", "ncbi_taxid"]:
655
656 continue
657 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
658 if isinstance(value, list) or isinstance(value, tuple):
659 rank = 0
660 for entry in value:
661 if isinstance(entry, str) or isinstance(entry, int):
662
663 rank += 1
664 self.adaptor.execute(many_sql, \
665 (bioentry_id, term_id, str(entry), rank))
666 else:
667 pass
668
669
670 elif isinstance(value, str) or isinstance(value, int):
671
672 self.adaptor.execute(mono_sql, \
673 (bioentry_id, term_id, str(value)))
674 else:
675 pass
676
677
678
679
681 """Record a SeqRecord's annotated references in the database (PRIVATE).
682
683 record - a SeqRecord object with annotated references
684 bioentry_id - corresponding database identifier
685 """
686
687 refs = None
688 if reference.medline_id:
689 refs = self.adaptor.execute_and_fetch_col0(
690 "SELECT reference_id" \
691 " FROM reference JOIN dbxref USING (dbxref_id)" \
692 " WHERE dbname = 'MEDLINE' AND accession = %s",
693 (reference.medline_id,))
694 if not refs and reference.pubmed_id:
695 refs = self.adaptor.execute_and_fetch_col0(
696 "SELECT reference_id" \
697 " FROM reference JOIN dbxref USING (dbxref_id)" \
698 " WHERE dbname = 'PUBMED' AND accession = %s",
699 (reference.pubmed_id,))
700 if not refs:
701 s = []
702 for f in reference.authors, reference.title, reference.journal:
703 s.append(f or "<undef>")
704 crc = crc64("".join(s))
705 refs = self.adaptor.execute_and_fetch_col0(
706 "SELECT reference_id FROM reference" \
707 r" WHERE crc = %s", (crc,))
708 if not refs:
709 if reference.medline_id:
710 dbxref_id = self._add_dbxref("MEDLINE",
711 reference.medline_id, 0)
712 elif reference.pubmed_id:
713 dbxref_id = self._add_dbxref("PUBMED",
714 reference.pubmed_id, 0)
715 else:
716 dbxref_id = None
717 authors = reference.authors or None
718 title = reference.title or None
719
720
721 journal = reference.journal or ""
722 self.adaptor.execute(
723 "INSERT INTO reference (dbxref_id, location," \
724 " title, authors, crc)" \
725 " VALUES (%s, %s, %s, %s, %s)",
726 (dbxref_id, journal, title,
727 authors, crc))
728 reference_id = self.adaptor.last_id("reference")
729 else:
730 reference_id = refs[0]
731
732 if reference.location:
733 start = 1 + int(str(reference.location[0].start))
734 end = int(str(reference.location[0].end))
735 else:
736 start = None
737 end = None
738
739 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
740 " start_pos, end_pos, rank)" \
741 " VALUES (%s, %s, %s, %s, %s)"
742 self.adaptor.execute(sql, (bioentry_id, reference_id,
743 start, end, rank + 1))
744
752
754 """Load the first tables of a seqfeature and returns the id (PRIVATE).
755
756 This loads the "key" of the seqfeature (ie. CDS, gene) and
757 the basic seqfeature table itself.
758 """
759 ontology_id = self._get_ontology_id('SeqFeature Keys')
760 seqfeature_key_id = self._get_term_id(feature_type,
761 ontology_id = ontology_id)
762
763
764 source_cat_id = self._get_ontology_id('SeqFeature Sources')
765 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
766 ontology_id = source_cat_id)
767
768 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
769 r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
770 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
771 source_term_id, feature_rank + 1))
772 seqfeature_id = self.adaptor.last_id('seqfeature')
773
774 return seqfeature_id
775
777 """Load all of the locations for a SeqFeature into tables (PRIVATE).
778
779 This adds the locations related to the SeqFeature into the
780 seqfeature_location table. Fuzzies are not handled right now.
781 For a simple location, ie (1..2), we have a single table row
782 with seq_start = 1, seq_end = 2, location_rank = 1.
783
784 For split locations, ie (1..2, 3..4, 5..6) we would have three
785 row tables with:
786 start = 1, end = 2, rank = 1
787 start = 3, end = 4, rank = 2
788 start = 5, end = 6, rank = 3
789 """
790
791
792 if feature.location_operator and feature.location_operator != "join":
793
794
795 import warnings
796 warnings.warn("%s location operators are not fully supported" \
797 % feature.location_operator)
798
799
800 if not feature.sub_features:
801 self._insert_seqfeature_location(feature, 1, seqfeature_id)
802 else:
803 for rank, cur_feature in enumerate(feature.sub_features):
804 self._insert_seqfeature_location(cur_feature,
805 rank + 1,
806 seqfeature_id)
807
809 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).
810
811 TODO - Add location_operators to location_qualifier_value.
812 """
813
814
815
816 start = feature.location.nofuzzy_start + 1
817 end = feature.location.nofuzzy_end
818
819
820
821
822 strand = feature.strand or 0
823
824
825
826
827
828 loc_term_id = None
829
830 if feature.ref:
831
832
833
834 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref)
835 else:
836 dbxref_id = None
837
838 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \
839 r"start_pos, end_pos, strand, rank) " \
840 r"VALUES (%s, %s, %s, %s, %s, %s, %s)"
841 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id,
842 start, end, strand, rank))
843
844 """
845 # See Bug 2677
846 # TODO - Record the location_operator (e.g. "join" or "order")
847 # using the location_qualifier_value table (which we and BioPerl
848 # have historically left empty).
849 # Note this will need an ontology term for the location qualifer
850 # (location_qualifier_value.term_id) for which oddly the schema
851 # does not allow NULL.
852 if feature.location_operator:
853 #e.g. "join" (common),
854 #or "order" (see Tests/GenBank/protein_refseq2.gb)
855 location_id = self.adaptor.last_id('location')
856 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
857 sql = r"INSERT INTO location_qualifier_value" \
858 r"(location_id, term_id, value)" \
859 r"VALUES (%s, %s, %s)"
860 self.adaptor.execute(sql, (location_id, loc_qual_term_id,
861 feature.location_operator))
862 """
863
865 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).
866
867 Qualifiers should be a dictionary of the form:
868 {key : [value1, value2]}
869 """
870 tag_ontology_id = self._get_ontology_id('Annotation Tags')
871 for qualifier_key in qualifiers.keys():
872
873
874
875
876 if qualifier_key != 'db_xref':
877 qualifier_key_id = self._get_term_id(qualifier_key,
878 ontology_id=tag_ontology_id)
879
880 entries = qualifiers[qualifier_key]
881 if not isinstance(entries, list):
882
883
884 entries = [entries]
885 for qual_value_rank in range(len(entries)):
886 qualifier_value = entries[qual_value_rank]
887 sql = r"INSERT INTO seqfeature_qualifier_value "\
888 r" (seqfeature_id, term_id, rank, value) VALUES"\
889 r" (%s, %s, %s, %s)"
890 self.adaptor.execute(sql, (seqfeature_id,
891 qualifier_key_id,
892 qual_value_rank + 1,
893 qualifier_value))
894 else:
895
896
897
898
899 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
900 seqfeature_id)
901
902
904 """Add database crossreferences of a SeqFeature to the database (PRIVATE).
905
906 o dbxrefs List, dbxref data from the source file in the
907 format <database>:<accession>
908
909 o seqfeature_id Int, the identifier for the seqfeature in the
910 seqfeature table
911
912 Insert dbxref qualifier data for a seqfeature into the
913 seqfeature_dbxref and, if required, dbxref tables.
914 The dbxref_id qualifier/value sets go into the dbxref table
915 as dbname, accession, version tuples, with dbxref.dbxref_id
916 being automatically assigned, and into the seqfeature_dbxref
917 table as seqfeature_id, dbxref_id, and rank tuples
918 """
919
920
921
922
923 for rank, value in enumerate(dbxrefs):
924
925
926 try:
927 dbxref_data = value.replace(' ','').replace('\n','').split(':')
928 db = dbxref_data[0]
929 accessions = dbxref_data[1:]
930 except:
931 raise ValueError("Parsing of db_xref failed: '%s'" % value)
932
933
934 for accession in accessions:
935
936 dbxref_id = self._get_dbxref_id(db, accession)
937
938 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
939
941 """ _get_dbxref_id(self, db, accession) -> Int
942
943 o db String, the name of the external database containing
944 the accession number
945
946 o accession String, the accession of the dbxref data
947
948 Finds and returns the dbxref_id for the passed data. The method
949 attempts to find an existing record first, and inserts the data
950 if there is no record.
951 """
952
953 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
954 r'AND accession = %s'
955 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
956
957
958 if dbxref_id:
959 return dbxref_id[0]
960 return self._add_dbxref(db, accession, 0)
961
963 """ Check for a pre-existing seqfeature_dbxref entry with the passed
964 seqfeature_id and dbxref_id. If one does not exist, insert new
965 data
966
967 """
968
969 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
970 r"WHERE seqfeature_id = %s AND dbxref_id = %s"
971 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
972 dbxref_id))
973
974
975 if result:
976 return result
977 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
978
980 """ Insert a seqfeature_dbxref row and return the seqfeature_id and
981 dbxref_id
982 """
983 sql = r'INSERT INTO seqfeature_dbxref ' \
984 '(seqfeature_id, dbxref_id, rank) VALUES' \
985 r'(%s, %s, %s)'
986 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
987 return (seqfeature_id, dbxref_id)
988
1012
1013 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1014 """ Check for a pre-existing bioentry_dbxref entry with the passed
1015 seqfeature_id and dbxref_id. If one does not exist, insert new
1016 data
1017
1018 """
1019
1020 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
1021 r"WHERE bioentry_id = %s AND dbxref_id = %s"
1022 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
1023 dbxref_id))
1024
1025
1026 if result:
1027 return result
1028 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1029
1030 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1031 """ Insert a bioentry_dbxref row and return the seqfeature_id and
1032 dbxref_id
1033 """
1034 sql = r'INSERT INTO bioentry_dbxref ' \
1035 '(bioentry_id,dbxref_id,rank) VALUES ' \
1036 '(%s, %s, %s)'
1037 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
1038 return (bioentry_id, dbxref_id)
1039
1041 """Complement the Loader functionality by fully removing a database.
1042
1043 This probably isn't really useful for normal purposes, since you
1044 can just do a:
1045 DROP DATABASE db_name
1046 and then recreate the database. But, it's really useful for testing
1047 purposes.
1048
1049 YB: now use the cascaded deletions
1050 """
1052 """Initialize with a database id and adaptor connection.
1053 """
1054 self.adaptor = adaptor
1055 self.dbid = dbid
1056
1058 """Remove everything related to the given database id.
1059 """
1060 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
1061 self.adaptor.execute(sql, (self.dbid,))
1062 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
1063 self.adaptor.execute(sql, (self.dbid,))
1064