1
2
3
4
5
6
7 """Bio.SeqIO support for the "genbank" and "embl" file formats.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 Note that internally this module calls Bio.GenBank to do the actual
11 parsing of both GenBank and EMBL files.
12
13 See also:
14
15 International Nucleotide Sequence Database Collaboration
16 http://www.insdc.org/
17
18 GenBank
19 http://www.ncbi.nlm.nih.gov/Genbank/
20
21 EMBL Nucleotide Sequence Database
22 http://www.ebi.ac.uk/embl/
23
24 DDBJ (DNA Data Bank of Japan)
25 http://www.ddbj.nig.ac.jp/
26 """
27
28 from Bio.Seq import UnknownSeq
29 from Bio.GenBank.Scanner import GenBankScanner, EmblScanner
30 from Bio import Alphabet
31 from Interfaces import SequentialSequenceWriter
32 from Bio import SeqFeature
33
34
35
36
37
38
39
40
41
43 """Breaks up a Genbank file into SeqRecord objects.
44
45 Every section from the LOCUS line to the terminating // becomes
46 a single SeqRecord with associated annotation and features.
47
48 Note that for genomes or chromosomes, there is typically only
49 one record."""
50
51 return GenBankScanner(debug=0).parse_records(handle)
52
54 """Breaks up an EMBL file into SeqRecord objects.
55
56 Every section from the LOCUS line to the terminating // becomes
57 a single SeqRecord with associated annotation and features.
58
59 Note that for genomes or chromosomes, there is typically only
60 one record."""
61
62 return EmblScanner(debug=0).parse_records(handle)
63
65 """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
66
67 Every section from the LOCUS line to the terminating // can contain
68 many CDS features. These are returned as with the stated amino acid
69 translation sequence (if given).
70 """
71
72 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
73
75 """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
76
77 Every section from the LOCUS line to the terminating // can contain
78 many CDS features. These are returned as with the stated amino acid
79 translation sequence (if given).
80 """
81
82 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
83
85 """Build a GenBank/EMBL position string (PRIVATE).
86
87 Use offset=1 to add one to convert a start position from python counting.
88 """
89 if isinstance(pos, SeqFeature.ExactPosition):
90 return "%i" % (pos.position+offset)
91 elif isinstance(pos, SeqFeature.WithinPosition):
92 return "(%i.%i)" % (pos.position + offset,
93 pos.position + pos.extension + offset)
94 elif isinstance(pos, SeqFeature.BetweenPosition):
95 return "(%i^%i)" % (pos.position + offset,
96 pos.position + pos.extension + offset)
97 elif isinstance(pos, SeqFeature.BeforePosition):
98 return "<%i" % (pos.position + offset)
99 elif isinstance(pos, SeqFeature.AfterPosition):
100 return ">%i" % (pos.position + offset)
101 elif isinstance(pos, SeqFeature.OneOfPosition):
102 return "one-of(%s)" \
103 % ",".join([_insdc_feature_position_string(p,offset) \
104 for p in pos.position_choices])
105 elif isinstance(pos, SeqFeature.AbstractPosition):
106 raise NotImplementedError("Please report this as a bug in Biopython.")
107 else:
108 raise ValueError("Expected a SeqFeature position object.")
109
110
129
174
175
177 HEADER_WIDTH = 12
178 MAX_WIDTH = 80
179 QUALIFIER_INDENT = 21
180
188
216
225
227 default = "01-JAN-1980"
228 try :
229 date = record.annotations["date"]
230 except KeyError :
231 return default
232
233 if isinstance(date, list) and len(date)==1 :
234 date = date[0]
235
236 if not isinstance(date, str) or len(date) != 11 \
237 or date[2] != "-" or date[6] != "-" \
238 or not date[:2].isdigit() or not date[7:].isdigit() \
239 or int(date[:2]) > 31 \
240 or date[3:6] not in ["JAN","FEB","MAR","APR","MAY","JUN",
241 "JUL","AUG","SEP","OCT","NOV","DEC"] :
242
243 return default
244 return date
245
247 """Write the LOCUS line."""
248
249 locus = record.name
250 if not locus or locus == "<unknown name>":
251 locus = record.id
252 if not locus or locus == "<unknown id>":
253 locus = self._get_annotation_str(record, "accession", just_first=True)
254 if len(locus) > 16:
255 raise ValueError("Locus identifier %s is too long" % repr(locus))
256
257 if len(record) > 99999999999:
258
259
260 raise ValueError("Sequence too long!")
261
262
263 a = Alphabet._get_base_alphabet(record.seq.alphabet)
264 if not isinstance(a, Alphabet.Alphabet):
265 raise TypeError("Invalid alphabet")
266 elif isinstance(a, Alphabet.ProteinAlphabet):
267 units = "aa"
268 elif isinstance(a, Alphabet.NucleotideAlphabet):
269 units = "bp"
270 else:
271
272
273 raise ValueError("Need a Nucleotide or Protein alphabet")
274
275
276
277 if isinstance(a, Alphabet.ProteinAlphabet):
278 mol_type = ""
279 elif isinstance(a, Alphabet.DNAAlphabet):
280 mol_type = "DNA"
281 elif isinstance(a, Alphabet.RNAAlphabet):
282 mol_type = "RNA"
283 else:
284
285
286 raise ValueError("Need a DNA, RNA or Protein alphabet")
287
288 try:
289 division = record.annotations["data_file_division"]
290 except KeyError:
291 division = "UNK"
292 if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT",
293 "VRL","PHG","SYN","UNA","EST","PAT","STS",
294 "GSS","HTG","HTC","ENV","CON"]:
295 division = "UNK"
296
297 assert len(units) == 2
298 assert len(division) == 3
299
300
301 line = "LOCUS %s %s %s %s %s %s\n" \
302 % (locus.ljust(16),
303 str(len(record)).rjust(11),
304 units,
305 mol_type.ljust(6),
306 division,
307 self._get_date(record))
308 assert len(line) == 79+1, repr(line)
309
310 assert line[12:28].rstrip() == locus, \
311 'LOCUS line does not contain the locus at the expected position:\n' + line
312 assert line[28:29] == " "
313 assert line[29:40].lstrip() == str(len(record)), \
314 'LOCUS line does not contain the length at the expected position:\n' + line
315
316
317 assert line[40:44] in [' bp ', ' aa '] , \
318 'LOCUS line does not contain size units at expected position:\n' + line
319 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
320 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
321 assert line[47:54].strip() == "" \
322 or line[47:54].strip().find('DNA') != -1 \
323 or line[47:54].strip().find('RNA') != -1, \
324 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
325 assert line[54:55] == ' ', \
326 'LOCUS line does not contain space at position 55:\n' + line
327 assert line[55:63].strip() in ['','linear','circular'], \
328 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
329 assert line[63:64] == ' ', \
330 'LOCUS line does not contain space at position 64:\n' + line
331 assert line[67:68] == ' ', \
332 'LOCUS line does not contain space at position 68:\n' + line
333 assert line[70:71] == '-', \
334 'LOCUS line does not contain - at position 71 in date:\n' + line
335 assert line[74:75] == '-', \
336 'LOCUS line does not contain - at position 75 in date:\n' + line
337
338 self.handle.write(line)
339
341 """Get an annotation dictionary entry (as a string).
342
343 Some entries are lists, in which case if just_first=True the first entry
344 is returned. If just_first=False (default) this verifies there is only
345 one entry before returning it."""
346 try:
347 answer = record.annotations[key]
348 except KeyError:
349 return default
350 if isinstance(answer, list):
351 if not just_first : assert len(answer) == 1
352 return str(answer[0])
353 else:
354 return str(answer)
355
373
375
376
377
378 max_len = self.MAX_WIDTH - self.HEADER_WIDTH
379 contig = record.annotations.get("contig","")
380 if isinstance(contig, list) or isinstance(contig, tuple):
381 contig = "".join(contig)
382 contig = self.clean(contig)
383 i=0
384 while contig:
385 if len(contig) > max_len:
386
387 pos = contig[:max_len-1].rfind(",")
388 if pos==-1:
389 raise ValueError("Could not break up CONTIG")
390 text, contig = contig[:pos+1], contig[pos+1:]
391 else:
392 text, contig = contig, ""
393 if i==0:
394 self._write_single_line("CONTIG",text)
395 else:
396 self._write_single_line("",text)
397 i+=1
398
399
423
425 """Write a single record to the output file."""
426 handle = self.handle
427 self._write_the_first_line(record)
428
429 accession = self._get_annotation_str(record, "accession",
430 record.id.split(".",1)[0],
431 just_first=True)
432 acc_with_version = accession
433 if record.id.startswith(accession+"."):
434 try:
435 acc_with_version = "%s.%i" \
436 % (accession, int(record.id.split(".",1)[1]))
437 except ValueError:
438 pass
439 gi = self._get_annotation_str(record, "gi", just_first=True)
440
441 descr = record.description
442 if descr == "<unknown description>" : descr = "."
443 self._write_multi_line("DEFINITION", descr)
444
445 self._write_single_line("ACCESSION", accession)
446 if gi != ".":
447 self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi))
448 else:
449 self._write_single_line("VERSION", "%s" % (acc_with_version))
450
451
452
453
454 self._write_multi_entries("DBLINK", record.dbxrefs)
455
456 try:
457
458 keywords = "; ".join(record.annotations["keywords"])
459 except KeyError:
460 keywords = "."
461 self._write_multi_line("KEYWORDS", keywords)
462
463 if "segment" in record.annotations:
464
465
466 segment = record.annotations["segment"]
467 if isinstance(segment, list):
468 assert len(segment)==1, segment
469 segment = segment[0]
470 self._write_single_line("SEGMENT", segment)
471
472 self._write_multi_line("SOURCE", \
473 self._get_annotation_str(record, "source"))
474
475 org = self._get_annotation_str(record, "organism")
476 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH:
477 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
478 self._write_single_line(" ORGANISM", org)
479 try:
480
481 taxonomy = "; ".join(record.annotations["taxonomy"])
482 except KeyError:
483 taxonomy = "."
484 self._write_multi_line("", taxonomy)
485
486
487 if "comment" in record.annotations:
488 self._write_comment(record)
489 handle.write("FEATURES Location/Qualifiers\n")
490 for feature in record.features:
491 self._write_feature(feature)
492 self._write_sequence(record)
493 handle.write("//\n")
494
496 if not value:
497 self.handle.write("%s/%s\n" % (" "*self.QUALIFIER_INDENT, key))
498 return
499
500
501 if quote is None:
502
503 if isinstance(value, int) or isinstance(value, long):
504 quote = False
505 else:
506 quote = True
507 if quote:
508 line = '%s/%s="%s"' % (" "*self.QUALIFIER_INDENT, key, value)
509 else:
510 line = '%s/%s=%s' % (" "*self.QUALIFIER_INDENT, key, value)
511 if len(line) < self.MAX_WIDTH:
512 self.handle.write(line+"\n")
513 return
514 while line.lstrip():
515 if len(line) < self.MAX_WIDTH:
516 self.handle.write(line+"\n")
517 return
518
519 for index in range(min(len(line)-1,self.MAX_WIDTH),self.QUALIFIER_INDENT+1,-1):
520 if line[index]==" " : break
521 if line[index] != " ":
522
523 index = self.MAX_WIDTH
524 self.handle.write(line[:index] + "\n")
525 line = " "*self.QUALIFIER_INDENT + line[index:].lstrip()
526
541
561
562
563 if __name__ == "__main__":
564 print "Quick self test"
565 import os
566 from StringIO import StringIO
567
591
593 """Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
594 if len(old_list) != len(new_list):
595 raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
596 for old, new in zip(old_list, new_list):
597 if not compare_record(old,new):
598 return False
599 return True
600
602 """Check two SeqFeatures agree."""
603 if old.type != new.type:
604 raise ValueError("Type %s versus %s" % (old.type, new.type))
605 if old.location.nofuzzy_start != new.location.nofuzzy_start \
606 or old.location.nofuzzy_end != new.location.nofuzzy_end:
607 raise ValueError("%s versus %s:\n%s\nvs:\n%s" \
608 % (old.location, new.location, str(old), str(new)))
609 if old.strand != new.strand:
610 raise ValueError("Different strand:\n%s\nvs:\n%s" % (str(old), str(new)))
611 if old.location.start != new.location.start:
612 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" \
613 % (old.location.start, new.location.start, str(old), str(new)))
614 if old.location.end != new.location.end:
615 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" \
616 % (old.location.end, new.location.end, str(old), str(new)))
617 if not ignore_sub_features:
618 if len(old.sub_features) != len(new.sub_features):
619 raise ValueError("Different sub features")
620 for a,b in zip(old.sub_features, new.sub_features):
621 if not compare_feature(a,b):
622 return False
623
624
625
626 for key in set(old.qualifiers.keys()).intersection(new.qualifiers.keys()):
627 if key in ["db_xref","protein_id","product","note"]:
628
629 continue
630 if old.qualifiers[key] != new.qualifiers[key]:
631 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" \
632 % (key, old.qualifiers[key], new.qualifiers[key]))
633 return True
634
636 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch."""
637 if len(old_list) != len(new_list):
638 raise ValueError("%i vs %i features" % (len(old_list), len(new_list)))
639 for old, new in zip(old_list, new_list):
640
641 if not compare_feature(old,new,ignore_sub_features):
642 return False
643 return True
644
652
653 for filename in os.listdir("../../Tests/GenBank"):
654 if not filename.endswith(".gbk") and not filename.endswith(".gb"):
655 continue
656 print filename
657
658 handle = open("../../Tests/GenBank/%s" % filename)
659 records = list(GenBankIterator(handle))
660 handle.close()
661
662 check_genbank_writer(records)
663
664 for filename in os.listdir("../../Tests/EMBL"):
665 if not filename.endswith(".embl"):
666 continue
667 print filename
668
669 handle = open("../../Tests/EMBL/%s" % filename)
670 records = list(EmblIterator(handle))
671 handle.close()
672
673 check_genbank_writer(records)
674
675 from Bio import SeqIO
676 for filename in os.listdir("../../Tests/SwissProt"):
677 if not filename.startswith("sp"):
678 continue
679 print filename
680
681 handle = open("../../Tests/SwissProt/%s" % filename)
682 records = list(SeqIO.parse(handle,"swiss"))
683 handle.close()
684
685 check_genbank_writer(records)
686