Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

  1  # Copyright 2007-2009 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package.. 
  6   
  7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
  8   
  9  You are expected to use this module via the Bio.SeqIO functions. 
 10  Note that internally this module calls Bio.GenBank to do the actual 
 11  parsing of both GenBank and EMBL files. 
 12   
 13  See also: 
 14   
 15  International Nucleotide Sequence Database Collaboration 
 16  http://www.insdc.org/ 
 17    
 18  GenBank 
 19  http://www.ncbi.nlm.nih.gov/Genbank/ 
 20   
 21  EMBL Nucleotide Sequence Database 
 22  http://www.ebi.ac.uk/embl/ 
 23   
 24  DDBJ (DNA Data Bank of Japan) 
 25  http://www.ddbj.nig.ac.jp/ 
 26  """ 
 27   
 28  from Bio.Seq import UnknownSeq 
 29  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner 
 30  from Bio import Alphabet 
 31  from Interfaces import SequentialSequenceWriter 
 32  from Bio import SeqFeature 
 33   
 34  # NOTE 
 35  # ==== 
 36  # The "brains" for parsing GenBank and EMBL files (and any 
 37  # other flat file variants from the INSDC in future) is in 
 38  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
 39  # However, all the writing code is in this file. 
 40   
 41   
42 -def GenBankIterator(handle):
43 """Breaks up a Genbank file into SeqRecord objects. 44 45 Every section from the LOCUS line to the terminating // becomes 46 a single SeqRecord with associated annotation and features. 47 48 Note that for genomes or chromosomes, there is typically only 49 one record.""" 50 #This calls a generator function: 51 return GenBankScanner(debug=0).parse_records(handle)
52
53 -def EmblIterator(handle):
54 """Breaks up an EMBL file into SeqRecord objects. 55 56 Every section from the LOCUS line to the terminating // becomes 57 a single SeqRecord with associated annotation and features. 58 59 Note that for genomes or chromosomes, there is typically only 60 one record.""" 61 #This calls a generator function: 62 return EmblScanner(debug=0).parse_records(handle)
63
64 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
65 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 66 67 Every section from the LOCUS line to the terminating // can contain 68 many CDS features. These are returned as with the stated amino acid 69 translation sequence (if given). 70 """ 71 #This calls a generator function: 72 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
73
74 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
75 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 76 77 Every section from the LOCUS line to the terminating // can contain 78 many CDS features. These are returned as with the stated amino acid 79 translation sequence (if given). 80 """ 81 #This calls a generator function: 82 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
83
84 -def _insdc_feature_position_string(pos, offset=0):
85 """Build a GenBank/EMBL position string (PRIVATE). 86 87 Use offset=1 to add one to convert a start position from python counting. 88 """ 89 if isinstance(pos, SeqFeature.ExactPosition): 90 return "%i" % (pos.position+offset) 91 elif isinstance(pos, SeqFeature.WithinPosition): 92 return "(%i.%i)" % (pos.position + offset, 93 pos.position + pos.extension + offset) 94 elif isinstance(pos, SeqFeature.BetweenPosition): 95 return "(%i^%i)" % (pos.position + offset, 96 pos.position + pos.extension + offset) 97 elif isinstance(pos, SeqFeature.BeforePosition): 98 return "<%i" % (pos.position + offset) 99 elif isinstance(pos, SeqFeature.AfterPosition): 100 return ">%i" % (pos.position + offset) 101 elif isinstance(pos, SeqFeature.OneOfPosition): 102 return "one-of(%s)" \ 103 % ",".join([_insdc_feature_position_string(p,offset) \ 104 for p in pos.position_choices]) 105 elif isinstance(pos, SeqFeature.AbstractPosition): 106 raise NotImplementedError("Please report this as a bug in Biopython.") 107 else: 108 raise ValueError("Expected a SeqFeature position object.")
109 110
111 -def _insdc_location_string_ignoring_strand_and_subfeatures(feature):
112 if feature.ref: 113 ref = "%s:" % feature.ref 114 else: 115 ref = "" 116 assert not feature.ref_db 117 if feature.location.start==feature.location.end \ 118 and isinstance(feature.location.end, SeqFeature.ExactPosition): 119 #Special case, 12^13 gets mapped to location 12:12 120 #(a zero length slice, meaning the point between two letters) 121 return "%s%i^%i" % (ref, feature.location.end.position, 122 feature.location.end.position+1) 123 else: 124 #Typical case, e.g. 12..15 gets mapped to 11:15 125 return ref \ 126 + _insdc_feature_position_string(feature.location.start, +1) \ 127 + ".." + \ 128 _insdc_feature_position_string(feature.location.end)
129
130 -def _insdc_feature_location_string(feature):
131 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).""" 132 # Have a choice of how to show joins on the reverse complement strand, 133 # complement(join(1,10),(20,100)) vs join(complement(20,100),complement(1,10)) 134 # Notice that the order of the entries gets flipped! 135 # 136 # GenBank and EMBL would both use now complement(join(1,10),(20,100)) 137 # which is shorter at least. 138 # 139 # In the above situations, we expect the parent feature and the two children 140 # to all be marked as strand==-1, and in the order 0:10 then 19:100. 141 # 142 # Also need to consider dual-strand examples like these from the Arabidopsis 143 # thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 144 # gene ArthCp047, GeneID:844801 or its CDS which is even better due to a splice: 145 # join(complement(69611..69724),139856..140087,140625..140650) 146 # protein NP_051038.1 GI:7525057 147 # 148 149 if not feature.sub_features: 150 #Non-recursive. 151 #assert feature.location_operator == "", \ 152 # "%s has no subfeatures but location_operator %s" \ 153 # % (repr(feature), feature.location_operator) 154 location = _insdc_location_string_ignoring_strand_and_subfeatures(feature) 155 if feature.strand == -1: 156 location = "complement(%s)" % location 157 return location 158 # As noted above, treat reverse complement strand features carefully: 159 if feature.strand == -1: 160 for f in feature.sub_features: 161 assert f.strand == -1 162 return "complement(%s(%s))" \ 163 % (feature.location_operator, 164 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f) \ 165 for f in feature.sub_features)) 166 #if feature.strand == +1: 167 # for f in feature.sub_features: 168 # assert f.strand == +1 169 #This covers typical forward strand features, and also an evil mixed strand: 170 assert feature.location_operator != "" 171 return "%s(%s)" % (feature.location_operator, 172 ",".join([_insdc_feature_location_string(f) \ 173 for f in feature.sub_features]))
174 175
176 -class GenBankWriter(SequentialSequenceWriter):
177 HEADER_WIDTH = 12 178 MAX_WIDTH = 80 179 QUALIFIER_INDENT = 21 180
181 - def _write_single_line(self, tag, text):
182 "Used in the the 'header' of each GenBank record.""" 183 assert len(tag) < self.HEADER_WIDTH 184 assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \ 185 "Annotation %s too long for %s line" % (repr(text), tag) 186 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 187 text.replace("\n"," ")))
188
189 - def _write_multi_line(self, tag, text):
190 "Used in the the 'header' of each GenBank record.""" 191 #TODO - Do the line spliting while preserving white space? 192 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 193 assert len(tag) < self.HEADER_WIDTH 194 text = text.strip() 195 if len(text) < max_len: 196 self._write_single_line(tag, text) 197 return 198 199 words = text.split() 200 assert max([len(w) for w in words]) < max_len, \ 201 "Your description cannot be broken into nice lines!" 202 text = "" 203 while words and len(text) + 1 + len(words[0]) < max_len: 204 text += " " + words.pop(0) 205 text = text.strip() 206 assert len(text) < max_len 207 self._write_single_line(tag, text) 208 while words: 209 text = "" 210 while words and len(text) + 1 + len(words[0]) < max_len: 211 text += " " + words.pop(0) 212 text = text.strip() 213 assert len(text) < max_len 214 self._write_single_line("", text) 215 assert not words
216
217 - def _write_multi_entries(self, tag, text_list):
218 #used for DBLINK and any similar later line types. 219 #If the list of strings is empty, nothing is written. 220 for i, text in enumerate(text_list): 221 if i==0: 222 self._write_single_line(tag, text) 223 else: 224 self._write_single_line("", text)
225
226 - def _get_date(self, record) :
227 default = "01-JAN-1980" 228 try : 229 date = record.annotations["date"] 230 except KeyError : 231 return default 232 #Cope with a list of one string: 233 if isinstance(date, list) and len(date)==1 : 234 date = date[0] 235 #TODO - allow a Python date object 236 if not isinstance(date, str) or len(date) != 11 \ 237 or date[2] != "-" or date[6] != "-" \ 238 or not date[:2].isdigit() or not date[7:].isdigit() \ 239 or int(date[:2]) > 31 \ 240 or date[3:6] not in ["JAN","FEB","MAR","APR","MAY","JUN", 241 "JUL","AUG","SEP","OCT","NOV","DEC"] : 242 #TODO - Check is a valid date (e.g. not 31 Feb) 243 return default 244 return date
245
246 - def _write_the_first_line(self, record):
247 """Write the LOCUS line.""" 248 249 locus = record.name 250 if not locus or locus == "<unknown name>": 251 locus = record.id 252 if not locus or locus == "<unknown id>": 253 locus = self._get_annotation_str(record, "accession", just_first=True) 254 if len(locus) > 16: 255 raise ValueError("Locus identifier %s is too long" % repr(locus)) 256 257 if len(record) > 99999999999: 258 #Currently GenBank only officially support up to 350000, but 259 #the length field can take eleven digits 260 raise ValueError("Sequence too long!") 261 262 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 263 a = Alphabet._get_base_alphabet(record.seq.alphabet) 264 if not isinstance(a, Alphabet.Alphabet): 265 raise TypeError("Invalid alphabet") 266 elif isinstance(a, Alphabet.ProteinAlphabet): 267 units = "aa" 268 elif isinstance(a, Alphabet.NucleotideAlphabet): 269 units = "bp" 270 else: 271 #Must be something like NucleotideAlphabet or 272 #just the generic Alphabet (default for fasta files) 273 raise ValueError("Need a Nucleotide or Protein alphabet") 274 275 #Get the molecule type 276 #TODO - record this explicitly in the parser? 277 if isinstance(a, Alphabet.ProteinAlphabet): 278 mol_type = "" 279 elif isinstance(a, Alphabet.DNAAlphabet): 280 mol_type = "DNA" 281 elif isinstance(a, Alphabet.RNAAlphabet): 282 mol_type = "RNA" 283 else: 284 #Must be something like NucleotideAlphabet or 285 #just the generic Alphabet (default for fasta files) 286 raise ValueError("Need a DNA, RNA or Protein alphabet") 287 288 try: 289 division = record.annotations["data_file_division"] 290 except KeyError: 291 division = "UNK" 292 if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT", 293 "VRL","PHG","SYN","UNA","EST","PAT","STS", 294 "GSS","HTG","HTC","ENV","CON"]: 295 division = "UNK" 296 297 assert len(units) == 2 298 assert len(division) == 3 299 #TODO - date 300 #TODO - mol_type 301 line = "LOCUS %s %s %s %s %s %s\n" \ 302 % (locus.ljust(16), 303 str(len(record)).rjust(11), 304 units, 305 mol_type.ljust(6), 306 division, 307 self._get_date(record)) 308 assert len(line) == 79+1, repr(line) #plus one for new line 309 310 assert line[12:28].rstrip() == locus, \ 311 'LOCUS line does not contain the locus at the expected position:\n' + line 312 assert line[28:29] == " " 313 assert line[29:40].lstrip() == str(len(record)), \ 314 'LOCUS line does not contain the length at the expected position:\n' + line 315 316 #Tests copied from Bio.GenBank.Scanner 317 assert line[40:44] in [' bp ', ' aa '] , \ 318 'LOCUS line does not contain size units at expected position:\n' + line 319 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 320 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 321 assert line[47:54].strip() == "" \ 322 or line[47:54].strip().find('DNA') != -1 \ 323 or line[47:54].strip().find('RNA') != -1, \ 324 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 325 assert line[54:55] == ' ', \ 326 'LOCUS line does not contain space at position 55:\n' + line 327 assert line[55:63].strip() in ['','linear','circular'], \ 328 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 329 assert line[63:64] == ' ', \ 330 'LOCUS line does not contain space at position 64:\n' + line 331 assert line[67:68] == ' ', \ 332 'LOCUS line does not contain space at position 68:\n' + line 333 assert line[70:71] == '-', \ 334 'LOCUS line does not contain - at position 71 in date:\n' + line 335 assert line[74:75] == '-', \ 336 'LOCUS line does not contain - at position 75 in date:\n' + line 337 338 self.handle.write(line)
339
340 - def _get_annotation_str(self, record, key, default=".", just_first=False):
341 """Get an annotation dictionary entry (as a string). 342 343 Some entries are lists, in which case if just_first=True the first entry 344 is returned. If just_first=False (default) this verifies there is only 345 one entry before returning it.""" 346 try: 347 answer = record.annotations[key] 348 except KeyError: 349 return default 350 if isinstance(answer, list): 351 if not just_first : assert len(answer) == 1 352 return str(answer[0]) 353 else: 354 return str(answer)
355
356 - def _write_comment(self, record):
357 #This is a bit complicated due to the range of possible 358 #ways people might have done their annotation... 359 #Currently the parser uses a single string with newlines. 360 #A list of lines is also reasonable. 361 #A single (long) string is perhaps the most natural of all. 362 #This means we may need to deal with line wrapping. 363 comment = record.annotations["comment"] 364 if isinstance(comment, basestring): 365 lines = comment.split("\n") 366 elif isinstance(comment, list) or isinstance(comment, tuple): 367 lines = comment 368 else: 369 raise ValueError("Could not understand comment annotation") 370 self._write_multi_line("COMMENT",lines[0]) 371 for line in lines[1:]: 372 self._write_multi_line("",line)
373
374 - def _write_contig(self, record):
375 #TODO - Merge this with _write_multi_line method? 376 #It would need the addition of the comma splitting logic... 377 #are there any other cases where that would be sensible? 378 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 379 contig = record.annotations.get("contig","") 380 if isinstance(contig, list) or isinstance(contig, tuple): 381 contig = "".join(contig) 382 contig = self.clean(contig) 383 i=0 384 while contig: 385 if len(contig) > max_len: 386 #Split lines at the commas 387 pos = contig[:max_len-1].rfind(",") 388 if pos==-1: 389 raise ValueError("Could not break up CONTIG") 390 text, contig = contig[:pos+1], contig[pos+1:] 391 else: 392 text, contig = contig, "" 393 if i==0: 394 self._write_single_line("CONTIG",text) 395 else: 396 self._write_single_line("",text) 397 i+=1
398 399
400 - def _write_sequence(self, record):
401 #Loosely based on code from Howard Salis 402 #TODO - Force lower case? 403 LETTERS_PER_LINE = 60 404 SEQUENCE_INDENT = 9 405 406 if isinstance(record.seq, UnknownSeq): 407 #We have already recorded the length, and there is no need 408 #to record a long sequence of NNNNNNN...NNN or whatever. 409 if "contig" in record.annotations: 410 self._write_contig(record) 411 else: 412 self.handle.write("ORIGIN\n") 413 return 414 415 data = self._get_seq_string(record) #Catches sequence being None 416 seq_len = len(data) 417 self.handle.write("ORIGIN\n") 418 for line_number in range(0,seq_len,LETTERS_PER_LINE): 419 self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT)) 420 for words in range(line_number,min(line_number+LETTERS_PER_LINE,seq_len),10): 421 self.handle.write(" %s" % data[words:words+10]) 422 self.handle.write("\n")
423
424 - def write_record(self, record):
425 """Write a single record to the output file.""" 426 handle = self.handle 427 self._write_the_first_line(record) 428 429 accession = self._get_annotation_str(record, "accession", 430 record.id.split(".",1)[0], 431 just_first=True) 432 acc_with_version = accession 433 if record.id.startswith(accession+"."): 434 try: 435 acc_with_version = "%s.%i" \ 436 % (accession, int(record.id.split(".",1)[1])) 437 except ValueError: 438 pass 439 gi = self._get_annotation_str(record, "gi", just_first=True) 440 441 descr = record.description 442 if descr == "<unknown description>" : descr = "." 443 self._write_multi_line("DEFINITION", descr) 444 445 self._write_single_line("ACCESSION", accession) 446 if gi != ".": 447 self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi)) 448 else: 449 self._write_single_line("VERSION", "%s" % (acc_with_version)) 450 451 #The NCBI only expect two types of link so far, 452 #e.g. "Project:28471" and "Trace Assembly Archive:123456" 453 #TODO - Filter the dbxrefs list to just these? 454 self._write_multi_entries("DBLINK", record.dbxrefs) 455 456 try: 457 #List of strings 458 keywords = "; ".join(record.annotations["keywords"]) 459 except KeyError: 460 keywords = "." 461 self._write_multi_line("KEYWORDS", keywords) 462 463 if "segment" in record.annotations: 464 #Deal with SEGMENT line found only in segmented records, 465 #e.g. AH000819 466 segment = record.annotations["segment"] 467 if isinstance(segment, list): 468 assert len(segment)==1, segment 469 segment = segment[0] 470 self._write_single_line("SEGMENT", segment) 471 472 self._write_multi_line("SOURCE", \ 473 self._get_annotation_str(record, "source")) 474 #The ORGANISM line MUST be a single line, as any continuation is the taxonomy 475 org = self._get_annotation_str(record, "organism") 476 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 477 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..." 478 self._write_single_line(" ORGANISM", org) 479 try: 480 #List of strings 481 taxonomy = "; ".join(record.annotations["taxonomy"]) 482 except KeyError: 483 taxonomy = "." 484 self._write_multi_line("", taxonomy) 485 486 #TODO - References... 487 if "comment" in record.annotations: 488 self._write_comment(record) 489 handle.write("FEATURES Location/Qualifiers\n") 490 for feature in record.features: 491 self._write_feature(feature) 492 self._write_sequence(record) 493 handle.write("//\n")
494
495 - def _write_feature_qualifier(self, key, value=None, quote=None):
496 if not value: 497 self.handle.write("%s/%s\n" % (" "*self.QUALIFIER_INDENT, key)) 498 return 499 #Quick hack with no line wrapping, may be useful for testing: 500 #self.handle.write('%s/%s="%s"\n' % (" "*self.QUALIFIER_INDENT, key, value)) 501 if quote is None: 502 #Try to mimic unwritten rules about when quotes can be left out: 503 if isinstance(value, int) or isinstance(value, long): 504 quote = False 505 else: 506 quote = True 507 if quote: 508 line = '%s/%s="%s"' % (" "*self.QUALIFIER_INDENT, key, value) 509 else: 510 line = '%s/%s=%s' % (" "*self.QUALIFIER_INDENT, key, value) 511 if len(line) < self.MAX_WIDTH: 512 self.handle.write(line+"\n") 513 return 514 while line.lstrip(): 515 if len(line) < self.MAX_WIDTH: 516 self.handle.write(line+"\n") 517 return 518 #Insert line break... 519 for index in range(min(len(line)-1,self.MAX_WIDTH),self.QUALIFIER_INDENT+1,-1): 520 if line[index]==" " : break 521 if line[index] != " ": 522 #No nice place to break... 523 index = self.MAX_WIDTH 524 self.handle.write(line[:index] + "\n") 525 line = " "*self.QUALIFIER_INDENT + line[index:].lstrip()
526
527 - def _wrap_location(self, location):
528 """Split a feature location into lines (break at commas).""" 529 #TODO - Rewrite this not to recurse! 530 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 531 if len(location) <= length: 532 return location 533 index = location[:length].rfind(",") 534 if index == -1: 535 #No good place to split (!) 536 import warnings 537 warnings.warn("Couldn't split location:\n%s" % location) 538 return location 539 return location[:index+1] + "\n" + \ 540 " "*self.QUALIFIER_INDENT + self._wrap_location(location[index+1:])
541
542 - def _write_feature(self, feature):
543 """Write a single SeqFeature object to features table.""" 544 assert feature.type, feature 545 #TODO - Line wrapping for long locations! 546 location = _insdc_feature_location_string(feature) 547 line = (" %s " % feature.type)[:self.QUALIFIER_INDENT] \ 548 + self._wrap_location(location) + "\n" 549 self.handle.write(line) 550 #Now the qualifiers... 551 for key, values in feature.qualifiers.iteritems(): 552 if isinstance(values, list) or isinstance(values, tuple): 553 for value in values: 554 self._write_feature_qualifier(key, value) 555 elif values: 556 #String, int, etc 557 self._write_feature_qualifier(key, values) 558 else: 559 #e.g. a /psuedo entry 560 self._write_feature_qualifier(key)
561 562 563 if __name__ == "__main__": 564 print "Quick self test" 565 import os 566 from StringIO import StringIO 567
568 - def compare_record(old, new):
569 if old.id != new.id and old.name != new.name: 570 raise ValueError("'%s' or '%s' vs '%s' or '%s' records" \ 571 % (old.id, old.name, new.id, new.name)) 572 if len(old.seq) != len(new.seq): 573 raise ValueError("%i vs %i" % (len(old.seq), len(new.seq))) 574 if str(old.seq).upper() != str(new.seq).upper(): 575 if len(old.seq) < 200: 576 raise ValueError("'%s' vs '%s'" % (old.seq, new.seq)) 577 else: 578 raise ValueError("'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100])) 579 if old.features and new.features: 580 return compare_features(old.features, new.features) 581 #Just insist on at least one word in common: 582 if (old.description or new.description) \ 583 and not set(old.description.split()).intersection(new.description.split()): 584 raise ValueError("%s versus %s" \ 585 % (repr(old.description), repr(new.description))) 586 #TODO - check annotation 587 if "contig" in old.annotations: 588 assert old.annotations["contig"] == \ 589 new.annotations["contig"] 590 return True
591
592 - def compare_records(old_list, new_list):
593 """Check two lists of SeqRecords agree, raises a ValueError if mismatch.""" 594 if len(old_list) != len(new_list): 595 raise ValueError("%i vs %i records" % (len(old_list), len(new_list))) 596 for old, new in zip(old_list, new_list): 597 if not compare_record(old,new): 598 return False 599 return True
600
601 - def compare_feature(old, new, ignore_sub_features=False):
602 """Check two SeqFeatures agree.""" 603 if old.type != new.type: 604 raise ValueError("Type %s versus %s" % (old.type, new.type)) 605 if old.location.nofuzzy_start != new.location.nofuzzy_start \ 606 or old.location.nofuzzy_end != new.location.nofuzzy_end: 607 raise ValueError("%s versus %s:\n%s\nvs:\n%s" \ 608 % (old.location, new.location, str(old), str(new))) 609 if old.strand != new.strand: 610 raise ValueError("Different strand:\n%s\nvs:\n%s" % (str(old), str(new))) 611 if old.location.start != new.location.start: 612 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" \ 613 % (old.location.start, new.location.start, str(old), str(new))) 614 if old.location.end != new.location.end: 615 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" \ 616 % (old.location.end, new.location.end, str(old), str(new))) 617 if not ignore_sub_features: 618 if len(old.sub_features) != len(new.sub_features): 619 raise ValueError("Different sub features") 620 for a,b in zip(old.sub_features, new.sub_features): 621 if not compare_feature(a,b): 622 return False 623 #This only checks key shared qualifiers 624 #Would a white list be easier? 625 #for key in ["name","gene","translation","codon_table","codon_start","locus_tag"]: 626 for key in set(old.qualifiers.keys()).intersection(new.qualifiers.keys()): 627 if key in ["db_xref","protein_id","product","note"]: 628 #EMBL and GenBank files are use different references/notes/etc 629 continue 630 if old.qualifiers[key] != new.qualifiers[key]: 631 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" \ 632 % (key, old.qualifiers[key], new.qualifiers[key])) 633 return True
634
635 - def compare_features(old_list, new_list, ignore_sub_features=False):
636 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch.""" 637 if len(old_list) != len(new_list): 638 raise ValueError("%i vs %i features" % (len(old_list), len(new_list))) 639 for old, new in zip(old_list, new_list): 640 #This assumes they are in the same order 641 if not compare_feature(old,new,ignore_sub_features): 642 return False 643 return True
644
645 - def check_genbank_writer(records):
646 handle = StringIO() 647 GenBankWriter(handle).write_file(records) 648 handle.seek(0) 649 650 records2 = list(GenBankIterator(handle)) 651 assert compare_records(records, records2)
652 653 for filename in os.listdir("../../Tests/GenBank"): 654 if not filename.endswith(".gbk") and not filename.endswith(".gb"): 655 continue 656 print filename 657 658 handle = open("../../Tests/GenBank/%s" % filename) 659 records = list(GenBankIterator(handle)) 660 handle.close() 661 662 check_genbank_writer(records) 663 664 for filename in os.listdir("../../Tests/EMBL"): 665 if not filename.endswith(".embl"): 666 continue 667 print filename 668 669 handle = open("../../Tests/EMBL/%s" % filename) 670 records = list(EmblIterator(handle)) 671 handle.close() 672 673 check_genbank_writer(records) 674 675 from Bio import SeqIO 676 for filename in os.listdir("../../Tests/SwissProt"): 677 if not filename.startswith("sp"): 678 continue 679 print filename 680 681 handle = open("../../Tests/SwissProt/%s" % filename) 682 records = list(SeqIO.parse(handle,"swiss")) 683 handle.close() 684 685 check_genbank_writer(records) 686