Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
  19  much closer representation to the raw file contents than the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  ErrorFeatureParser    Catch errors caused during parsing. 
  25  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  26  RecordParser          Parse GenBank data into a Record object. 
  27  NCBIDictionary        Access GenBank using a dictionary interface (DEPRECATED). 
  28   
  29  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  30                        some helpful functions that are in common between 
  31                        consumers. 
  32  _FeatureConsumer      Create SeqFeature objects from info generated by 
  33                        the Scanner 
  34  _RecordConsumer       Create a GenBank record object from Scanner info. 
  35  _PrintingConsumer     A debugging consumer. 
  36   
  37  ParserFailureError    Exception indicating a failure in the parser (ie. 
  38                        scanner or consumer) 
  39  LocationParserError   Exception indicating a problem with the spark based 
  40                        location parser. 
  41   
  42  Functions: 
  43  search_for            Do a query against GenBank (DEPRECATED). 
  44  download_many         Download many GenBank records (DEPRECATED). 
  45   
  46  17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  47  These are GenBank files that summarize the content of a project, and provide lists of 
  48  scaffold and contig files in the project. These will be in annotations['wgs'] and 
  49  annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  50  http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  51   
  52  http://is.gd/nNgk 
  53  for more details of this format, and an example. 
  54  Added by Ying Huang & Iddo Friedberg 
  55  """ 
  56  import cStringIO 
  57   
  58  # other Biopython stuff 
  59  from Bio import SeqFeature 
  60  from Bio.ParserSupport import AbstractConsumer 
  61  from Bio import Entrez 
  62   
  63  # other Bio.GenBank stuff 
  64  import LocationParser 
  65  from utils import FeatureValueCleaner 
  66  from Scanner import GenBankScanner 
  67   
# Constants used to parse GenBank header lines: the keyword field occupies
# the first GENBANK_INDENT columns, so continuation lines are expected to
# begin with GENBANK_SPACER (12 spaces).
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# Constants for parsing GenBank feature table lines: feature keys are
# indented FEATURE_KEY_INDENT columns and qualifiers/location continuations
# are indented FEATURE_QUALIFIER_INDENT columns.
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
  77   
class Iterator:
    """Step through a file of GenBank entries, returning one at a time.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser supplied: collect raw lines up to (and including)
        # the "//" record terminator and hand back the text unchanged.
        record_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # Premature end of file (or clean EOF before a record)
                return None
            record_lines.append(line)
            if line.rstrip() == "//":
                return "".join(record_lines)

    def __iter__(self):
        return iter(self.next, None)
112
class ParserFailureError(Exception):
    """Raised when the parser (scanner or consumer) hits some kind of problem.
    """
    pass
117
class LocationParserError(Exception):
    """Raised when a location string from a GenBank file cannot be parsed.
    """
    pass
122
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects (a SeqRecord).
    """
    # Sentinel default so each parser instance gets its OWN cleaner.
    # The previous default argument (feature_cleaner=FeatureValueCleaner())
    # was evaluated once at class definition time, so the same mutable
    # cleaner object was silently shared by every FeatureParser instance.
    # Passing feature_cleaner=None explicitly still disables cleaning.
    _DEFAULT_CLEANER = object()

    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = _DEFAULT_CLEANER):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default; pass None to disable cleaning entirely.
        """
        if feature_cleaner is FeatureParser._DEFAULT_CLEANER:
            # Build a fresh cleaner per instance (see _DEFAULT_CLEANER note)
            feature_cleaner = FeatureValueCleaner()
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning the SeqRecord built.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
153
class RecordParser:
    """Parse GenBank files into GenBank-specific Record objects.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank Record object.
        """
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
174
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just collects the string-mangling helpers that most GenBank
    consumers need, so the concrete consumers don't each reimplement them.
    """
    # Special keys in GenBank records that we should remove spaces from.
    # For instance, /translation values are protein sequences wrapped over
    # lines and should have spaces and newlines removed from them.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a semicolon-delimited KEYWORDS string into a clean list.
        """
        # Missing data is "" or "."; also strip a single trailing period.
        if keyword_string in ("", "."):
            keywords = ""
        elif keyword_string.endswith("."):
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        return [part.strip() for part in keywords.split(";")]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list.
        """
        # Line feeds become spaces; EMBL style uses ';' as the separator.
        normalized = accession_string.replace("\n", " ").replace(";", " ")
        # str.split() with no argument drops empty/whitespace-only tokens.
        return normalized.split()

    def _split_taxonomy(self, taxonomy_string):
        """Split a (possibly multi-line) taxonomy lineage string into a list.
        """
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []
        # Strip a single trailing period if present.
        if taxonomy_string.endswith("."):
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        # Split on ';', then break any entries spanning line breaks.
        raw_items = []
        for item in tax_info.split(";"):
            raw_items.extend(item.split("\n"))
        # Drop exactly-empty entries, then strip the survivors.
        return [item.strip() for item in raw_items if item != ""]

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.  Splitting on whitespace and
        rejoining avoids importing the string module (see Bug 2684).
        """
        return "".join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newline characters (\\n, \\r) from the passed text.
        """
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces.
        """
        # Note: only the space character is collapsed, not tabs/newlines.
        return " ".join(part for part in text.split(" ") if part)

    def _remove_spaces(self, text):
        """Remove all space characters from the passed text.
        """
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a one-based inclusive range to python coordinates.

        In GenBank, starts and ends are "biological" coordinates, where 1
        is the first base and [i, j] includes both i and j.  In python, 0
        is the first base and [i, j] includes i but not j -- so only the
        start needs to move back by one.
        """
        return start - 1, end
286
287 -class _FeatureConsumer(_BaseGenBankConsumer):
288 """Create a SeqRecord object with Features to return. 289 290 Attributes: 291 o use_fuzziness - specify whether or not to parse with fuzziness in 292 feature locations. 293 o feature_cleaner - a class that will be used to provide specialized 294 cleaning-up of feature values. 295 """
296 - def __init__(self, use_fuzziness, feature_cleaner = None):
297 from Bio.SeqRecord import SeqRecord 298 _BaseGenBankConsumer.__init__(self) 299 self.data = SeqRecord(None, id = None) 300 self.data.id = None 301 self.data.description = "" 302 303 self._use_fuzziness = use_fuzziness 304 self._feature_cleaner = feature_cleaner 305 306 self._seq_type = '' 307 self._seq_data = [] 308 self._cur_reference = None 309 self._cur_feature = None 310 self._cur_qualifier_key = None 311 self._cur_qualifier_value = None 312 self._expected_size = None
313
    def locus(self, locus_name):
        """Record the LOCUS name as the name of the sequence record."""
        self.data.name = locus_name
318
    def size(self, content):
        """Record the declared sequence length (raises ValueError if not an int)."""
        self._expected_size = int(content)
322
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.

        Note: the parameter name shadows the builtin ``type`` but is kept
        unchanged for interface compatibility.
        """
        self._seq_type = type
327
    def data_file_division(self, division):
        """Store the data file division code from the header as an annotation."""
        self.data.annotations['data_file_division'] = division
330
    def date(self, submit_date):
        """Store the date from the header as an annotation."""
        self.data.annotations['date'] = submit_date
333
334 - def definition(self, definition):
335 """Set the definition as the description of the sequence. 336 """ 337 if self.data.description: 338 #Append to any existing description 339 #e.g. EMBL files with two DE lines. 340 self.data.description += " " + definition 341 else: 342 self.data.description = definition
343
344 - def accession(self, acc_num):
345 """Set the accession number as the id of the sequence. 346 347 If we have multiple accession numbers, the first one passed is 348 used. 349 """ 350 new_acc_nums = self._split_accessions(acc_num) 351 352 #Also record them ALL in the annotations 353 try: 354 #On the off chance there was more than one accession line: 355 for acc in new_acc_nums: 356 #Prevent repeat entries 357 if acc not in self.data.annotations['accessions']: 358 self.data.annotations['accessions'].append(acc) 359 except KeyError: 360 self.data.annotations['accessions'] = new_acc_nums 361 362 # if we haven't set the id information yet, add the first acc num 363 if self.data.id is None: 364 if len(new_acc_nums) > 0: 365 #self.data.id = new_acc_nums[0] 366 #Use the FIRST accession as the ID, not the first on this line! 367 self.data.id = self.data.annotations['accessions'][0]
368
    def wgs(self, content):
        """Store the WGS line (split on '-') in the annotations."""
        self.data.annotations['wgs'] = content.split('-')
371
    def add_wgs_scafld(self, content):
        """Append a WGS_SCAFLD line (split on '-') to the annotations list."""
        self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
374
    def nid(self, content):
        """Store the NID line as an annotation."""
        self.data.annotations['nid'] = content
377
    def pid(self, content):
        """Store the PID line as an annotation."""
        self.data.annotations['pid'] = content
380
381 - def version(self, version_id):
382 #Want to use the versioned accession as the record.id 383 #This comes from the VERSION line in GenBank files, or the 384 #obsolete SV line in EMBL. For the new EMBL files we need 385 #both the version suffix from the ID line and the accession 386 #from the AC line. 387 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 388 self.accession(version_id.split(".")[0]) 389 self.version_suffix(version_id.split(".")[1]) 390 else: 391 #For backwards compatibility... 392 self.data.id = version_id
393
    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.
        PROJECT     GenomeProject:28471

        or:
        PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        # Normalize the old prefix so dbxrefs use the planned "Project:" form.
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend([p for p in content.split() if p])
    def version_suffix(self, version):
        """Store the numeric version (e.g. the 1 in U49845.1) as an annotation.

        e.g. GenBank line:
        VERSION     U49845.1  GI:1293613
        or the obsolete EMBL line:
        SV   U49845.1
        Scanner calls consumer.version("U49845.1")
        which then calls consumer.version_suffix(1)

        e.g. EMBL new line:
        ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        Scanner calls consumer.version_suffix(1)
        """
        # NOTE(review): assert is stripped under python -O; the scanner is
        # expected to only pass digit strings here.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
456
    def db_source(self, content):
        """Store the DBSOURCE line, minus trailing whitespace."""
        self.data.annotations['db_source'] = content.rstrip()
459
    def gi(self, content):
        """Store the GI number as an annotation."""
        self.data.annotations['gi'] = content
462
    def keywords(self, content):
        """Split the KEYWORDS line and store the list as an annotation."""
        self.data.annotations['keywords'] = self._split_keywords(content)
465
    def segment(self, content):
        """Store the SEGMENT line as an annotation."""
        self.data.annotations['segment'] = content
468
469 - def source(self, content):
470 #Note that some software (e.g. VectorNTI) may produce an empty 471 #source (rather than using a dot/period as might be expected). 472 if content == "": 473 source_info = "" 474 elif content[-1] == '.': 475 source_info = content[:-1] 476 else: 477 source_info = content 478 self.data.annotations['source'] = source_info
479
    def organism(self, content):
        """Store the ORGANISM line as an annotation."""
        self.data.annotations['organism'] = content
482
483 - def taxonomy(self, content):
484 """Records (another line of) the taxonomy lineage. 485 """ 486 lineage = self._split_taxonomy(content) 487 try: 488 self.data.annotations['taxonomy'].extend(lineage) 489 except KeyError: 490 self.data.annotations['taxonomy'] = lineage
491
492 - def reference_num(self, content):
493 """Signal the beginning of a new reference object. 494 """ 495 # if we have a current reference that hasn't been added to 496 # the list of references, add it. 497 if self._cur_reference is not None: 498 self.data.annotations['references'].append(self._cur_reference) 499 else: 500 self.data.annotations['references'] = [] 501 502 self._cur_reference = SeqFeature.Reference()
503
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)

        Raises ValueError if the base information cannot be parsed.
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
            ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations
544
545 - def _split_reference_locations(self, location_string):
546 """Get reference locations out of a string of reference information 547 548 The passed string should be of the form: 549 550 1 to 20; 20 to 100 551 552 This splits the information out and returns a list of location objects 553 based on the reference locations. 554 """ 555 # split possibly multiple locations using the ';' 556 all_base_info = location_string.split(';') 557 558 new_locations = [] 559 for base_info in all_base_info: 560 start, end = base_info.split('to') 561 new_start, new_end = \ 562 self._convert_to_python_numbers(int(start.strip()), 563 int(end.strip())) 564 this_location = SeqFeature.FeatureLocation(new_start, new_end) 565 new_locations.append(this_location) 566 return new_locations
567
568 - def authors(self, content):
569 if self._cur_reference.authors: 570 self._cur_reference.authors += ' ' + content 571 else: 572 self._cur_reference.authors = content
573
574 - def consrtm(self, content):
575 if self._cur_reference.consrtm: 576 self._cur_reference.consrtm += ' ' + content 577 else: 578 self._cur_reference.consrtm = content
579
580 - def title(self, content):
581 if self._cur_reference is None: 582 import warnings 583 warnings.warn("GenBank TITLE line without REFERENCE line.") 584 elif self._cur_reference.title: 585 self._cur_reference.title += ' ' + content 586 else: 587 self._cur_reference.title = content
588
589 - def journal(self, content):
590 if self._cur_reference.journal: 591 self._cur_reference.journal += ' ' + content 592 else: 593 self._cur_reference.journal = content
594
    def medline_id(self, content):
        """Store the MEDLINE id on the current reference."""
        self._cur_reference.medline_id = content
597
    def pubmed_id(self, content):
        """Store the PUBMED id on the current reference."""
        self._cur_reference.pubmed_id = content
600
601 - def remark(self, content):
602 """Deal with a reference comment.""" 603 if self._cur_reference.comment: 604 self._cur_reference.comment += ' ' + content 605 else: 606 self._cur_reference.comment = content
607
608 - def comment(self, content):
609 try: 610 self.data.annotations['comment'] += "\n" + "\n".join(content) 611 except KeyError: 612 self.data.annotations['comment'] = "\n".join(content)
613
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.

        The line's content itself is ignored; this just delegates to
        start_feature_table.
        """
        self.start_feature_table()
618
    def start_feature_table(self):
        """Indicate we've got to the start of the feature table.

        Any reference still being built is finished off and stored first.
        """
        # make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
        self._cur_reference = None
626
627 - def _add_feature(self):
628 """Utility function to add a feature to the SeqRecord. 629 630 This does all of the appropriate checking to make sure we haven't 631 left any info behind, and that we are only adding info if it 632 exists. 633 """ 634 if self._cur_feature: 635 # if we have a left over qualifier, add it to the qualifiers 636 # on the current feature 637 self._add_qualifier() 638 639 self._cur_qualifier_key = '' 640 self._cur_qualifier_value = '' 641 self.data.features.append(self._cur_feature)
642
643 - def feature_key(self, content):
644 # if we already have a feature, add it on 645 self._add_feature() 646 647 # start a new feature 648 self._cur_feature = SeqFeature.SeqFeature() 649 self._cur_feature.type = content 650 651 # assume positive strand to start with if we have DNA or cDNA 652 # (labelled as mRNA). The complement in the location will 653 # change this later if something is on the reverse strand 654 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 655 self._cur_feature.strand = 1
656
    def location(self, content):
        """Parse out location information from the location string.

        This uses a comprehensive but slow spark based parser to do the
        parsing, and then translates the results of the parse into appropriate
        Location objects.

        Raises LocationParserError when the spark parser fails.
        """
        # --- first preprocess the location for the spark parser

        # we need to clean up newlines and other whitespace inside
        # the location before feeding it to the parser.
        # locations should have no whitespace whatsoever based on the
        # grammar
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        # feed everything into the scanner and parser
        try:
            parse_info = \
                LocationParser.parse(LocationParser.scan(location_line))
        # spark raises SystemExit errors when parsing fails
        except SystemExit:
            raise LocationParserError(location_line)

        # add the parser information to the current feature
        self._set_location_info(parse_info, self._cur_feature)
693
694 - def _set_function(self, function, cur_feature):
695 """Set the location information based on a function. 696 697 This handles all of the location functions like 'join', 'complement' 698 and 'order'. 699 700 Arguments: 701 o function - A LocationParser.Function object specifying the 702 function we are acting on. 703 o cur_feature - The feature to add information to. 704 """ 705 assert isinstance(function, LocationParser.Function), \ 706 "Expected a Function object, got %s" % function 707 708 if function.name == "complement": 709 # mark the current feature as being on the opposite strand 710 cur_feature.strand = -1 711 # recursively deal with whatever is left inside the complement 712 for inner_info in function.args: 713 self._set_location_info(inner_info, cur_feature) 714 # deal with functions that have multipe internal segments that 715 # are connected somehow. 716 # join and order are current documented functions. 717 # one-of is something I ran across in old files. Treating it 718 # as a sub sequence feature seems appropriate to me. 719 # bond is some piece of junk I found in RefSeq files. I have 720 # no idea how to interpret it, so I jam it in here 721 elif (function.name == "join" or function.name == "order" or 722 function.name == "one-of" or function.name == "bond"): 723 self._set_ordering_info(function, cur_feature) 724 elif (function.name == "gap"): 725 assert len(function.args) == 1, \ 726 "Unexpected number of arguments in gap %s" % function.args 727 # make the cur information location a gap object 728 position = self._get_position(function.args[0].local_location) 729 cur_feature.location = SeqFeature.PositionGap(position) 730 else: 731 raise ValueError("Unexpected function name: %s" % function.name)
732
    def _set_ordering_info(self, function, cur_feature):
        """Parse a join or order and all of the information in it.

        This deals with functions that order a bunch of locations,
        specifically 'join' and 'order'. The inner locations are
        added as subfeatures of the top level feature.
        """
        # for each inner element, create a sub SeqFeature within the
        # current feature, then get the information for this feature
        cur_feature.location_operator = function.name
        for inner_element in function.args:
            new_sub_feature = SeqFeature.SeqFeature()
            # inherit the type from the parent
            new_sub_feature.type = cur_feature.type
            # add the join or order info to the location_operator
            new_sub_feature.location_operator = function.name
            # inherit references and strand from the parent feature
            new_sub_feature.ref = cur_feature.ref
            new_sub_feature.ref_db = cur_feature.ref_db
            new_sub_feature.strand = cur_feature.strand

            # set the information for the inner element
            self._set_location_info(inner_element, new_sub_feature)

            # now add the feature to the sub_features
            cur_feature.sub_features.append(new_sub_feature)

        # set the location of the top -- this should be a combination of
        # the start position of the first sub_feature and the end position
        # of the last sub_feature

        # these positions are already converted to python coordinates
        # (when the sub_features were added) so they don't need to
        # be converted again
        feature_start = cur_feature.sub_features[0].location.start
        feature_end = cur_feature.sub_features[-1].location.end
        cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                          feature_end)
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        strands = set(sf.strand for sf in cur_feature.sub_features)
        if len(strands)==1:
            cur_feature.strand = cur_feature.sub_features[0].strand
        else:
            cur_feature.strand = None # i.e. mixed strands
782
783 - def _set_location_info(self, parse_info, cur_feature):
784 """Set the location information for a feature from the parse info. 785 786 Arguments: 787 o parse_info - The classes generated by the LocationParser. 788 o cur_feature - The feature to add the information to. 789 """ 790 # base case -- we are out of information 791 if parse_info is None: 792 return 793 # parse a location -- this is another base_case -- we assume 794 # we have no information after a single location 795 elif isinstance(parse_info, LocationParser.AbsoluteLocation): 796 self._set_location(parse_info, cur_feature) 797 return 798 # parse any of the functions (join, complement, etc) 799 elif isinstance(parse_info, LocationParser.Function): 800 self._set_function(parse_info, cur_feature) 801 # otherwise we are stuck and should raise an error 802 else: 803 raise ValueError("Could not parse location info: %s" 804 % parse_info)
805
    def _set_location(self, location, cur_feature):
        """Set the location information for a feature.

        Arguments:
        o location - An AbsoluteLocation object specifying the info
        about the location.
        o cur_feature - The feature to add the information to.
        """
        # check to see if we have a cross reference to another accession
        # ie. U05344.1:514..741
        if location.path is not None:
            cur_feature.ref = location.path.accession
            cur_feature.ref_db = location.path.database
        # now get the actual location information
        cur_feature.location = self._get_location(location.local_location)
821
    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        if isinstance(range_info, LocationParser.Between) \
            and range_info.low.val+1 == range_info.high.val:
            #A between location like "67^68" (one based counting) is a
            #special case (note it has zero length). In python slice
            #notation this is 67:67, a zero length slice.  See Bug 2622
            pos = self._get_position(range_info.low)
            return SeqFeature.FeatureLocation(pos, pos)
            #NOTE - We can imagine between locations like "2^4", but this
            #is just "3".  Similarly, "2^5" is just "3..4"
        # check if we just have a single base
        elif not(isinstance(range_info, LocationParser.Range)):
            #A single base like "785" becomes [784:785] in python
            s_pos = self._get_position(range_info)
            # move the single position back one to be consistent with how
            # python indexes numbers (starting at 0)
            s_pos.position = s_pos.position - 1
            e_pos = self._get_position(range_info)
            return SeqFeature.FeatureLocation(s_pos, e_pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)
            #If the start location is a one-of position, we also need to
            #adjust their positions to use python counting.
            if isinstance(start_pos, SeqFeature.OneOfPosition):
                for p in start_pos.position_choices:
                    p.position -= 1

            return SeqFeature.FeatureLocation(start_pos, end_pos)
867
868 - def _get_position(self, position):
869 """Return a (possibly fuzzy) position for a single coordinate. 870 871 Arguments: 872 o position - This is a LocationParser.* object that specifies 873 a single coordinate. We will examine the object to determine 874 the fuzziness of the position. 875 876 This is used with _get_location to parse out a location of any 877 end_point of arbitrary fuzziness. 878 """ 879 # case 1 -- just a normal number 880 if (isinstance(position, LocationParser.Integer)): 881 final_pos = SeqFeature.ExactPosition(position.val) 882 # case 2 -- we've got a > sign 883 elif isinstance(position, LocationParser.LowBound): 884 final_pos = SeqFeature.AfterPosition(position.base.val) 885 # case 3 -- we've got a < sign 886 elif isinstance(position, LocationParser.HighBound): 887 final_pos = SeqFeature.BeforePosition(position.base.val) 888 # case 4 -- we've got 100^101 889 # Is the extension is zero in this example? 890 elif isinstance(position, LocationParser.Between): 891 #NOTE - We don't *expect* this code to get called! 892 #We only except between locations like 3^4 (consecutive) 893 #which are handled in _get_location. We don't expect 894 #non consecutive variants like "2^5" as this is just "3..4". 895 #Similarly there is no reason to expect composite locations 896 #like "(3^4)..6" which should just be "4..6". 897 final_pos = SeqFeature.BetweenPosition(position.low.val, 898 position.high.val-position.low.val) 899 # case 5 -- we've got (100.101) 900 elif isinstance(position, LocationParser.TwoBound): 901 final_pos = SeqFeature.WithinPosition(position.low.val, 902 position.high.val-position.low.val) 903 # case 6 -- we've got a one-of(100, 110) location 904 elif isinstance(position, LocationParser.Function) and \ 905 position.name == "one-of": 906 # first convert all of the arguments to positions 907 position_choices = [] 908 for arg in position.args: 909 # we only handle AbsoluteLocations with no path 910 # right now. 
Not sure if other cases will pop up 911 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 912 "Unhandled Location type %r" % arg 913 assert arg.path is None, "Unhandled path in location" 914 position = self._get_position(arg.local_location) 915 position_choices.append(position) 916 final_pos = SeqFeature.OneOfPosition(position_choices) 917 # if it is none of these cases we've got a problem! 918 else: 919 raise ValueError("Unexpected LocationParser object %r" % 920 position) 921 922 # if we are using fuzziness return what we've got 923 if self._use_fuzziness: 924 return final_pos 925 # otherwise return an ExactPosition equivalent 926 else: 927 return SeqFeature.ExactPosition(final_pos.location)
928
929 - def _add_qualifier(self):
930 """Add a qualifier to the current feature without loss of info. 931 932 If there are multiple qualifier keys with the same name we 933 would lose some info in the dictionary, so we append a unique 934 number to the end of the name in case of conflicts. 935 """ 936 # if we've got a key from before, add it to the dictionary of 937 # qualifiers 938 if self._cur_qualifier_key: 939 key = self._cur_qualifier_key 940 value = "".join(self._cur_qualifier_value) 941 if self._feature_cleaner is not None: 942 value = self._feature_cleaner.clean_value(key, value) 943 # if the qualifier name exists, append the value 944 if key in self._cur_feature.qualifiers: 945 self._cur_feature.qualifiers[key].append(value) 946 # otherwise start a new list of the key with its values 947 else: 948 self._cur_feature.qualifiers[key] = [value]
949
950 - def feature_qualifier_name(self, content_list):
951 """When we get a qualifier key, use it as a dictionary key. 952 953 We receive a list of keys, since you can have valueless keys such as 954 /pseudo which would be passed in with the next key (since no other 955 tags separate them in the file) 956 """ 957 for content in content_list: 958 # add a qualifier if we've got one 959 self._add_qualifier() 960 961 # remove the / and = from the qualifier if they're present 962 qual_key = content.replace('/', '') 963 qual_key = qual_key.replace('=', '') 964 qual_key = qual_key.strip() 965 966 self._cur_qualifier_key = qual_key 967 self._cur_qualifier_value = []
968
969 - def feature_qualifier_description(self, content):
970 # get rid of the quotes surrounding the qualifier if we've got 'em 971 qual_value = content.replace('"', '') 972 973 self._cur_qualifier_value.append(qual_value)
974
975 - def contig_location(self, content):
976 """Deal with CONTIG information.""" 977 #Historically this was stored as a SeqFeature object, but it was 978 #stored under record.annotations["contig"] and not under 979 #record.features with the other SeqFeature objects. 980 # 981 #The CONTIG location line can include additional tokens like 982 #Gap(), Gap(100) or Gap(unk100) which are not used in the feature 983 #location lines, so storing it using SeqFeature based location 984 #objects is difficult. 985 # 986 #We now store this a string, which means for BioSQL we are now in 987 #much better agreement with how BioPerl records the CONTIG line 988 #in the database. 989 # 990 #NOTE - This code assumes the scanner will return all the CONTIG 991 #lines already combined into one long string! 992 self.data.annotations["contig"] = content
993
994 - def origin_name(self, content):
995 pass
996
997 - def base_count(self, content):
998 pass
999
1000 - def base_number(self, content):
1001 pass
1002
1003 - def sequence(self, content):
1004 """Add up sequence information as we get it. 1005 1006 To try and make things speedier, this puts all of the strings 1007 into a list of strings, and then uses string.join later to put 1008 them together. Supposedly, this is a big time savings 1009 """ 1010 new_seq = content.replace(' ', '') 1011 new_seq = new_seq.upper() 1012 1013 self._seq_data.append(new_seq)
1014
1015 - def record_end(self, content):
1016 """Clean up when we've finished the record. 1017 """ 1018 from Bio import Alphabet 1019 from Bio.Alphabet import IUPAC 1020 from Bio.Seq import Seq, UnknownSeq 1021 1022 #Try and append the version number to the accession for the full id 1023 if self.data.id is None: 1024 assert 'accessions' not in self.data.annotations, \ 1025 self.data.annotations['accessions'] 1026 self.data.id = self.data.name #Good fall back? 1027 elif self.data.id.count('.') == 0: 1028 try: 1029 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1030 except KeyError: 1031 pass 1032 1033 # add the last feature in the table which hasn't been added yet 1034 self._add_feature() 1035 1036 # add the sequence information 1037 # first, determine the alphabet 1038 # we default to an generic alphabet if we don't have a 1039 # seq type or have strange sequence information. 1040 seq_alphabet = Alphabet.generic_alphabet 1041 1042 # now set the sequence 1043 sequence = "".join(self._seq_data) 1044 1045 if self._expected_size is not None \ 1046 and len(sequence) != 0 \ 1047 and self._expected_size != len(sequence): 1048 raise ValueError("Expected sequence length %i, found %i." \ 1049 % (self._expected_size, len(sequence))) 1050 1051 if self._seq_type: 1052 # mRNA is really also DNA, since it is actually cDNA 1053 if self._seq_type.find('DNA') != -1 or \ 1054 self._seq_type.find('mRNA') != -1: 1055 seq_alphabet = IUPAC.ambiguous_dna 1056 # are there ever really RNA sequences in GenBank? 1057 elif self._seq_type.find('RNA') != -1: 1058 #Even for data which was from RNA, the sequence string 1059 #is usually given as DNA (T not U). Bug 2408 1060 if "T" in sequence and "U" not in sequence: 1061 seq_alphabet = IUPAC.ambiguous_dna 1062 else: 1063 seq_alphabet = IUPAC.ambiguous_rna 1064 elif self._seq_type.find('PROTEIN') != -1: 1065 seq_alphabet = IUPAC.protein # or extended protein? 
1066 # work around ugly GenBank records which have circular or 1067 # linear but no indication of sequence type 1068 elif self._seq_type in ["circular", "linear"]: 1069 pass 1070 # we have a bug if we get here 1071 else: 1072 raise ValueError("Could not determine alphabet for seq_type %s" 1073 % self._seq_type) 1074 1075 if not sequence and self.__expected_size: 1076 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1077 else: 1078 self.data.seq = Seq(sequence, seq_alphabet)
1079
class _RecordConsumer(_BaseGenBankConsumer):
    """Build a GenBank Record object from scanner generated information.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        import Record
        self.data = Record.Record()

        # chunks of sequence text, joined into one string at record_end
        self._seq_data = []
        # items currently being built and not yet attached to self.data
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1092
1093 - def wgs(self, content):
1094 self.data.wgs = content.split('-')
1095
1096 - def add_wgs_scafld(self, content):
1097 self.data.wgs_scafld.append(content.split('-'))
1098
1099 - def locus(self, content):
1100 self.data.locus = content
1101
1102 - def size(self, content):
1103 self.data.size = content
1104
1105 - def residue_type(self, content):
1106 self.data.residue_type = content
1107
1108 - def data_file_division(self, content):
1109 self.data.data_file_division = content
1110
1111 - def date(self, content):
1112 self.data.date = content
1113
1114 - def definition(self, content):
1115 self.data.definition = content
1116
1117 - def accession(self, content):
1118 for acc in self._split_accessions(content): 1119 if acc not in self.data.accession: 1120 self.data.accession.append(acc)
1121
1122 - def nid(self, content):
1123 self.data.nid = content
1124
1125 - def pid(self, content):
1126 self.data.pid = content
1127
1128 - def version(self, content):
1129 self.data.version = content
1130
1131 - def db_source(self, content):
1132 self.data.db_source = content.rstrip()
1133
1134 - def gi(self, content):
1135 self.data.gi = content
1136
1137 - def keywords(self, content):
1138 self.data.keywords = self._split_keywords(content)
1139
1140 - def project(self, content):
1141 self.data.projects.extend([p for p in content.split() if p])
1142 1145
1146 - def segment(self, content):
1147 self.data.segment = content
1148
1149 - def source(self, content):
1150 self.data.source = content
1151
1152 - def organism(self, content):
1153 self.data.organism = content
1154
1155 - def taxonomy(self, content):
1156 self.data.taxonomy = self._split_taxonomy(content)
1157
1158 - def reference_num(self, content):
1159 """Grab the reference number and signal the start of a new reference. 1160 """ 1161 # check if we have a reference to add 1162 if self._cur_reference is not None: 1163 self.data.references.append(self._cur_reference) 1164 1165 self._cur_reference = Record.Reference() 1166 self._cur_reference.number = content
1167
1168 - def reference_bases(self, content):
1169 self._cur_reference.bases = content
1170
1171 - def authors(self, content):
1172 self._cur_reference.authors = content
1173
1174 - def consrtm(self, content):
1175 self._cur_reference.consrtm = content
1176
1177 - def title(self, content):
1178 if self._cur_reference is None: 1179 import warnings 1180 warnings.warn("GenBank TITLE line without REFERENCE line.") 1181 return 1182 self._cur_reference.title = content
1183
1184 - def journal(self, content):
1185 self._cur_reference.journal = content
1186
1187 - def medline_id(self, content):
1188 self._cur_reference.medline_id = content
1189
1190 - def pubmed_id(self, content):
1191 self._cur_reference.pubmed_id = content
1192
1193 - def remark(self, content):
1194 self._cur_reference.remark = content
1195
1196 - def comment(self, content):
1197 self.data.comment += "\n".join(content)
1198
1199 - def primary_ref_line(self,content):
1200 """Data for the PRIMARY line""" 1201 self.data.primary.append(content)
1202
1203 - def primary(self,content):
1204 pass
1205
1206 - def features_line(self, content):
1207 """Get ready for the feature table when we reach the FEATURE line. 1208 """ 1209 self.start_feature_table()
1210
1211 - def start_feature_table(self):
1212 """Signal the start of the feature table. 1213 """ 1214 # we need to add on the last reference 1215 if self._cur_reference is not None: 1216 self.data.references.append(self._cur_reference)
1217
1218 - def feature_key(self, content):
1219 """Grab the key of the feature and signal the start of a new feature. 1220 """ 1221 # first add on feature information if we've got any 1222 self._add_feature() 1223 1224 self._cur_feature = Record.Feature() 1225 self._cur_feature.key = content
1226
1227 - def _add_feature(self):
1228 """Utility function to add a feature to the Record. 1229 1230 This does all of the appropriate checking to make sure we haven't 1231 left any info behind, and that we are only adding info if it 1232 exists. 1233 """ 1234 if self._cur_feature is not None: 1235 # if we have a left over qualifier, add it to the qualifiers 1236 # on the current feature 1237 if self._cur_qualifier is not None: 1238 self._cur_feature.qualifiers.append(self._cur_qualifier) 1239 1240 self._cur_qualifier = None 1241 self.data.features.append(self._cur_feature)
1242
1243 - def location(self, content):
1244 self._cur_feature.location = self._clean_location(content)
1245
1246 - def feature_qualifier_name(self, content_list):
1247 """Deal with qualifier names 1248 1249 We receive a list of keys, since you can have valueless keys such as 1250 /pseudo which would be passed in with the next key (since no other 1251 tags separate them in the file) 1252 """ 1253 for content in content_list: 1254 # the record parser keeps the /s -- add them if we don't have 'em 1255 if content.find("/") != 0: 1256 content = "/%s" % content 1257 # add on a qualifier if we've got one 1258 if self._cur_qualifier is not None: 1259 self._cur_feature.qualifiers.append(self._cur_qualifier) 1260 1261 self._cur_qualifier = Record.Qualifier() 1262 self._cur_qualifier.key = content
1263
1264 - def feature_qualifier_description(self, content):
1265 # if we have info then the qualifier key should have a ='s 1266 if self._cur_qualifier.key.find("=") == -1: 1267 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1268 cur_content = self._remove_newlines(content) 1269 # remove all spaces from the value if it is a type where spaces 1270 # are not important 1271 for remove_space_key in self.__class__.remove_space_keys: 1272 if self._cur_qualifier.key.find(remove_space_key) >= 0: 1273 cur_content = self._remove_spaces(cur_content) 1274 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1275
1276 - def base_count(self, content):
1277 self.data.base_counts = content
1278
1279 - def origin_name(self, content):
1280 self.data.origin = content
1281
1282 - def contig_location(self, content):
1283 """Signal that we have contig information to add to the record. 1284 """ 1285 self.data.contig = self._clean_location(content)
1286
1287 - def sequence(self, content):
1288 """Add sequence information to a list of sequence strings. 1289 1290 This removes spaces in the data and uppercases the sequence, and 1291 then adds it to a list of sequences. Later on we'll join this 1292 list together to make the final sequence. This is faster than 1293 adding on the new string every time. 1294 """ 1295 new_seq = content.replace(' ', '') 1296 self._seq_data.append(new_seq.upper())
1297
1298 - def record_end(self, content):
1299 """Signal the end of the record and do any necessary clean-up. 1300 """ 1301 # add together all of the sequence parts to create the 1302 # final sequence string 1303 self.data.sequence = "".join(self._seq_data) 1304 # add on the last feature 1305 self._add_feature()
1306 1307
class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface (DEPRECATED).

    This object is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.
    """
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    VALID_FORMATS = ['genbank', 'fasta']

    def __init__(self, database, format, parser = None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Arguments:
        o database - 'nucleotide' or 'protein'.
        o format - 'genbank' (nucleotide genbank / protein genpept) or
        'fasta'.
        o parser - an optional parser object used to convert each entry
        into another form; without one the raw file contents are
        returned.

        Raises ValueError for an unrecognized database or format.
        """
        import warnings
        warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"
                      " removed in a future release of Biopython. Please use"
                      " Bio.Entrez instead which is described in the tutorial.",
                      DeprecationWarning)

        self.parser = parser
        valid_dbs = self.__class__.VALID_DATABASES
        valid_formats = self.__class__.VALID_FORMATS
        if database not in valid_dbs:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, valid_dbs))
        if format not in valid_formats:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, valid_formats))

        # NCBI's efetch calls the GenBank flat file format "gb"
        if format == "genbank":
            format = "gb"
        self.db = database
        self.format = format

    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj if it cannot be found."""
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        handle = Entrez.efetch(db=self.db, id=id, rettype=self.format)
        # Parse the record if a parser was passed in.
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
1388
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id=0, max_ids=50000000):
    """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Searches the given NCBI database ('nucleotide', 'protein', 'popset'
    or 'genome') with the query in search, returning the matching
    GenBank identifiers (gi numbers) as a list.

    reldate restricts the search to the given number of days before the
    current date; mindate and maxdate restrict it to a date range (e.g.
    2002/12/20). start_id is the number to begin retrieval on and
    max_ids the maximum number of ids to retrieve.
    """
    import warnings
    warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # mindate and maxdate are NCBI parameters in "YYYY/MM/DD" format
    # (and both should be supplied or neither); reldate is an NCBI
    # parameter for "within N days".
    # Date checking follows Bio/EUtils/Datatypes.py.
    import re
    date_ok = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
    for name, value in (("mindate", mindate), ("maxdate", maxdate)):
        if value is not None and date_ok(value) is None:
            raise TypeError(
                "%s is not in YYYY/MM/DD format (month and "
                "day are optional): %r" % (name, value))

    #Bio.Entrez can now ignore None arguments automatically
    handle = Entrez.esearch(database, search, retmode="xml",
                            retstart=start_id, retmax=max_ids,
                            mindate=mindate, maxdate=maxdate,
                            reldate=reldate)
    return Entrez.read(handle)["IdList"]
1435
def download_many(ids, database = 'nucleotide'):
    """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    ids is a list of gis or accessions, fetched in one batch from the
    given database ('nucleotide' or 'protein').

    Raises ValueError for any other database name.
    """
    import warnings
    warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # map the database onto the matching efetch return type
    rettypes = {'nucleotide': 'gb', 'protein': 'gp'}
    try:
        format = rettypes[database]
    except KeyError:
        raise ValueError("Unexpected database: %s" % database)
    #TODO - Batch the queries?
    result_handle = Entrez.efetch(database,
                                  id=",".join(ids),
                                  retmode="text",
                                  rettype=format)
    return cStringIO.StringIO(result_handle.read())
1463