1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
18 which turns a GenBank file into GenBank-specific Record objects. This is a
19 much closer representation to the raw file contents that the SeqRecord
20 alternative from the FeatureParser (used in Bio.SeqIO).
21
22 Classes:
23 Iterator Iterate through a file of GenBank entries
24 ErrorFeatureParser Catch errors caused during parsing.
25 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
26 RecordParser Parse GenBank data into a Record object.
27 NCBIDictionary Access GenBank using a dictionary interface (DEPRECATED).
28
29 _BaseGenBankConsumer A base class for GenBank consumer that implements
30 some helpful functions that are in common between
31 consumers.
32 _FeatureConsumer Create SeqFeature objects from info generated by
33 the Scanner
34 _RecordConsumer Create a GenBank record object from Scanner info.
35 _PrintingConsumer A debugging consumer.
36
37 ParserFailureError Exception indicating a failure in the parser (ie.
38 scanner or consumer)
39 LocationParserError Exception indiciating a problem with the spark based
40 location parser.
41
42 Functions:
43 search_for Do a query against GenBank (DEPRECATED).
44 download_many Download many GenBank records (DEPRECATED).
45
46 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
47 These are GenBank files that summarize the content of a project, and provide lists of
48 scaffold and contig files in the project. These will be in annotations['wgs'] and
49 annotations['wgs_scafld']. These GenBank files do not have sequences. See
50 http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
51
52 http://is.gd/nNgk
53 for more details of this format, and an example.
54 Added by Ying Huang & Iddo Friedberg
55 """
56 import cStringIO
57
58
59 from Bio import SeqFeature
60 from Bio.ParserSupport import AbstractConsumer
61 from Bio import Entrez
62
63
64 import LocationParser
65 from utils import FeatureValueCleaner
66 from Scanner import GenBankScanner
67
68
69 GENBANK_INDENT = 12
70 GENBANK_SPACER = " " * GENBANK_INDENT
71
72
73 FEATURE_KEY_INDENT = 5
74 FEATURE_QUALIFIER_INDENT = 21
75 FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
76 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
77
79 """Iterator interface to move over a file of GenBank entries one at a time.
80 """
81 - def __init__(self, handle, parser = None):
82 """Initialize the iterator.
83
84 Arguments:
85 o handle - A handle with GenBank entries to iterate through.
86 o parser - An optional parser to pass the entries through before
87 returning them. If None, then the raw entry will be returned.
88 """
89 self.handle = handle
90 self._parser = parser
91
93 """Return the next GenBank record from the handle.
94
95 Will return None if we ran out of records.
96 """
97 if self._parser is None:
98 lines = []
99 while True:
100 line = self.handle.readline()
101 if not line : return None
102 lines.append(line)
103 if line.rstrip() == "//" : break
104 return "".join(lines)
105 try:
106 return self._parser.parse(self.handle)
107 except StopIteration:
108 return None
109
111 return iter(self.next, None)
112
114 """Failure caused by some kind of problem in the parser.
115 """
116 pass
117
119 """Could not Properly parse out a location from a GenBank file.
120 """
121 pass
122
124 """Parse GenBank files into Seq + Feature objects.
125 """
128 """Initialize a GenBank parser and Feature consumer.
129
130 Arguments:
131 o debug_level - An optional argument that species the amount of
132 debugging information the parser should spit out. By default we have
133 no debugging info (the fastest way to do things), but if you want
134 you can set this as high as two and see exactly where a parse fails.
135 o use_fuzziness - Specify whether or not to use fuzzy representations.
136 The default is 1 (use fuzziness).
137 o feature_cleaner - A class which will be used to clean out the
138 values of features. This class must implement the function
139 clean_value. GenBank.utils has a "standard" cleaner class, which
140 is used by default.
141 """
142 self._scanner = GenBankScanner(debug_level)
143 self.use_fuzziness = use_fuzziness
144 self._cleaner = feature_cleaner
145
146 - def parse(self, handle):
147 """Parse the specified handle.
148 """
149 self._consumer = _FeatureConsumer(self.use_fuzziness,
150 self._cleaner)
151 self._scanner.feed(handle, self._consumer)
152 return self._consumer.data
153
155 """Parse GenBank files into Record objects
156 """
158 """Initialize the parser.
159
160 Arguments:
161 o debug_level - An optional argument that species the amount of
162 debugging information the parser should spit out. By default we have
163 no debugging info (the fastest way to do things), but if you want
164 you can set this as high as two and see exactly where a parse fails.
165 """
166 self._scanner = GenBankScanner(debug_level)
167
168 - def parse(self, handle):
169 """Parse the specified handle into a GenBank record.
170 """
171 self._consumer = _RecordConsumer()
172 self._scanner.feed(handle, self._consumer)
173 return self._consumer.data
174
176 """Abstract GenBank consumer providing useful general functions.
177
178 This just helps to eliminate some duplication in things that most
179 GenBank consumers want to do.
180 """
181
182
183
184
185 remove_space_keys = ["translation"]
186
189
191 """Split a string of keywords into a nice clean list.
192 """
193
194 if keyword_string == "" or keyword_string == ".":
195 keywords = ""
196 elif keyword_string[-1] == '.':
197 keywords = keyword_string[:-1]
198 else:
199 keywords = keyword_string
200 keyword_list = keywords.split(';')
201 clean_keyword_list = [x.strip() for x in keyword_list]
202 return clean_keyword_list
203
205 """Split a string of accession numbers into a list.
206 """
207
208
209 accession = accession_string.replace("\n", " ").replace(";"," ")
210
211 return [x.strip() for x in accession.split() if x.strip()]
212
214 """Split a string with taxonomy info into a list.
215 """
216 if not taxonomy_string or taxonomy_string==".":
217
218 return []
219
220 if taxonomy_string[-1] == '.':
221 tax_info = taxonomy_string[:-1]
222 else:
223 tax_info = taxonomy_string
224 tax_list = tax_info.split(';')
225 new_tax_list = []
226 for tax_item in tax_list:
227 new_items = tax_item.split("\n")
228 new_tax_list.extend(new_items)
229 while '' in new_tax_list:
230 new_tax_list.remove('')
231 clean_tax_list = [x.strip() for x in new_tax_list]
232
233 return clean_tax_list
234
236 """Clean whitespace out of a location string.
237
238 The location parser isn't a fan of whitespace, so we clean it out
239 before feeding it into the parser.
240 """
241
242
243
244 return ''.join(location_string.split())
245
247 """Remove any newlines in the passed text, returning the new string.
248 """
249
250 newlines = ["\n", "\r"]
251 for ws in newlines:
252 text = text.replace(ws, "")
253
254 return text
255
257 """Replace multiple spaces in the passed text with single spaces.
258 """
259
260 text_parts = text.split(" ")
261 text_parts = filter(None, text_parts)
262 return ' '.join(text_parts)
263
265 """Remove all spaces from the passed text.
266 """
267 return text.replace(" ", "")
268
270 """Convert a start and end range to python notation.
271
272 In GenBank, starts and ends are defined in "biological" coordinates,
273 where 1 is the first base and [i, j] means to include both i and j.
274
275 In python, 0 is the first base and [i, j] means to include i, but
276 not j.
277
278 So, to convert "biological" to python coordinates, we need to
279 subtract 1 from the start, and leave the end and things should
280 be converted happily.
281 """
282 new_start = start - 1
283 new_end = end
284
285 return new_start, new_end
286
288 """Create a SeqRecord object with Features to return.
289
290 Attributes:
291 o use_fuzziness - specify whether or not to parse with fuzziness in
292 feature locations.
293 o feature_cleaner - a class that will be used to provide specialized
294 cleaning-up of feature values.
295 """
296 - def __init__(self, use_fuzziness, feature_cleaner = None):
297 from Bio.SeqRecord import SeqRecord
298 _BaseGenBankConsumer.__init__(self)
299 self.data = SeqRecord(None, id = None)
300 self.data.id = None
301 self.data.description = ""
302
303 self._use_fuzziness = use_fuzziness
304 self._feature_cleaner = feature_cleaner
305
306 self._seq_type = ''
307 self._seq_data = []
308 self._cur_reference = None
309 self._cur_feature = None
310 self._cur_qualifier_key = None
311 self._cur_qualifier_value = None
312 self._expected_size = None
313
314 - def locus(self, locus_name):
315 """Set the locus name is set as the name of the Sequence.
316 """
317 self.data.name = locus_name
318
319 - def size(self, content):
320 """Record the sequence length."""
321 self._expected_size = int(content)
322
324 """Record the sequence type so we can choose an appropriate alphabet.
325 """
326 self._seq_type = type
327
330
331 - def date(self, submit_date):
333
343
345 """Set the accession number as the id of the sequence.
346
347 If we have multiple accession numbers, the first one passed is
348 used.
349 """
350 new_acc_nums = self._split_accessions(acc_num)
351
352
353 try:
354
355 for acc in new_acc_nums:
356
357 if acc not in self.data.annotations['accessions']:
358 self.data.annotations['accessions'].append(acc)
359 except KeyError:
360 self.data.annotations['accessions'] = new_acc_nums
361
362
363 if self.data.id is None:
364 if len(new_acc_nums) > 0:
365
366
367 self.data.id = self.data.annotations['accessions'][0]
368
369 - def wgs(self, content):
371
374
375 - def nid(self, content):
377
378 - def pid(self, content):
380
393
395 """Handle the information from the PROJECT line as a list of projects.
396
397 e.g.
398 PROJECT GenomeProject:28471
399
400 or:
401 PROJECT GenomeProject:13543 GenomeProject:99999
402
403 This is stored as dbxrefs in the SeqRecord to be consistent with the
404 projected switch of this line to DBLINK in future GenBank versions.
405 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
406 "Project:28471" as part of this transition.
407 """
408 content = content.replace("GenomeProject:", "Project:")
409 self.data.dbxrefs.extend([p for p in content.split() if p])
410
412 """Store DBLINK cross references as dbxrefs in our record object.
413
414 This line type is expected to replace the PROJECT line in 2009. e.g.
415
416 During transition:
417
418 PROJECT GenomeProject:28471
419 DBLINK Project:28471
420 Trace Assembly Archive:123456
421
422 Once the project line is dropped:
423
424 DBLINK Project:28471
425 Trace Assembly Archive:123456
426
427 Note GenomeProject -> Project.
428
429 We'll have to see some real examples to be sure, but based on the
430 above example we can expect one reference per line.
431 """
432
433
434 if content.strip() not in self.data.dbxrefs:
435 self.data.dbxrefs.append(content.strip())
436
438 """Set the version to overwrite the id.
439
440 Since the verison provides the same information as the accession
441 number, plus some extra info, we set this as the id if we have
442 a version.
443 """
444
445
446
447
448
449
450
451
452
453
454 assert version.isdigit()
455 self.data.annotations['sequence_version'] = int(version)
456
459
460 - def gi(self, content):
462
465
468
470
471
472 if content == "":
473 source_info = ""
474 elif content[-1] == '.':
475 source_info = content[:-1]
476 else:
477 source_info = content
478 self.data.annotations['source'] = source_info
479
482
491
493 """Signal the beginning of a new reference object.
494 """
495
496
497 if self._cur_reference is not None:
498 self.data.annotations['references'].append(self._cur_reference)
499 else:
500 self.data.annotations['references'] = []
501
502 self._cur_reference = SeqFeature.Reference()
503
505 """Attempt to determine the sequence region the reference entails.
506
507 Possible types of information we may have to deal with:
508
509 (bases 1 to 86436)
510 (sites)
511 (bases 1 to 105654; 110423 to 111122)
512 1 (residues 1 to 182)
513 """
514
515 ref_base_info = content[1:-1]
516
517 all_locations = []
518
519 if ref_base_info.find('bases') != -1 and \
520 ref_base_info.find('to') != -1:
521
522 ref_base_info = ref_base_info[5:]
523 locations = self._split_reference_locations(ref_base_info)
524 all_locations.extend(locations)
525 elif (ref_base_info.find("residues") >= 0 and
526 ref_base_info.find("to") >= 0):
527 residues_start = ref_base_info.find("residues")
528
529 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
530 locations = self._split_reference_locations(ref_base_info)
531 all_locations.extend(locations)
532
533
534
535 elif (ref_base_info == 'sites' or
536 ref_base_info.strip() == 'bases'):
537 pass
538
539 else:
540 raise ValueError("Could not parse base info %s in record %s" %
541 (ref_base_info, self.data.id))
542
543 self._cur_reference.location = all_locations
544
546 """Get reference locations out of a string of reference information
547
548 The passed string should be of the form:
549
550 1 to 20; 20 to 100
551
552 This splits the information out and returns a list of location objects
553 based on the reference locations.
554 """
555
556 all_base_info = location_string.split(';')
557
558 new_locations = []
559 for base_info in all_base_info:
560 start, end = base_info.split('to')
561 new_start, new_end = \
562 self._convert_to_python_numbers(int(start.strip()),
563 int(end.strip()))
564 this_location = SeqFeature.FeatureLocation(new_start, new_end)
565 new_locations.append(this_location)
566 return new_locations
567
569 if self._cur_reference.authors:
570 self._cur_reference.authors += ' ' + content
571 else:
572 self._cur_reference.authors = content
573
575 if self._cur_reference.consrtm:
576 self._cur_reference.consrtm += ' ' + content
577 else:
578 self._cur_reference.consrtm = content
579
580 - def title(self, content):
581 if self._cur_reference is None:
582 import warnings
583 warnings.warn("GenBank TITLE line without REFERENCE line.")
584 elif self._cur_reference.title:
585 self._cur_reference.title += ' ' + content
586 else:
587 self._cur_reference.title = content
588
590 if self._cur_reference.journal:
591 self._cur_reference.journal += ' ' + content
592 else:
593 self._cur_reference.journal = content
594
597
600
602 """Deal with a reference comment."""
603 if self._cur_reference.comment:
604 self._cur_reference.comment += ' ' + content
605 else:
606 self._cur_reference.comment = content
607
613
615 """Get ready for the feature table when we reach the FEATURE line.
616 """
617 self.start_feature_table()
618
620 """Indicate we've got to the start of the feature table.
621 """
622
623 if self._cur_reference is not None:
624 self.data.annotations['references'].append(self._cur_reference)
625 self._cur_reference = None
626
628 """Utility function to add a feature to the SeqRecord.
629
630 This does all of the appropriate checking to make sure we haven't
631 left any info behind, and that we are only adding info if it
632 exists.
633 """
634 if self._cur_feature:
635
636
637 self._add_qualifier()
638
639 self._cur_qualifier_key = ''
640 self._cur_qualifier_value = ''
641 self.data.features.append(self._cur_feature)
642
656
658 """Parse out location information from the location string.
659
660 This uses a comprehensive but slow spark based parser to do the
661 parsing, and then translates the results of the parse into appropriate
662 Location objects.
663 """
664
665
666
667
668
669
670 location_line = self._clean_location(content)
671
672
673
674
675
676
677 if location_line.find('replace') != -1:
678 comma_pos = location_line.find(',')
679 location_line = location_line[8:comma_pos]
680
681
682 try:
683 parse_info = \
684 LocationParser.parse(LocationParser.scan(location_line))
685
686 except SystemExit:
687 raise LocationParserError(location_line)
688
689
690
691
692 self._set_location_info(parse_info, self._cur_feature)
693
695 """Set the location information based on a function.
696
697 This handles all of the location functions like 'join', 'complement'
698 and 'order'.
699
700 Arguments:
701 o function - A LocationParser.Function object specifying the
702 function we are acting on.
703 o cur_feature - The feature to add information to.
704 """
705 assert isinstance(function, LocationParser.Function), \
706 "Expected a Function object, got %s" % function
707
708 if function.name == "complement":
709
710 cur_feature.strand = -1
711
712 for inner_info in function.args:
713 self._set_location_info(inner_info, cur_feature)
714
715
716
717
718
719
720
721 elif (function.name == "join" or function.name == "order" or
722 function.name == "one-of" or function.name == "bond"):
723 self._set_ordering_info(function, cur_feature)
724 elif (function.name == "gap"):
725 assert len(function.args) == 1, \
726 "Unexpected number of arguments in gap %s" % function.args
727
728 position = self._get_position(function.args[0].local_location)
729 cur_feature.location = SeqFeature.PositionGap(position)
730 else:
731 raise ValueError("Unexpected function name: %s" % function.name)
732
734 """Parse a join or order and all of the information in it.
735
736 This deals with functions that order a bunch of locations,
737 specifically 'join' and 'order'. The inner locations are
738 added as subfeatures of the top level feature
739 """
740
741
742 cur_feature.location_operator = function.name
743 for inner_element in function.args:
744 new_sub_feature = SeqFeature.SeqFeature()
745
746 new_sub_feature.type = cur_feature.type
747
748 new_sub_feature.location_operator = function.name
749
750 new_sub_feature.ref = cur_feature.ref
751 new_sub_feature.ref_db = cur_feature.ref_db
752 new_sub_feature.strand = cur_feature.strand
753
754
755 self._set_location_info(inner_element, new_sub_feature)
756
757
758 cur_feature.sub_features.append(new_sub_feature)
759
760
761
762
763
764
765
766
767 feature_start = cur_feature.sub_features[0].location.start
768 feature_end = cur_feature.sub_features[-1].location.end
769 cur_feature.location = SeqFeature.FeatureLocation(feature_start,
770 feature_end)
771
772
773
774
775
776
777 strands = set(sf.strand for sf in cur_feature.sub_features)
778 if len(strands)==1:
779 cur_feature.strand = cur_feature.sub_features[0].strand
780 else:
781 cur_feature.strand = None
782
784 """Set the location information for a feature from the parse info.
785
786 Arguments:
787 o parse_info - The classes generated by the LocationParser.
788 o cur_feature - The feature to add the information to.
789 """
790
791 if parse_info is None:
792 return
793
794
795 elif isinstance(parse_info, LocationParser.AbsoluteLocation):
796 self._set_location(parse_info, cur_feature)
797 return
798
799 elif isinstance(parse_info, LocationParser.Function):
800 self._set_function(parse_info, cur_feature)
801
802 else:
803 raise ValueError("Could not parse location info: %s"
804 % parse_info)
805
807 """Set the location information for a feature.
808
809 Arguments:
810 o location - An AbsoluteLocation object specifying the info
811 about the location.
812 o cur_feature - The feature to add the information to.
813 """
814
815
816 if location.path is not None:
817 cur_feature.ref = location.path.accession
818 cur_feature.ref_db = location.path.database
819
820 cur_feature.location = self._get_location(location.local_location)
821
823 """Return a (possibly fuzzy) location from a Range object.
824
825 Arguments:
826 o range_info - A location range (ie. something like 67..100). This
827 may also be a single position (ie 27).
828
829 This returns a FeatureLocation object.
830 If parser.use_fuzziness is set at one, the positions for the
831 end points will possibly be fuzzy.
832 """
833 if isinstance(range_info, LocationParser.Between) \
834 and range_info.low.val+1 == range_info.high.val:
835
836
837
838 pos = self._get_position(range_info.low)
839 return SeqFeature.FeatureLocation(pos, pos)
840
841
842
843 elif not(isinstance(range_info, LocationParser.Range)):
844
845 s_pos = self._get_position(range_info)
846
847
848 s_pos.position = s_pos.position - 1
849 e_pos = self._get_position(range_info)
850 return SeqFeature.FeatureLocation(s_pos, e_pos)
851
852 else:
853
854 start_pos = self._get_position(range_info.low)
855 end_pos = self._get_position(range_info.high)
856
857 start_pos.position, end_pos.position = \
858 self._convert_to_python_numbers(start_pos.position,
859 end_pos.position)
860
861
862 if isinstance(start_pos, SeqFeature.OneOfPosition):
863 for p in start_pos.position_choices:
864 p.position -= 1
865
866 return SeqFeature.FeatureLocation(start_pos, end_pos)
867
869 """Return a (possibly fuzzy) position for a single coordinate.
870
871 Arguments:
872 o position - This is a LocationParser.* object that specifies
873 a single coordinate. We will examine the object to determine
874 the fuzziness of the position.
875
876 This is used with _get_location to parse out a location of any
877 end_point of arbitrary fuzziness.
878 """
879
880 if (isinstance(position, LocationParser.Integer)):
881 final_pos = SeqFeature.ExactPosition(position.val)
882
883 elif isinstance(position, LocationParser.LowBound):
884 final_pos = SeqFeature.AfterPosition(position.base.val)
885
886 elif isinstance(position, LocationParser.HighBound):
887 final_pos = SeqFeature.BeforePosition(position.base.val)
888
889
890 elif isinstance(position, LocationParser.Between):
891
892
893
894
895
896
897 final_pos = SeqFeature.BetweenPosition(position.low.val,
898 position.high.val-position.low.val)
899
900 elif isinstance(position, LocationParser.TwoBound):
901 final_pos = SeqFeature.WithinPosition(position.low.val,
902 position.high.val-position.low.val)
903
904 elif isinstance(position, LocationParser.Function) and \
905 position.name == "one-of":
906
907 position_choices = []
908 for arg in position.args:
909
910
911 assert isinstance(arg, LocationParser.AbsoluteLocation), \
912 "Unhandled Location type %r" % arg
913 assert arg.path is None, "Unhandled path in location"
914 position = self._get_position(arg.local_location)
915 position_choices.append(position)
916 final_pos = SeqFeature.OneOfPosition(position_choices)
917
918 else:
919 raise ValueError("Unexpected LocationParser object %r" %
920 position)
921
922
923 if self._use_fuzziness:
924 return final_pos
925
926 else:
927 return SeqFeature.ExactPosition(final_pos.location)
928
930 """Add a qualifier to the current feature without loss of info.
931
932 If there are multiple qualifier keys with the same name we
933 would lose some info in the dictionary, so we append a unique
934 number to the end of the name in case of conflicts.
935 """
936
937
938 if self._cur_qualifier_key:
939 key = self._cur_qualifier_key
940 value = "".join(self._cur_qualifier_value)
941 if self._feature_cleaner is not None:
942 value = self._feature_cleaner.clean_value(key, value)
943
944 if key in self._cur_feature.qualifiers:
945 self._cur_feature.qualifiers[key].append(value)
946
947 else:
948 self._cur_feature.qualifiers[key] = [value]
949
951 """When we get a qualifier key, use it as a dictionary key.
952
953 We receive a list of keys, since you can have valueless keys such as
954 /pseudo which would be passed in with the next key (since no other
955 tags separate them in the file)
956 """
957 for content in content_list:
958
959 self._add_qualifier()
960
961
962 qual_key = content.replace('/', '')
963 qual_key = qual_key.replace('=', '')
964 qual_key = qual_key.strip()
965
966 self._cur_qualifier_key = qual_key
967 self._cur_qualifier_value = []
968
970
971 qual_value = content.replace('"', '')
972
973 self._cur_qualifier_value.append(qual_value)
974
976 """Deal with CONTIG information."""
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992 self.data.annotations["contig"] = content
993
996
999
1002
1004 """Add up sequence information as we get it.
1005
1006 To try and make things speedier, this puts all of the strings
1007 into a list of strings, and then uses string.join later to put
1008 them together. Supposedly, this is a big time savings
1009 """
1010 new_seq = content.replace(' ', '')
1011 new_seq = new_seq.upper()
1012
1013 self._seq_data.append(new_seq)
1014
1079
1081 """Create a GenBank Record object from scanner generated information.
1082 """
1092
1093 - def wgs(self, content):
1095
1098
1099 - def locus(self, content):
1101
1102 - def size(self, content):
1104
1107
1110
1111 - def date(self, content):
1113
1116
1121
1122 - def nid(self, content):
1124
1125 - def pid(self, content):
1127
1130
1133
1134 - def gi(self, content):
1136
1139
1142
1145
1148
1151
1154
1157
1159 """Grab the reference number and signal the start of a new reference.
1160 """
1161
1162 if self._cur_reference is not None:
1163 self.data.references.append(self._cur_reference)
1164
1165 self._cur_reference = Record.Reference()
1166 self._cur_reference.number = content
1167
1169 self._cur_reference.bases = content
1170
1172 self._cur_reference.authors = content
1173
1175 self._cur_reference.consrtm = content
1176
1177 - def title(self, content):
1178 if self._cur_reference is None:
1179 import warnings
1180 warnings.warn("GenBank TITLE line without REFERENCE line.")
1181 return
1182 self._cur_reference.title = content
1183
1185 self._cur_reference.journal = content
1186
1189
1192
1194 self._cur_reference.remark = content
1195
1198
1202
1205
1207 """Get ready for the feature table when we reach the FEATURE line.
1208 """
1209 self.start_feature_table()
1210
1212 """Signal the start of the feature table.
1213 """
1214
1215 if self._cur_reference is not None:
1216 self.data.references.append(self._cur_reference)
1217
1219 """Grab the key of the feature and signal the start of a new feature.
1220 """
1221
1222 self._add_feature()
1223
1224 self._cur_feature = Record.Feature()
1225 self._cur_feature.key = content
1226
1228 """Utility function to add a feature to the Record.
1229
1230 This does all of the appropriate checking to make sure we haven't
1231 left any info behind, and that we are only adding info if it
1232 exists.
1233 """
1234 if self._cur_feature is not None:
1235
1236
1237 if self._cur_qualifier is not None:
1238 self._cur_feature.qualifiers.append(self._cur_qualifier)
1239
1240 self._cur_qualifier = None
1241 self.data.features.append(self._cur_feature)
1242
1245
1247 """Deal with qualifier names
1248
1249 We receive a list of keys, since you can have valueless keys such as
1250 /pseudo which would be passed in with the next key (since no other
1251 tags separate them in the file)
1252 """
1253 for content in content_list:
1254
1255 if content.find("/") != 0:
1256 content = "/%s" % content
1257
1258 if self._cur_qualifier is not None:
1259 self._cur_feature.qualifiers.append(self._cur_qualifier)
1260
1261 self._cur_qualifier = Record.Qualifier()
1262 self._cur_qualifier.key = content
1263
1275
1277 self.data.base_counts = content
1278
1280 self.data.origin = content
1281
1283 """Signal that we have contig information to add to the record.
1284 """
1285 self.data.contig = self._clean_location(content)
1286
1288 """Add sequence information to a list of sequence strings.
1289
1290 This removes spaces in the data and uppercases the sequence, and
1291 then adds it to a list of sequences. Later on we'll join this
1292 list together to make the final sequence. This is faster than
1293 adding on the new string every time.
1294 """
1295 new_seq = content.replace(' ', '')
1296 self._seq_data.append(new_seq.upper())
1297
1299 """Signal the end of the record and do any necessary clean-up.
1300 """
1301
1302
1303 self.data.sequence = "".join(self._seq_data)
1304
1305 self._add_feature()
1306
1307
1309 """Access GenBank using a read-only dictionary interface (DEPRECATED).
1310
1311 This object is deprecated and will be removed in a future release of
1312 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1313 """
1314 VALID_DATABASES = ['nucleotide', 'protein', 'genome']
1315 VALID_FORMATS = ['genbank', 'fasta']
1316 - def __init__(self, database, format, parser = None):
1317 """Initialize an NCBI dictionary to retrieve sequences.
1318
1319 Create a new Dictionary to access GenBank. Valid values for
1320 database are 'nucleotide' and 'protein'.
1321 Valid values for format are 'genbank' (for nucleotide genbank and
1322 protein genpept) and 'fasta'.
1323 dely and retmax are old options kept only for compatibility -- do not
1324 bother to set them.
1325 parser is an optional parser object
1326 to change the results into another form. If unspecified, then
1327 the raw contents of the file will be returned.
1328 """
1329 import warnings
1330 warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"\
1331 " removed in a future release of Biopython. Please use"\
1332 " Bio.Entrez instead which is described in the tutorial.",
1333 DeprecationWarning)
1334
1335 self.parser = parser
1336 if database not in self.__class__.VALID_DATABASES:
1337 raise ValueError("Invalid database %s, should be one of %s" %
1338 (database, self.__class__.VALID_DATABASES))
1339 if format not in self.__class__.VALID_FORMATS:
1340 raise ValueError("Invalid format %s, should be one of %s" %
1341 (format, self.__class__.VALID_FORMATS))
1342
1343 if format=="genbank": format = "gb"
1344 self.db = database
1345 self.format = format
1346
1348 raise NotImplementedError("GenBank contains lots of entries")
1350 raise NotImplementedError("This is a read-only dictionary")
1352 raise NotImplementedError("This is a read-only dictionary")
1354 raise NotImplementedError("This is a read-only dictionary")
1356 raise NotImplementedError("You don't need to do this...")
1358 raise NotImplementedError("You don't really want to do this...")
1360 raise NotImplementedError("You don't really want to do this...")
1362 raise NotImplementedError("You don't really want to do this...")
1363
1365 """S.has_key(id) -> bool"""
1366 try:
1367 self[id]
1368 except KeyError:
1369 return 0
1370 return 1
1371
1372 - def get(self, id, failobj=None):
1373 try:
1374 return self[id]
1375 except KeyError:
1376 return failobj
1377
1379 """Return the GenBank entry specified by the GenBank ID.
1380
1381 Raises a KeyError if there's an error.
1382 """
1383 handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
1384
1385 if self.parser is not None:
1386 return self.parser.parse(handle)
1387 return handle.read()
1388
1389 -def search_for(search, database='nucleotide',
1390 reldate=None, mindate=None, maxdate=None,
1391 start_id = 0, max_ids = 50000000):
1392 """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).
1393
1394 This function is deprecated and will be removed in a future release of
1395 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1396
1397 Search GenBank and return a list of the GenBank identifiers (gi's)
1398 that match the criteria. search is the search string used to
1399 search the database. Valid values for database are
1400 'nucleotide', 'protein', 'popset' and 'genome'. reldate is
1401 the number of dates prior to the current date to restrict the
1402 search. mindate and maxdate are the dates to restrict the search,
1403 e.g. 2002/12/20. start_id is the number to begin retrieval on.
1404 max_ids specifies the maximum number of id's to retrieve.
1405 """
1406 import warnings
1407 warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"\
1408 " removed in a future release of Biopython. Please use"\
1409 " Bio.Entrez instead which is described in the tutorial.",
1410 DeprecationWarning)
1411
1412
1413
1414
1415
1416
1417 import re
1418 _date_re_match = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
1419 errinfo = None
1420 if mindate is not None and _date_re_match(mindate) is None:
1421 errinfo = ("mindate", mindate)
1422 elif maxdate is not None and _date_re_match(maxdate) is None:
1423 errinfo = ("maxdate", maxdate)
1424 if errinfo:
1425 raise TypeError(
1426 "%s is not in YYYY/MM/DD format (month and "
1427 "day are optional): %r" % errinfo)
1428
1429
1430 handle = Entrez.esearch(database, search, retmode="xml",
1431 retstart=start_id, retmax=max_ids,
1432 mindate=mindate, maxdate=maxdate,
1433 reldate=reldate)
1434 return Entrez.read(handle)["IdList"]
1435
1437 """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).
1438
1439 This function is deprecated and will be removed in a future release of
1440 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1441
1442 Download many records from GenBank. ids is a list of gis or
1443 accessions.
1444 """
1445 import warnings
1446 warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"\
1447 " removed in a future release of Biopython. Please use"\
1448 " Bio.Entrez instead which is described in the tutorial.",
1449 DeprecationWarning)
1450
1451 if database in ['nucleotide']:
1452 format = 'gb'
1453 elif database in ['protein']:
1454 format = 'gp'
1455 else:
1456 raise ValueError("Unexpected database: %s" % database)
1457
1458 result_handle = Entrez.efetch(database,
1459 id=",".join(ids),
1460 retmode = "text",
1461 rettype = format)
1462 return cStringIO.StringIO(result_handle.read())
1463