1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import warnings
28 import os
29 from Bio.Seq import Seq
30 from Bio.SeqRecord import SeqRecord
31 from Bio.Alphabet import generic_alphabet, generic_protein
32
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
35
The International Nucleotide Sequence Database Collaboration (INSDC) is a
collaboration between the DDBJ, EMBL, and GenBank. These organisations all
use the same "Feature Table" layout in their plain text flat file formats.
39
40 However, the header and sequence sections of an EMBL file are very
41 different in layout to those produced by GenBank/DDBJ."""
42
43
# Placeholder values only: the EMBL and GenBank sections later in this file
# assign format-specific values to the same names.

# Text that marks the first line of a record.  find_start() compares it with
# the first HEADER_WIDTH characters of each line it reads.
RECORD_START = "XXX"
# Number of leading characters that hold the line-type code.
HEADER_WIDTH = 3
# Lines which (once right-stripped) announce the feature table; parse_header()
# stops collecting header lines when it meets one of them.
FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
# Presumably the lines that close the feature table -- not referenced by the
# code visible here; TODO confirm against parse_features().
FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
# Presumably the column at which feature qualifiers start and the prefix that
# precedes them -- not referenced by the code visible here; TODO confirm.
FEATURE_QUALIFIER_INDENT = 0
FEATURE_QUALIFIER_SPACER = ""
# Line-type codes that open the sequence section; parse_header() also stops
# when the first HEADER_WIDTH characters of a line match one of them.
SEQUENCE_HEADERS=["XXX"]
51
59
63
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, and return it.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as are blank lines and the "//" terminator of a
    previous record.

    A line cached in self.line by an earlier call is examined before
    anything new is read from self.handle.

    Returns the first line of the record (not right-stripped), which is
    also stored in self.line, or None once the handle is exhausted.
    """
    while True:
        if self.line:
            # Re-use the line left over from the previous record.
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        line = line.rstrip()
        if line == "//":
            if self.debug > 1:
                print("Skipping // marking end of last record")
        elif line == "":
            if self.debug > 1:
                print("Skipping blank line before record")
        else:
            # Ignore any header before the first ID/LOCUS line.
            if self.debug > 1:
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
92
def parse_header(self):
    """Return list of strings making up the header.

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line (it must still be in
    self.line).  Reading stops at the first feature-table marker or
    sequence-section header; that line is left in self.line and is not
    part of the returned list.

    Raises ValueError if the handle ends, or a "//" record terminator is
    met, before either of those is found.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found feature table")
            break
        # Records without a feature table go straight to the sequence.
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
123
175
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    A qualifier without "=" (e.g. /pseudo) gets the value None, and one with
    nothing after the "=" gets the empty string.  No whitespace is removed
    from qualifier values; only the location string is stripped.

    Raises ValueError if the lines run out part way through the location or
    a quoted value.
    """
    # Blank lines carry no information, so drop them up front.
    iterator = iter([entry for entry in lines if entry])
    try:
        line = next(iterator)

        feature_location = line.strip()
        while feature_location[-1:] == ",":
            # A trailing comma means the location continues on the next line.
            feature_location += next(iterator).strip()

        qualifiers = []

        for line in iterator:
            if line[0] == "/":
                # New qualifier
                i = line.find("=")
                if i == -1:
                    # Qualifier with no value, e.g. /pseudo
                    key = line[1:]
                    qualifiers.append((key, None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if not value:
                    # "/key=" with nothing after it; indexing value[0]
                    # here used to raise IndexError.
                    qualifiers.append((key, ""))
                elif value == '"':
                    # Only the opening quote is on this line; the text
                    # arrives through the continuation branch below.
                    if self.debug:
                        print("Quoted line %s:%s" % (key, value))
                    qualifiers.append((key, value))
                elif value[0] == '"':
                    # Quoted value: gather lines until the closing quote.
                    pieces = [value]
                    while pieces[-1][-1] != '"':
                        pieces.append(next(iterator))
                    # DO NOT remove the quotes...
                    qualifiers.append((key, "\n".join(pieces)))
                else:
                    # Unquoted value
                    qualifiers.append((key, value))
            else:
                # Unquoted continuation of the previous qualifier
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines in the middle of a location or quoted value.
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
278
299
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass.
    This default implementation does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
308
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass.
    This default implementation does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
317
318
332
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings), passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass.
    This default implementation does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
341
def feed(self, handle, consumer, do_features=True):
    """Push one record from the handle through the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is set to None)
    """
    self.set_handle(handle)
    record_found = self.find_start()
    if not record_found:
        # Nothing to report; make that visible on the consumer too.
        consumer.data = None
        return False

    # find_start() leaves the ID/LOCUS line in self.line.
    first_line = self.line
    self._feed_first_line(consumer, first_line)

    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    # The feature table is always read; it is only handed to the
    # consumer when the caller asked for features.
    feature_data = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_data)

    footer = self.parse_footer()
    misc_lines = footer[0]
    sequence_string = footer[1]
    self._feed_misc_lines(consumer, misc_lines)
    consumer.sequence(sequence_string)

    # Tell the consumer the record is complete.
    consumer.record_end("//")

    # The footer parser must have stopped on the record terminator.
    assert self.line == "//"
    return True
391
392 - def parse(self, handle, do_features=True):
407
408
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    # self.parse() returns None once no further record can be found.
    while True:
        record = self.parse(handle, do_features)
        if record is None:
            break
        # Sanity checks: the record must have been given real identifiers
        # rather than the "<unknown ...>" placeholder strings.
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
426
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id', 'locus_tag', 'product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    The raw location string is stored (with spaces removed) under the
    'raw_location' annotation; db_xref qualifiers go to record.dbxrefs and
    all other qualifiers become annotations (repeated qualifiers are
    joined with a space).

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # Got an EMBL or GenBank record; its header is not needed here.
        self.parse_header()
        feature_tuples = self.parse_features()
        # Skip the rest of the record (footer and sequence) up to "//".
        while True:
            line = self.handle.readline()
            if not line:
                break
            if line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go through those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue
            # The sequence is filled in from the /translation qualifier.
            record = SeqRecord(seq=None)
            annotations = record.annotations

            annotations['raw_location'] = location_string.replace(' ', '')

            for (qualifier_name, qualifier_data) in qualifiers:
                if qualifier_data is not None \
                        and qualifier_data.startswith('"') \
                        and qualifier_data.endswith('"'):
                    # Remove quotes (startswith/endswith also copes with
                    # an empty value, which indexing [0] would not).
                    qualifier_data = qualifier_data[1:-1]
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    # It's a list of cross references.
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Join wrapped lines, then squeeze the doubled
                        # spaces that joining can leave behind.
                        qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
                    try:
                        annotations[qualifier_name] += " " + qualifier_data
                    except KeyError:
                        # Not an addition to an existing entry, a new one.
                        annotations[qualifier_name] = qualifier_data

            # Fill in the ID, name and description from the chosen
            # qualifiers; leave the SeqRecord defaults when one is absent.
            try:
                record.id = annotations[tags2id[0]]
            except KeyError:
                pass
            try:
                record.name = annotations[tags2id[1]]
            except KeyError:
                pass
            try:
                record.description = annotations[tags2id[2]]
            except KeyError:
                pass

            yield record
505
"""For extracting chunks of information in EMBL files"""

# EMBL line-type codes are two letters followed by three spaces, so the
# record marker must be exactly HEADER_WIDTH characters long to match.
RECORD_START = "ID   "
HEADER_WIDTH = 5
# Feature table header: "Location/Qualifiers" starts in the qualifier column.
FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
FEATURE_END_MARKERS = ["XX"]
FEATURE_QUALIFIER_INDENT = 21
# Qualifier lines begin with "FT" padded out to the qualifier column.
FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
SEQUENCE_HEADERS = ["SQ"]
516
550
561
563
564
565
def _feed_first_line_old(self, consumer, line):
    """Parse an old-style EMBL ID line and pass its fields to the consumer.

    The tokens represent:
       0. Primary accession number
       (space sep)
       1. ??? (e.g. standard)
       (semi-colon)
       2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
       3. Taxonomic division (e.g. 'PRO')
       4. Sequence length (e.g. '4639675 BP.')

    Token 1 is not passed on to the consumer.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    # The first token is separated by white space, the rest by semi-colons.
    first_and_rest = line[self.HEADER_WIDTH:].split(None, 1)
    fields = [first_and_rest[0]]
    fields.extend(first_and_rest[1].split(";"))
    fields = [entry.strip() for entry in fields]
    consumer.locus(fields[0])
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
584
586
587
588
def _feed_first_line_new(self, consumer, line):
    """Parse a new-style EMBL ID line and pass its fields to the consumer.

    The line holds seven semi-colon separated tokens:
       0. Primary accession number
       1. Sequence version number
       2. Topology: 'circular' or 'linear'
       3. Molecule type (e.g. 'genomic DNA')
       4. Data class (e.g. 'STD')
       5. Taxonomic division (e.g. 'PRO')
       6. Sequence length (e.g. '4639675 BP.')

    The data class (token 4) is not passed on to the consumer.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
    assert len(fields) == 7

    # The accession number doubles as the locus name.
    consumer.locus(fields[0])
    consumer.accession(fields[0])

    # Only a well formed "SV <digits>" token gives a version suffix.
    version_parts = fields[1].split()
    if len(version_parts) == 2 \
            and version_parts[0] == "SV" \
            and version_parts[1].isdigit():
        consumer.version_suffix(version_parts[1])

    # Topology and molecule type are reported together.
    consumer.residue_type(" ".join(fields[2:4]))
    consumer.data_file_division(fields[5])
    self._feed_seq_length(consumer, fields[6])
625
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length token such as '1859 BP.' to consumer.size().

    The text must consist of exactly two white space separated parts, the
    second being "BP" or "BP." in any letter case.
    """
    length_parts = text.split()
    assert len(length_parts) == 2
    assert length_parts[1].upper() in ["BP", "BP."]
    consumer.size(length_parts[0])
631
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) to the consumer.

    Each non-blank line is split into a line-type code (the first
    self.HEADER_WIDTH characters, stripped) and its data (the rest,
    stripped), then handled as follows:

    XX, DR  - ignored
    RN      - reference number, any enclosing [ ] removed
    RP      - "a-b" becomes "(bases a to b)"
    RX      - only PUBMED cross references are passed on
    CC      - passed on as a one element list
    AC, SV, DE, RA, RT, RL, OS, OC - passed to the consumer method of the
              matching name in consumer_dict
    Anything else is ignored (and reported when self.debug is set).
    """
    EMBL_INDENT = self.HEADER_WIDTH
    consumer_dict = {
        'AC': 'accession',
        'SV': 'version',
        'DE': 'definition',
        'RA': 'authors',
        'RT': 'title',
        'RL': 'journal',
        'OS': 'organism',
        'OC': 'taxonomy',
        'CC': 'comment',
    }
    for line in lines:
        if not line:
            # Blank lines carry no information.
            continue
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()

        if line_type == 'XX':
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'.  startswith/endswith keep a bare
            # "RN" line (empty data) from raising IndexError.
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference positions for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            assert data.count("-") == 1
            consumer.reference_bases("(bases " + data.replace("-", " to ") + ")")
        elif line_type == 'RX':
            # Cross reference of the form "RESOURCE; IDENTIFIER."
            # partition() tolerates a line with no ";" (it then simply
            # fails to match PUBMED) instead of raising on unpacking.
            key, _, value = data.partition(";")
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            # Other resources (e.g. DOI, AGRICOLA) are not passed on.
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # Database cross references are not passed on.
            pass
        elif line_type in consumer_dict:
            # Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
720
724
"""For extracting chunks of information in GenBank files"""

# GenBank keywords are padded with spaces to a 12 character column, so the
# record marker must be exactly HEADER_WIDTH characters long to match.
RECORD_START = "LOCUS       "
HEADER_WIDTH = 12
# Feature table header: "Location/Qualifiers" starts in the qualifier column.
FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers", "FEATURES"]
FEATURE_END_MARKERS = []
FEATURE_QUALIFIER_INDENT = 21
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
735
781
783
784
785
def _feed_first_line(self, consumer, line):
    """Parse a GenBank LOCUS line and pass its fields to the consumer.

    Four layouts are recognised, tried in this order:

    1. The older fixed-column layout, with the size units (' bp ', ' aa '
       or ' rc ') in columns 30-33.
    2. The newer fixed-column layout, with the size units in columns 41-44.
    3. A minimal line holding at most a name after the LOCUS keyword
       (a warning is issued when even the name is missing).
    4. A malformed but white space separated line whose fourth token is
       "aa" or "bp"; name and size are passed on and a warning is issued.

    For the fixed-column layouts the consumer receives locus, size,
    residue_type, data_file_division and (when present) date.

    Raises ValueError for any other layout, and AssertionError when a
    fixed-column line has unexpected content in one of its columns.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    size_units = [' bp ', ' aa ', ' rc ']

    def split_name_and_length(text):
        # Squeeze runs of spaces so that split(' ') yields exactly the
        # name and the length.
        while '  ' in text:
            text = text.replace('  ', ' ')
        parts = text.split(' ')
        assert len(parts) <= 2, \
            'Cannot parse the name and length in the LOCUS line:\n' + line
        assert len(parts) != 1, \
            'Name and length collide in the LOCUS line:\n' + line
        return parts

    # The keyword column is "LOCUS" padded to twelve characters.
    assert line[0:GENBANK_INDENT] == 'LOCUS' + ' ' * 7, \
        'LOCUS line does not start correctly:\n' + line

    if line[29:33] in size_units:
        # Old layout:
        #   12:29 name and length, 29:33 units, 33:41 strand + molecule,
        #   42:51 topology, 52:55 division, 62:73 date
        assert line[41:42] == ' ', \
            'LOCUS line does not contain space at position 42:\n' + line
        assert line[42:51].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[51:52] == ' ', \
            'LOCUS line does not contain space at position 52:\n' + line
        assert line[55:62] == ' ' * 7, \
            'LOCUS line does not contain spaces from position 56 to 62:\n' + line
        if line[62:73].strip():
            assert line[64:65] == '-', \
                'LOCUS line does not contain - at position 65 in date:\n' + line
            assert line[68:69] == '-', \
                'LOCUS line does not contain - at position 69 in date:\n' + line

        name_and_length = split_name_and_length(line[GENBANK_INDENT:29])
        consumer.locus(name_and_length[0])
        consumer.size(name_and_length[1])

        if line[33:51].strip() == "" and line[29:33] == ' aa ':
            # Amino acid records leave the molecule type blank.
            consumer.residue_type("PROTEIN")
        else:
            consumer.residue_type(line[33:51].strip())

        consumer.data_file_division(line[52:55])
        if line[62:73].strip():
            consumer.date(line[62:73])
    elif line[40:44] in size_units:
        # New layout:
        #   12:40 name and length, 40:44 units, 44:47 strand,
        #   47:54 molecule, 55:63 topology, 64:67 division, 68:79 date
        assert line[44:47] in [' ' * 3, 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
            or line[47:54].strip().find('DNA') != -1 \
            or line[47:54].strip().find('RNA') != -1, \
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        if line[68:79].strip():
            assert line[70:71] == '-', \
                'LOCUS line does not contain - at position 71 in date:\n' + line
            assert line[74:75] == '-', \
                'LOCUS line does not contain - at position 75 in date:\n' + line

        name_and_length = split_name_and_length(line[GENBANK_INDENT:40])
        consumer.locus(name_and_length[0])
        consumer.size(name_and_length[1])

        if line[44:54].strip() == "" and line[40:44] == ' aa ':
            # Amino acid records leave strand and molecule type blank;
            # keep any topology that follows.
            consumer.residue_type(("PROTEIN " + line[54:63]).strip())
        else:
            consumer.residue_type(line[44:63].strip())

        consumer.data_file_division(line[64:67])
        if line[68:79].strip():
            consumer.date(line[68:79])
    elif line[GENBANK_INDENT:].strip().count(" ") == 0:
        # Truncated LOCUS line holding (at most) a name.
        if line[GENBANK_INDENT:].strip() != "":
            consumer.locus(line[GENBANK_INDENT:].strip())
        else:
            # Must just have just "LOCUS       ", is this even legitimate?
            # We should be able to continue parsing... we need real world testcases!
            warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
    elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
        # Cope with white space separated LOCUS lines that ignore the
        # column layout.
        consumer.locus(line.split()[1])
        consumer.size(line.split()[2])
        warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
    else:
        raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
953
954
956
957
958
959
def _feed_header_lines(self, consumer, lines):
    """Pass the GenBank header lines (list of strings) to the consumer.

    Each entry starts with a keyword in the first self.HEADER_WIDTH
    characters; lines which have only spaces in that column continue the
    entry before them.  Entries are handled as follows:

    VERSION   - runs of spaces are squeezed, then split on " GI:" into
                consumer.version() and consumer.gi()
    REFERENCE - continuation lines are joined, then split at the first
                space into reference_num() and reference_bases()
    ORGANISM  - continuation lines go to the organism name until one
                contains ";", after which they are taxonomy
    COMMENT   - passed on as a list, one string per line
    keywords listed in consumer_dict - continuation lines are joined with
                a space and passed to the consumer method named there
    Anything else is ignored (and reported when self.debug is set).

    Raises ValueError("Problem in header") if the lines run out in the
    middle of an entry.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    consumer_dict = {
        'DEFINITION': 'definition',
        'ACCESSION': 'accession',
        'NID': 'nid',
        'PID': 'pid',
        'DBSOURCE': 'db_source',
        'KEYWORDS': 'keywords',
        'SEGMENT': 'segment',
        'SOURCE': 'source',
        'AUTHORS': 'authors',
        'CONSRTM': 'consrtm',
        'PROJECT': 'project',
        'DBLINK': 'dblink',
        'TITLE': 'title',
        'JOURNAL': 'journal',
        'MEDLINE': 'medline_id',
        'PUBMED': 'pubmed_id',
        'REMARK': 'remark',
    }
    # Work on a copy without blank lines.  The empty string appended at the
    # end is a sentinel: it is neither a continuation line nor a keyword,
    # so every branch below can read one line ahead safely.
    lines = [entry for entry in lines if entry]
    lines.append("")
    line_iter = iter(lines)
    try:
        line = next(line_iter)
        while True:
            if not line:
                # Reached the sentinel.
                break
            line_type = line[:GENBANK_INDENT].strip()
            data = line[GENBANK_INDENT:].strip()

            if line_type == 'VERSION':
                # Need to call consumer.version(), and maybe also
                # consumer.gi().  e.g. "AF181452.1  GI:6017929"
                while '  ' in data:
                    data = data.replace('  ', ' ')
                if ' GI:' not in data:
                    consumer.version(data)
                else:
                    if self.debug:
                        print("Version [" + data.split(' GI:')[0]
                              + "], gi [" + data.split(' GI:')[1] + "]")
                    consumer.version(data.split(' GI:')[0])
                    consumer.gi(data.split(' GI:')[1])
                # Read in the next line!
                line = next(line_iter)
            elif line_type == 'REFERENCE':
                if self.debug > 1:
                    print("Found reference [" + data + "]")
                # e.g. "1  (bases 1 to 86436)", possibly wrapped over
                # several lines.
                data = data.strip()

                # Read in the next line, and see if its more of the
                # reference:
                while True:
                    line = next(line_iter)
                    if line[:GENBANK_INDENT] == GENBANK_SPACER:
                        # Add this continuation to the data string
                        data += " " + line[GENBANK_INDENT:]
                        if self.debug > 1:
                            print("Extended reference text [" + data + "]")
                    else:
                        # End of the reference, leave this text in the
                        # variable "line"
                        break

                # We now have all the reference line(s) stored in a string.
                while '  ' in data:
                    data = data.replace('  ', ' ')
                if ' ' not in data:
                    if self.debug > 2:
                        print('Reference number "' + data + '"')
                    consumer.reference_num(data)
                else:
                    if self.debug > 2:
                        print('Reference number "' + data[:data.find(' ')]
                              + '", "' + data[data.find(' ') + 1:] + '"')
                    consumer.reference_num(data[:data.find(' ')])
                    consumer.reference_bases(data[data.find(' ') + 1:])
            elif line_type == 'ORGANISM':
                # Typically the first line is the organism, and subsequent
                # lines are the taxonomy lineage.  Long organism names can
                # wrap, so continuation lines stay part of the name until
                # one contains a semi-colon.
                organism_data = data
                lineage_data = ""
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        if lineage_data or ";" in line:
                            lineage_data += " " + line[GENBANK_INDENT:]
                        else:
                            organism_data += " " + line[GENBANK_INDENT:].strip()
                    else:
                        # End of organism and taxonomy
                        break
                consumer.organism(organism_data)
                if lineage_data.strip() == "" and self.debug > 1:
                    print("Taxonomy line(s) missing or blank")
                consumer.taxonomy(lineage_data.strip())
                del organism_data, lineage_data
            elif line_type == 'COMMENT':
                if self.debug > 1:
                    print("Found comment")
                # This can be multiline, and should call consumer.comment()
                # once with a list where each entry is a line.
                comment_list = [data]
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        data = line[GENBANK_INDENT:]
                        comment_list.append(data)
                        if self.debug > 2:
                            print("Comment continuation [" + data + "]")
                    else:
                        # End of the comment
                        break
                consumer.comment(comment_list)
                del comment_list
            elif line_type in consumer_dict:
                # Its a semi-automatic entry!
                # Now, this may be a multi line entry...
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        data += ' ' + line[GENBANK_INDENT:]
                    else:
                        # We now have all the data for this entry:
                        getattr(consumer, consumer_dict[line_type])(data)
                        # End of continuation - return to top of loop!
                        break
            else:
                if self.debug:
                    # The %s was missing here, so this print used to
                    # raise TypeError instead of reporting the line.
                    print("Ignoring GenBank header line:\n%s" % line)
                # Read in next line
                line = next(line_iter)
    except StopIteration:
        raise ValueError("Problem in header")
1108
1149
1150 if __name__ == "__main__":
1151 from StringIO import StringIO
1152
1153 gbk_example = \
1154 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1155 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1156 (AXL2) and Rev7p (REV7) genes, complete cds.
1157 ACCESSION U49845
1158 VERSION U49845.1 GI:1293613
1159 KEYWORDS .
1160 SOURCE Saccharomyces cerevisiae (baker's yeast)
1161 ORGANISM Saccharomyces cerevisiae
1162 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1163 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1164 REFERENCE 1 (bases 1 to 5028)
1165 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1166 TITLE Cloning and sequence of REV7, a gene whose function is required for
1167 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1168 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1169 PUBMED 7871890
1170 REFERENCE 2 (bases 1 to 5028)
1171 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1172 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1173 plasma membrane glycoprotein
1174 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1175 PUBMED 8846915
1176 REFERENCE 3 (bases 1 to 5028)
1177 AUTHORS Roemer,T.
1178 TITLE Direct Submission
1179 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1180 Haven, CT, USA
1181 FEATURES Location/Qualifiers
1182 source 1..5028
1183 /organism="Saccharomyces cerevisiae"
1184 /db_xref="taxon:4932"
1185 /chromosome="IX"
1186 /map="9"
1187 CDS <1..206
1188 /codon_start=3
1189 /product="TCP1-beta"
1190 /protein_id="AAA98665.1"
1191 /db_xref="GI:1293614"
1192 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1193 AEVLLRVDNIIRARPRTANRQHM"
1194 gene 687..3158
1195 /gene="AXL2"
1196 CDS 687..3158
1197 /gene="AXL2"
1198 /note="plasma membrane glycoprotein"
1199 /codon_start=1
1200 /function="required for axial budding pattern of S.
1201 cerevisiae"
1202 /product="Axl2p"
1203 /protein_id="AAA98666.1"
1204 /db_xref="GI:1293615"
1205 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1206 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1207 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1208 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1209 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1210 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1211 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1212 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1213 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1214 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1215 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1216 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1217 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1218 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1219 VDFSNKSNVNVGQVKDIHGRIPEML"
1220 gene complement(3300..4037)
1221 /gene="REV7"
1222 CDS complement(3300..4037)
1223 /gene="REV7"
1224 /codon_start=1
1225 /product="Rev7p"
1226 /protein_id="AAA98667.1"
1227 /db_xref="GI:1293616"
1228 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1229 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1230 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1231 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1232 LISGDDKILNGVYSQYEEGESIFGSLF"
1233 ORIGIN
1234 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1235 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1236 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1237 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1238 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1239 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1240 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1241 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1242 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1243 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1244 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1245 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1246 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1247 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1248 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1249 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1250 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1251 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1252 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1253 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1254 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1255 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1256 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1257 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1258 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1259 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1260 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1261 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1262 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1263 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1264 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1265 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1266 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1267 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1268 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1269 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1270 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1271 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1272 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1273 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1274 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1275 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1276 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1277 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1278 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1279 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1280 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1281 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1282 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1283 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1284 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1285 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1286 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1287 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1288 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1289 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1290 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1291 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1292 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1293 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1294 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1295 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1296 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1297 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1298 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1299 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1300 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1301 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1302 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1303 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1304 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1305 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1306 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1307 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1308 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1309 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1310 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1311 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1312 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1313 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1314 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1315 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1316 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1317 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1318 //"""
1319
1320
1321
# Sample GenBank-format protein record (LOCUS AAD51968, 143 aa) kept inline
# as test data.  The demo code at the end of this file feeds it to
# GenBankScanner.parse_cds_features() and GenBankScanner.parse_records().
# The text is data, not code: do not reflow or re-indent it.
1322 gbk_example2 = \
1323 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1324 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1325 ACCESSION AAD51968
1326 VERSION AAD51968.1 GI:5805369
1327 DBSOURCE locus AF171097 accession AF171097.1
1328 KEYWORDS .
1329 SOURCE Yersinia enterocolitica
1330 ORGANISM Yersinia enterocolitica
1331 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1332 Enterobacteriaceae; Yersinia.
1333 REFERENCE 1 (residues 1 to 143)
1334 AUTHORS Revell,P.A. and Miller,V.L.
1335 TITLE A chromosomally encoded regulator is required for expression of the
1336 Yersinia enterocolitica inv gene and for virulence
1337 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1338 MEDLINE 20138369
1339 PUBMED 10672189
1340 REFERENCE 2 (residues 1 to 143)
1341 AUTHORS Revell,P.A. and Miller,V.L.
1342 TITLE Direct Submission
1343 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1344 University School of Medicine, Campus Box 8230, 660 South Euclid,
1345 St. Louis, MO 63110, USA
1346 COMMENT Method: conceptual translation.
1347 FEATURES Location/Qualifiers
1348 source 1..143
1349 /organism="Yersinia enterocolitica"
1350 /mol_type="unassigned DNA"
1351 /strain="JB580v"
1352 /serotype="O:8"
1353 /db_xref="taxon:630"
1354 Protein 1..143
1355 /product="transcriptional regulator RovA"
1356 /name="regulates inv expression"
1357 CDS 1..143
1358 /gene="rovA"
1359 /coded_by="AF171097.1:380..811"
1360 /note="regulator of virulence"
1361 /transl_table=11
1362 ORIGIN
1363 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1364 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1365 121 deiellsgli dklerniiql qsk
1366 //
1367 """
1368
# Sample EMBL-format nucleotide record (ID X56734, 1859 BP mRNA) kept inline
# as test data.  The demo code at the end of this file feeds it to
# EmblScanner.parse_cds_features() and EmblScanner.parse_records().
# The text is data, not code: do not reflow or re-indent it.
1369 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1370 XX
1371 AC X56734; S46826;
1372 XX
1373 DT 12-SEP-1991 (Rel. 29, Created)
1374 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1375 XX
1376 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1377 XX
1378 KW beta-glucosidase.
1379 XX
1380 OS Trifolium repens (white clover)
1381 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1382 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1383 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1384 XX
1385 RN [5]
1386 RP 1-1859
1387 RX PUBMED; 1907511.
1388 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1389 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1390 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1391 RL Plant Mol. Biol. 17(2):209-219(1991).
1392 XX
1393 RN [6]
1394 RP 1-1859
1395 RA Hughes M.A.;
1396 RT ;
1397 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1398 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1399 RL Upon Tyne, NE2 4HH, UK
1400 XX
1401 FH Key Location/Qualifiers
1402 FH
1403 FT source 1..1859
1404 FT /organism="Trifolium repens"
1405 FT /mol_type="mRNA"
1406 FT /clone_lib="lambda gt10"
1407 FT /clone="TRE361"
1408 FT /tissue_type="leaves"
1409 FT /db_xref="taxon:3899"
1410 FT CDS 14..1495
1411 FT /product="beta-glucosidase"
1412 FT /EC_number="3.2.1.21"
1413 FT /note="non-cyanogenic"
1414 FT /db_xref="GOA:P26204"
1415 FT /db_xref="InterPro:IPR001360"
1416 FT /db_xref="InterPro:IPR013781"
1417 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1418 FT /protein_id="CAA40058.1"
1419 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1420 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1421 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1422 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1423 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1424 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1425 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1426 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1427 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1428 FT mRNA 1..1859
1429 FT /experiment="experimental evidence, no additional details
1430 FT recorded"
1431 XX
1432 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1433 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1434 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1435 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1436 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1437 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1438 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1439 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1440 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1441 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1442 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1443 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1444 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1445 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1446 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1447 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1448 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1449 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1450 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1451 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1452 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1453 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1454 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1455 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1456 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1457 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1458 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1459 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1460 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1461 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1462 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1463 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1464 //
1465 """
1466
1467 print "GenBank CDS Iteration"
1468 print "====================="
1469
1470 g = GenBankScanner()
1471 for record in g.parse_cds_features(StringIO(gbk_example)):
1472 print record
1473
1474 g = GenBankScanner()
1475 for record in g.parse_cds_features(StringIO(gbk_example2),
1476 tags2id=('gene','locus_tag','product')):
1477 print record
1478
1479 g = GenBankScanner()
1480 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
1481 tags2id=('gene','locus_tag','product')):
1482 print record
1483
1484 print
1485 print "GenBank Iteration"
1486 print "================="
1487 g = GenBankScanner()
1488 for record in g.parse_records(StringIO(gbk_example),do_features=False):
1489 print record.id, record.name, record.description
1490 print record.seq
1491
1492 g = GenBankScanner()
1493 for record in g.parse_records(StringIO(gbk_example),do_features=True):
1494 print record.id, record.name, record.description
1495 print record.seq
1496
1497 g = GenBankScanner()
1498 for record in g.parse_records(StringIO(gbk_example2),do_features=False):
1499 print record.id, record.name, record.description
1500 print record.seq
1501
1502 g = GenBankScanner()
1503 for record in g.parse_records(StringIO(gbk_example2),do_features=True):
1504 print record.id, record.name, record.description
1505 print record.seq
1506
1507 print
1508 print "EMBL CDS Iteration"
1509 print "=================="
1510
1511 e = EmblScanner()
1512 for record in e.parse_cds_features(StringIO(embl_example)):
1513 print record
1514
1515 print
1516 print "EMBL Iteration"
1517 print "=============="
1518 e = EmblScanner()
1519 for record in e.parse_records(StringIO(embl_example),do_features=True):
1520 print record.id, record.name, record.description
1521 print record.seq
1522