1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
17 """Dict which only allows sequences of given length as values (PRIVATE).
18
19 This simple subclass of the Python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
25 to a list).
26 """
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
33 or len(value) != self._length:
34 raise TypeError("We only allow python sequences (lists, tuples or "
35 "strings) of length %i." % self._length)
36 dict.__setitem__(self, key, value)
41
43 """A SeqRecord object holds a sequence and information about it.
44
45 Main attributes:
46 - id - Identifier such as a locus tag (string)
47 - seq - The sequence itself (Seq object or similar)
48
49 Additional attributes:
50 - name - Sequence name, e.g. gene name (string)
51 - description - Additional text (string)
52 - dbxrefs - List of database cross references (list of strings)
53 - features - Any (sub)features defined (list of SeqFeature objects)
54 - annotations - Further information about the whole sequence (dictionary)
55 Most entries are strings, or lists of strings.
56 - letter_annotations - Per letter/symbol annotation (restricted
57 dictionary). This holds Python sequences (lists, strings
58 or tuples) whose length matches that of the sequence.
59 A typical use would be to hold a list of integers
60 representing sequencing quality scores, or a string
61 representing the secondary structure.
62
63 You will typically use Bio.SeqIO to read in sequences from files as
64 SeqRecord objects. However, you may want to create your own SeqRecord
65 objects directly (see the __init__ method for further details):
66
67 >>> from Bio.Seq import Seq
68 >>> from Bio.SeqRecord import SeqRecord
69 >>> from Bio.Alphabet import IUPAC
70 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
71 ... IUPAC.protein),
72 ... id="YP_025292.1", name="HokC",
73 ... description="toxic membrane protein")
74 >>> print record
75 ID: YP_025292.1
76 Name: HokC
77 Description: toxic membrane protein
78 Number of features: 0
79 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
80
81 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
82 for this. For the special case where you want the SeqRecord turned into
83 a string in a particular file format there is a format method which uses
84 Bio.SeqIO internally:
85
86 >>> print record.format("fasta")
87 >YP_025292.1 toxic membrane protein
88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
89 <BLANKLINE>
90
91 You can also do things like slicing a SeqRecord, checking its length, etc
92
93 >>> len(record)
94 44
95 >>> edited = record[:10] + record[11:]
96 >>> print edited.seq
97 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
98 >>> print record.seq
99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
100
101 """
102 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
103 description = "<unknown description>", dbxrefs = None,
104 features = None, annotations = None,
105 letter_annotations = None):
106 """Create a SeqRecord.
107
108 Arguments:
109 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq)
110 - id - Sequence identifier, recommended (string)
111 - name - Sequence name, optional (string)
112 - description - Sequence description, optional (string)
113 - dbxrefs - Database cross references, optional (list of strings)
114 - features - Any (sub)features, optional (list of SeqFeature objects)
115 - annotations - Dictionary of annotations for the whole sequence
116 - letter_annotations - Dictionary of per-letter-annotations, values
117 should be strings, list or tuples of the same
118 length as the full sequence.
119
120 You will typically use Bio.SeqIO to read in sequences from files as
121 SeqRecord objects. However, you may want to create your own SeqRecord
122 objects directly.
123
124 Note that while an id is optional, we strongly recommend you supply a
125 unique id string for each record. This is especially important
126 if you wish to write your sequences to a file.
127
128 If you don't have the actual sequence, but you do know its length,
129 then using the UnknownSeq object from Bio.Seq is appropriate.
130
131 You can create a 'blank' SeqRecord object, and then populate the
132 attributes later.
133 """
134 if id is not None and not isinstance(id, basestring):
135
136 raise TypeError("id argument should be a string")
137 if not isinstance(name, basestring):
138 raise TypeError("name argument should be a string")
139 if not isinstance(description, basestring):
140 raise TypeError("description argument should be a string")
141 self._seq = seq
142 self.id = id
143 self.name = name
144 self.description = description
145
146
147 if dbxrefs is None:
148 dbxrefs = []
149 elif not isinstance(dbxrefs, list):
150 raise TypeError("dbxrefs argument should be a list (of strings)")
151 self.dbxrefs = dbxrefs
152
153
154 if annotations is None:
155 annotations = {}
156 elif not isinstance(annotations, dict):
157 raise TypeError("annotations argument should be a dict")
158 self.annotations = annotations
159
160 if letter_annotations is None:
161
162 if seq is None:
163
164 self._per_letter_annotations = _RestrictedDict(length=0)
165 else:
166 try:
167 self._per_letter_annotations = \
168 _RestrictedDict(length=len(seq))
169 except:
170 raise TypeError("seq argument should be a Seq object or similar")
171 else:
172
173
174
175 self.letter_annotations = letter_annotations
176
177
178 if features is None:
179 features = []
180 elif not isinstance(features, list):
181 raise TypeError("features argument should be a list (of SeqFeature objects)")
182 self.features = features
183
184
186 if not isinstance(value, dict):
187 raise TypeError("The per-letter-annotations should be a "
188 "(restricted) dictionary.")
189
190 try:
191 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
192 except AttributeError:
193
194 self._per_letter_annotations = _RestrictedDict(length=0)
195 self._per_letter_annotations.update(value)
196 letter_annotations = property( \
197 fget=lambda self : self._per_letter_annotations,
198 fset=_set_per_letter_annotations,
199 doc="""Dictionary of per-letter-annotation for the sequence.
200
201 For example, this can hold quality scores used in FASTQ or QUAL files.
202 Consider this example using Bio.SeqIO to read in an example Solexa
203 variant FASTQ file as a SeqRecord:
204
205 >>> from Bio import SeqIO
206 >>> handle = open("Quality/solexa_faked.fastq", "rU")
207 >>> record = SeqIO.read(handle, "fastq-solexa")
208 >>> handle.close()
209 >>> print record.id, record.seq
210 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
211 >>> print record.letter_annotations.keys()
212 ['solexa_quality']
213 >>> print record.letter_annotations["solexa_quality"]
214 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
215
216 The letter_annotations get sliced automatically if you slice the
217 parent SeqRecord, for example taking the last ten bases:
218
219 >>> sub_record = record[-10:]
220 >>> print sub_record.id, sub_record.seq
221 slxa_0001_1_0001_01 ACGTNNNNNN
222 >>> print sub_record.letter_annotations["solexa_quality"]
223 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
224
225 Any python sequence (i.e. list, tuple or string) can be recorded in
226 the SeqRecord's letter_annotations dictionary as long as the length
227 matches that of the SeqRecord's sequence. e.g.
228
229 >>> len(sub_record.letter_annotations)
230 1
231 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
232 >>> len(sub_record.letter_annotations)
233 2
234
235 You can delete entries from the letter_annotations dictionary as usual:
236
237 >>> del sub_record.letter_annotations["solexa_quality"]
238 >>> sub_record.letter_annotations
239 {'dummy': 'abcdefghij'}
240
241 You can completely clear the dictionary easily as follows:
242
243 >>> sub_record.letter_annotations = {}
244 >>> sub_record.letter_annotations
245 {}
246 """)
247
249
250 if self._per_letter_annotations:
251
252 raise ValueError("You must empty the letter annotations first!")
253 self._seq = value
254 try:
255 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
256 except AttributeError:
257
258 self._per_letter_annotations = _RestrictedDict(length=0)
259
260 seq = property(fget=lambda self : self._seq,
261 fset=_set_seq,
262 doc="The sequence itself, as a Seq or MutableSeq object.")
263
265 """Returns a sub-sequence or an individual letter.
266
267 Slicing, e.g. my_record[5:10], returns a new SeqRecord for
268 that sub-sequence with approriate annotation preserved. The
269 name, id and description are kept.
270
271 Any per-letter-annotations are sliced to match the requested
272 sub-sequence. Unless a stride is used, all those features
273 which fall fully within the subsequence are included (with
274 their locations adjusted accordingly).
275
276 However, the annotations dictionary and the dbxrefs list are
277 not used for the new SeqRecord, as in general they may not
278 apply to the subsequence. If you want to preserve them, you
279 must explictly copy them to the new SeqRecord yourself.
280
281 Using an integer index, e.g. my_record[5] is shorthand for
282 extracting that letter from the sequence, my_record.seq[5].
283
284 For example, consider this short protein and its secondary
285 structure as encoded by the PDB (e.g. H for alpha helices),
286 plus a simple feature for its histidine self phosphorylation
287 site:
288
289 >>> from Bio.Seq import Seq
290 >>> from Bio.SeqRecord import SeqRecord
291 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
292 >>> from Bio.Alphabet import IUPAC
293 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
294 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
295 ... IUPAC.protein),
296 ... id="1JOY", name="EnvZ",
297 ... description="Homodimeric domain of EnvZ from E. coli")
298 >>> rec.letter_annotations["secondary_structure"] = \
299 " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
300 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
301 ... type = "Site"))
302
303 Now let's have a quick look at the full record,
304
305 >>> print rec
306 ID: 1JOY
307 Name: EnvZ
308 Description: Homodimeric domain of EnvZ from E. coli
309 Number of features: 1
310 Per letter annotation for: secondary_structure
311 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
312 >>> print rec.letter_annotations["secondary_structure"]
313 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
314 >>> print rec.features[0].location
315 [20:21]
316
317 Now let's take a sub sequence, here chosen as the first (fractured)
318 alpha helix which includes the histidine phosphorylation site:
319
320 >>> sub = rec[11:41]
321 >>> print sub
322 ID: 1JOY
323 Name: EnvZ
324 Description: Homodimeric domain of EnvZ from E. coli
325 Number of features: 1
326 Per letter annotation for: secondary_structure
327 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
328 >>> print sub.letter_annotations["secondary_structure"]
329 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
330 >>> print sub.features[0].location
331 [9:10]
332
333 You can also of course omit the start or end values, for
334 example to get the first ten letters only:
335
336 >>> print rec[:10]
337 ID: 1JOY
338 Name: EnvZ
339 Description: Homodimeric domain of EnvZ from E. coli
340 Number of features: 0
341 Per letter annotation for: secondary_structure
342 Seq('MAAGVKQLAD', IUPACProtein())
343
344 Or for the last ten letters:
345
346 >>> print rec[-10:]
347 ID: 1JOY
348 Name: EnvZ
349 Description: Homodimeric domain of EnvZ from E. coli
350 Number of features: 0
351 Per letter annotation for: secondary_structure
352 Seq('IIEQFIDYLR', IUPACProtein())
353
354 If you omit both, then you get a copy of the original record (although
355 lacking the annotations and dbxrefs):
356
357 >>> print rec[:]
358 ID: 1JOY
359 Name: EnvZ
360 Description: Homodimeric domain of EnvZ from E. coli
361 Number of features: 1
362 Per letter annotation for: secondary_structure
363 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
364
365 Finally, indexing with a simple integer is shorthand for pulling out
366 that letter from the sequence directly:
367
368 >>> rec[5]
369 'K'
370 >>> rec.seq[5]
371 'K'
372 """
373 if isinstance(index, int):
374
375
376
377 return self.seq[index]
378 elif isinstance(index, slice):
379 if self.seq is None:
380 raise ValueError("If the sequence is None, we cannot slice it.")
381 parent_length = len(self)
382 answer = self.__class__(self.seq[index],
383 id=self.id,
384 name=self.name,
385 description=self.description)
386
387
388
389
390
391
392
393
394
395
396
397 if index.step is None or index.step == 1:
398
399 if index.start is None:
400 start = 0
401 else:
402 start = index.start
403 if index.stop is None:
404 stop = -1
405 else:
406 stop = index.stop
407 if (start < 0 or stop < 0) and parent_length == 0:
408 raise ValueError, \
409 "Cannot support negative indices without the sequence length"
410 if start < 0:
411 start = parent_length + start
412 if stop < 0:
413 stop = parent_length + stop + 1
414
415 for f in self.features:
416 if f.ref or f.ref_db:
417
418 import warnings
419 warnings.warn("When slicing SeqRecord objects, any "
420 "SeqFeature referencing other sequences (e.g. "
421 "from segmented GenBank records) is ignored.")
422 continue
423 if start <= f.location.nofuzzy_start \
424 and f.location.nofuzzy_end <= stop:
425 answer.features.append(f._shift(-start))
426
427
428
429 for key, value in self.letter_annotations.iteritems():
430 answer._per_letter_annotations[key] = value[index]
431
432 return answer
433 raise ValueError, "Invalid index"
434
436 """Iterate over the letters in the sequence.
437
438 For example, using Bio.SeqIO to read in a protein FASTA file:
439
440 >>> from Bio import SeqIO
441 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta")
442 >>> for amino in record:
443 ... print amino
444 ... if amino == "L" : break
445 X
446 A
447 G
448 L
449 >>> print record.seq[3]
450 L
451
452 This is just a shortcut for iterating over the sequence directly:
453
454 >>> for amino in record.seq:
455 ... print amino
456 ... if amino == "L" : break
457 X
458 A
459 G
460 L
461 >>> print record.seq[3]
462 L
463
464 Note that this does not facilitate iteration together with any
465 per-letter-annotation. However, you can achieve that using the
466 python zip function on the record (or its sequence) and the relevant
467 per-letter-annotation:
468
469 >>> from Bio import SeqIO
470 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"),
471 ... "fastq-solexa")
472 >>> print rec.id, rec.seq
473 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
474 >>> print rec.letter_annotations.keys()
475 ['solexa_quality']
476 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]):
477 ... if qual > 35:
478 ... print nuc, qual
479 A 40
480 C 39
481 G 38
482 T 37
483 A 36
484
485 You may agree that using zip(rec.seq, ...) is more explicit than using
486 zip(rec, ...) as shown above.
487 """
488 return iter(self.seq)
489
491 """Implements the 'in' keyword, searches the sequence.
492
493 e.g.
494
495 >>> from Bio import SeqIO
496 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta")
497 >>> "GAATTC" in record
498 False
499 >>> "AAA" in record
500 True
501
502 This essentially acts as a proxy for using "in" on the sequence:
503
504 >>> "GAATTC" in record.seq
505 False
506 >>> "AAA" in record.seq
507 True
508
509 Note that you can also use Seq objects as the query,
510
511 >>> from Bio.Seq import Seq
512 >>> from Bio.Alphabet import generic_dna
513 >>> Seq("AAA") in record
514 True
515 >>> Seq("AAA", generic_dna) in record
516 True
517
518 See also the Seq object's __contains__ method.
519 """
520 return char in self.seq
521
522
524 """A human readable summary of the record and its annotation (string).
525
526 The python built in function str works by calling the object's ___str__
527 method. e.g.
528
529 >>> from Bio.Seq import Seq
530 >>> from Bio.SeqRecord import SeqRecord
531 >>> from Bio.Alphabet import IUPAC
532 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
533 ... IUPAC.protein),
534 ... id="YP_025292.1", name="HokC",
535 ... description="toxic membrane protein, small")
536 >>> print str(record)
537 ID: YP_025292.1
538 Name: HokC
539 Description: toxic membrane protein, small
540 Number of features: 0
541 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
542
543 In this example you don't actually need to call str explicity, as the
544 print command does this automatically:
545
546 >>> print record
547 ID: YP_025292.1
548 Name: HokC
549 Description: toxic membrane protein, small
550 Number of features: 0
551 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
552
553 Note that long sequences are shown truncated.
554 """
555 lines = []
556 if self.id : lines.append("ID: %s" % self.id)
557 if self.name : lines.append("Name: %s" % self.name)
558 if self.description : lines.append("Description: %s" % self.description)
559 if self.dbxrefs : lines.append("Database cross-references: " \
560 + ", ".join(self.dbxrefs))
561 lines.append("Number of features: %i" % len(self.features))
562 for a in self.annotations:
563 lines.append("/%s=%s" % (a, str(self.annotations[a])))
564 if self.letter_annotations:
565 lines.append("Per letter annotation for: " \
566 + ", ".join(self.letter_annotations.keys()))
567
568
569 lines.append(repr(self.seq))
570 return "\n".join(lines)
571
573 """A concise summary of the record for debugging (string).
574
575 The python built in function repr works by calling the object's ___repr__
576 method. e.g.
577
578 >>> from Bio.Seq import Seq
579 >>> from Bio.SeqRecord import SeqRecord
580 >>> from Bio.Alphabet import generic_protein
581 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
582 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
583 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
584 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
585 ... generic_protein),
586 ... id="NP_418483.1", name="b4059",
587 ... description="ssDNA-binding protein",
588 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
589 >>> print repr(rec)
590 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
591
592 At the python prompt you can also use this shorthand:
593
594 >>> rec
595 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
596
597 Note that long sequences are shown truncated. Also note that any
598 annotations, letter_annotations and features are not shown (as they
599 would lead to a very long string).
600 """
601 return self.__class__.__name__ \
602 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
603 % tuple(map(repr, (self.seq, self.id, self.name,
604 self.description, self.dbxrefs)))
605
639
657
659 """Returns the length of the sequence.
660
661 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
662
663 >>> from Bio import SeqIO
664 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta")
665 >>> len(record)
666 309
667 >>> len(record.seq)
668 309
669 """
670 return len(self.seq)
671
673 """Returns True regardless of the length of the sequence.
674
675 This behaviour is for backwards compatibility, since until the
676 __len__ method was added, a SeqRecord always evaluated as True.
677
678 Note that in comparison, a Seq object will evaluate to False if it
679 has a zero length sequence.
680
681 WARNING: The SeqRecord may in future evaluate to False when its
682 sequence is of zero length (in order to better match the Seq
683 object behaviour)!
684 """
685 return True
686
688 """Add another sequence or string to this sequence.
689
690 The other sequence can be a SeqRecord object, a Seq object (or
691 similar, e.g. a MutableSeq) or a plain Python string. If you add
692 a plain string or a Seq (like) object, the new SeqRecord will simply
693 have this appended to the existing data. However, any per letter
694 annotation will be lost:
695
696 >>> from Bio import SeqIO
697 >>> handle = open("Quality/solexa_faked.fastq", "rU")
698 >>> record = SeqIO.read(handle, "fastq-solexa")
699 >>> handle.close()
700 >>> print record.id, record.seq
701 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
702 >>> print record.letter_annotations.keys()
703 ['solexa_quality']
704
705 >>> new = record + "ACT"
706 >>> print new.id, new.seq
707 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
708 >>> print new.letter_annotations.keys()
709 []
710
711 The new record will attempt to combine the annotation, but for any
712 ambiguities (e.g. different names) it defaults to omitting that
713 annotation.
714
715 >>> from Bio import SeqIO
716 >>> handle = open("GenBank/pBAD30.gb")
717 >>> plasmid = SeqIO.read(handle, "gb")
718 >>> handle.close()
719 >>> print plasmid.id, len(plasmid)
720 pBAD30 4923
721
722 Now let's cut the plasmid into two pieces, and join them back up the
723 other way round (i.e. shift the starting point on this plasmid, have
724 a look at the annotated features in the original file to see why this
725 particular split point might make sense):
726
727 >>> left = plasmid[:3765]
728 >>> right = plasmid[3765:]
729 >>> new = right + left
730 >>> print new.id, len(new)
731 pBAD30 4923
732 >>> str(new.seq) == str(right.seq + left.seq)
733 True
734 >>> len(new.features) == len(left.features) + len(right.features)
735 True
736
737 When we add the left and right SeqRecord objects, their annotation
738 is all consistent, so it is all conserved in the new SeqRecord:
739
740 >>> new.id == left.id == right.id == plasmid.id
741 True
742 >>> new.name == left.name == right.name == plasmid.name
743 True
744 >>> new.description == plasmid.description
745 True
746 >>> new.annotations == left.annotations == right.annotations
747 True
748 >>> new.letter_annotations == plasmid.letter_annotations
749 True
750 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
751 True
752
753 However, we should point out that when we sliced the SeqRecord,
754 any annotations dictionary or dbxrefs list entries were lost.
755 You can explicitly copy them like this:
756
757 >>> new.annotations = plasmid.annotations.copy()
758 >>> new.dbxrefs = plasmid.dbxrefs[:]
759 """
760 if not isinstance(other, SeqRecord):
761
762
763 return SeqRecord(self.seq + other,
764 id = self.id, name = self.name,
765 description = self.description,
766 features = self.features[:],
767 annotations = self.annotations.copy(),
768 dbxrefs = self.dbxrefs[:])
769
770 answer = SeqRecord(self.seq + other.seq,
771 features = self.features[:],
772 dbxrefs = self.dbxrefs[:])
773
774 l = len(self)
775 for f in other.features:
776 answer.features.append(f._shift(l))
777 del l
778 for ref in other.dbxrefs:
779 if ref not in answer.dbxrefs:
780 answer.append(ref)
781
782 if self.id == other.id:
783 answer.id = self.id
784 if self.name == other.name:
785 answer.name = self.name
786 if self.description == other.description:
787 answer.description = self.description
788 for k,v in self.annotations.iteritems():
789 if k in other.annotations and other.annotations[k] == v:
790 answer.annotations[k] = v
791
792 for k,v in self.letter_annotations.iteritems():
793 if k in other.letter_annotations:
794 answer.letter_annotations[k] = v + other.letter_annotations[k]
795 return answer
796
798 """Add another sequence or string to this sequence (from the left).
799
800 This method handles adding a Seq object (or similar, e.g. MutableSeq)
801 or a plain Python string (on the left) to a SeqRecord (on the right).
802 See the __add__ method for more details, but for example:
803
804 >>> from Bio import SeqIO
805 >>> handle = open("Quality/solexa_faked.fastq", "rU")
806 >>> record = SeqIO.read(handle, "fastq-solexa")
807 >>> handle.close()
808 >>> print record.id, record.seq
809 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
810 >>> print record.letter_annotations.keys()
811 ['solexa_quality']
812
813 >>> new = "ACT" + record
814 >>> print new.id, new.seq
815 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
816 >>> print new.letter_annotations.keys()
817 []
818 """
819 if isinstance(other, SeqRecord):
820 raise RuntimeError("This should have happened via the __add__ of "
821 "the other SeqRecord being added!")
822
823
824 offset = len(other)
825 return SeqRecord(other + self.seq,
826 id = self.id, name = self.name,
827 description = self.description,
828 features = [f._shift(offset) for f in self.features],
829 annotations = self.annotations.copy(),
830 dbxrefs = self.dbxrefs[:])
831
833 """Run the Bio.SeqRecord module's doctests (PRIVATE).
834
835 This will try and locate the unit tests directory, and run the doctests
836 from there in order that the relative paths used in the examples work.
837 """
838 import doctest
839 import os
840 if os.path.isdir(os.path.join("..","Tests")):
841 print "Runing doctests..."
842 cur_dir = os.path.abspath(os.curdir)
843 os.chdir(os.path.join("..","Tests"))
844 doctest.testmod()
845 os.chdir(cur_dir)
846 del cur_dir
847 print "Done"
848 elif os.path.isdir(os.path.join("Tests")) :
849 print "Runing doctests..."
850 cur_dir = os.path.abspath(os.curdir)
851 os.chdir(os.path.join("Tests"))
852 doctest.testmod()
853 os.chdir(cur_dir)
854 del cur_dir
855 print "Done"
856
857 if __name__ == "__main__":
858 _test()
859