1 """Represent a Sequence Feature holding info about a part of a sequence.
2
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
6
7 What's here:
8
9 Base class to hold a Feature.
10 ----------------------------
11 classes:
12 o SeqFeature
13
14 Hold information about a Reference.
15 ----------------------------------
16
17 This is an attempt to create a General class to hold Reference type
18 information.
19
20 classes:
21 o Reference
22
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
25
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
29 with Biocorba.
30
31 classes:
32 o FeatureLocation - Specify the start and end location of a feature.
33
34 o ExactPosition - Specify the position as being exact.
35 o WithinPosition - Specify a position occurring within some range.
36 o BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
39 o OneOfPosition - Specify a position where the location can be multiple positions.
40 """
41
42 from Bio.Seq import MutableSeq, reverse_complement
43
45 """Represent a Sequence Feature on an object.
46
47 Attributes:
48 o location - the location of the feature on the sequence (FeatureLocation)
49 o type - the specified type of the feature (ie. CDS, exon, repeat...)
50 o location_operator - a string specifying how this SeqFeature may
51 be related to others. For example, in the example GenBank feature
52 shown below, the location_operator would be "join"
53 o strand - A value specifying on which strand (of a DNA sequence, for
54 instance) the feature deals with. 1 indicates the plus strand, -1
55 indicates the minus strand, 0 indicates both strands, and None indicates
56 that strand doesn't apply (ie. for proteins) or is not known.
57 o id - A string identifier for the feature.
58 o ref - A reference to another sequence. This could be an accession
59 number for some different sequence.
60 o ref_db - A different database for the reference accession number.
61 o qualifiers - A dictionary of qualifiers on the feature. These are
62 analogous to the qualifiers from a GenBank feature table. The keys of
63 the dictionary are qualifier names, the values are the qualifier
64 values.
65 o sub_features - Additional SeqFeatures which fall under this 'parent'
66 feature. For instance, if we have something like:
67
68 CDS join(1..10,30..40,50..60)
69
70 Then the top level feature would be a CDS from 1 to 60, and the sub
71 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
72 40 and 50 to 60, respectively.
73
74 To get the nucleotide sequence for this CDS, you would need to take the
75 parent sequence and do seq[0:10]+seq[29:40]+seq[49:60] (Python counting).
76 Things are more complicated with strands and fuzzy positions. To save you
77 dealing with all these special cases, the SeqFeature provides an extract
78 method to do this for you.
79 """
def __init__(self, location=None, type='', location_operator='',
             strand=None, id="<unknown id>",
             qualifiers=None, sub_features=None,
             ref=None, ref_db=None):
    """Initialize a SeqFeature on a Sequence.

    Arguments:
     o location - a FeatureLocation describing where the feature sits, or
       None. Anything else (including a Python slice) is rejected.
     o type - the feature type string (e.g. "CDS").
     o location_operator - how this feature relates to its sub-features
       (e.g. "join").
     o strand - +1, -1, 0 or None.
     o id - identifier string for the feature.
     o qualifiers - dictionary of qualifiers; a fresh dict is created for
       each feature when omitted.
     o sub_features - list of child SeqFeature objects; a fresh list is
       created for each feature when omitted.
     o ref, ref_db - reference to another sequence and its database.

    Raises ValueError for an invalid strand and TypeError when location
    is neither None nor a FeatureLocation.

    e.g. With no strand, on the forward strand, and on the reverse strand:

    >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
    >>> f1 = SeqFeature(FeatureLocation(5,10), type="domain")
    >>> f2 = SeqFeature(FeatureLocation(7,110), strand=1, type="CDS")
    >>> f3 = SeqFeature(FeatureLocation(9,108), strand=-1, type="CDS")

    An invalid strand will trigger an exception:

    >>> f4 = SeqFeature(FeatureLocation(50,60), strand=2)
    Traceback (most recent call last):
       ...
    ValueError: Strand should be +1, -1, 0 or None, not 2

    For exact start/end positions, an integer can be used (as shown above)
    as shorthand for the ExactPosition object. For non-exact locations, the
    FeatureLocation must be specified via the appropriate position objects.
    """
    if strand not in [-1, 0, 1, None]:
        raise ValueError("Strand should be +1, -1, 0 or None, not %s"
                         % repr(strand))
    # Compare against None explicitly: a plain truthiness test would let
    # falsy junk such as 0 or "" slip past the type check.
    if location is not None and not isinstance(location, FeatureLocation):
        raise TypeError("FeatureLocation (or None) required for the location")
    self.location = location

    self.type = type
    self.location_operator = location_operator
    self.strand = strand
    self.id = id
    # None is used as the default so that instances never share one
    # mutable dict/list between them.
    if qualifiers is None:
        qualifiers = {}
    self.qualifiers = qualifiers
    if sub_features is None:
        sub_features = []
    self.sub_features = sub_features
    self.ref = ref
    self.ref_db = ref_db
126
128 """A string representation of the record for debugging."""
129 answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
130 if self.type:
131 answer += ", type=%s" % repr(self.type)
132 if self.location_operator:
133 answer += ", location_operator=%s" % repr(self.location_operator)
134 if self.strand:
135 answer += ", strand=%s" % repr(self.strand)
136 if self.id and self.id != "<unknown id>":
137 answer += ", id=%s" % repr(self.id)
138 if self.ref:
139 answer += ", ref=%s" % repr(self.ref)
140 if self.ref_db:
141 answer += ", ref_db=%s" % repr(self.ref_db)
142 answer += ")"
143 return answer
144
146 """A readable summary of the feature intended to be printed to screen.
147 """
148 out = "type: %s\n" % self.type
149 out += "location: %s\n" % self.location
150 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
151 out += "strand: %s\n" % self.strand
152 out += "qualifiers: \n"
153 qualifier_keys = self.qualifiers.keys()
154 qualifier_keys.sort()
155 for qual_key in qualifier_keys:
156 out += " Key: %s, Value: %s\n" % (qual_key,
157 self.qualifiers[qual_key])
158 if len(self.sub_features) != 0:
159 out += "Sub-Features\n"
160 for sub_feature in self.sub_features:
161 out +="%s\n" % sub_feature
162
163 return out
164
def _shift(self, offset):
    """Returns a copy of the feature with its location shifted (PRIVATE).

    The location and every sub-feature are shifted by offset through their
    own _shift methods. The annotation qualifiers are copied into a new
    dictionary (shallow copy), so editing the copy's dictionary does not
    touch this feature's dictionary.
    """
    answer = SeqFeature(location=self.location._shift(offset),
                        type=self.type,
                        location_operator=self.location_operator,
                        strand=self.strand,
                        id=self.id,
                        ref=self.ref,
                        ref_db=self.ref_db)
    # Sub-features and qualifiers are attached afterwards so each gets a
    # new container rather than sharing this feature's objects.
    answer.sub_features = [f._shift(offset) for f in self.sub_features]
    # dict(mapping) is the portable shallow copy (no iteritems needed).
    answer.qualifiers = dict(self.qualifiers)
    return answer
182
def extract(self, parent_sequence):
    """Extract feature sequence from the supplied parent sequence.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.

    This should cope with complex locations including complements, joins
    and fuzzy positions. Even mixed strand features should work! This
    also covers features on protein sequences (e.g. domains), although
    here reverse strand features are not permitted.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_protein
    >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
    >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
    >>> f.extract(seq)
    Seq('VALIVIC', ProteinAlphabet())

    Note - currently only sub-features of type "join" are supported; any
    other location_operator on a feature with sub-features raises
    ValueError carrying that operator.
    """
    if isinstance(parent_sequence, MutableSeq):
        # Switch to the immutable form before slicing and concatenating.
        parent_sequence = parent_sequence.toseq()
    if self.sub_features:
        if self.location_operator != "join":
            raise ValueError(self.location_operator)
        if self.strand == -1:
            # Reverse strand join: take every part as a plain forward slice
            # here; the combined sequence is reverse complemented once below.
            parts = []
            for f_sub in self.sub_features:
                assert f_sub.strand == -1
                parts.append(parent_sequence[f_sub.location.nofuzzy_start:
                                             f_sub.location.nofuzzy_end])
        else:
            # Each part handles its own strand via a recursive call.
            parts = [f_sub.extract(parent_sequence)
                     for f_sub in self.sub_features]
        # Start from the first part so the result keeps that part's type.
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
    else:
        f_seq = parent_sequence[self.location.nofuzzy_start:
                                self.location.nofuzzy_end]
    if self.strand == -1:
        try:
            f_seq = f_seq.reverse_complement()
        except AttributeError:
            # Objects without the method are expected to be plain strings,
            # which go through the module level helper instead.
            assert isinstance(f_seq, str)
            f_seq = reverse_complement(f_seq)
    return f_seq
238
239
240
241
242
244 """Represent a Generic Reference object.
245
246 Attributes:
247 o location - A list of Location objects specifying regions of
248 the sequence that the references correspond to. If no locations are
249 specified, the entire sequence is assumed.
250 o authors - A big old string, or a list split by author, of authors
251 for the reference.
252 o title - The title of the reference.
253 o journal - Journal the reference was published in.
254 o medline_id - A medline reference for the article.
255 o pubmed_id - A pubmed reference for the article.
256 o comment - A place to stick any comments about the reference.
257 """
267
269 """Output an informative string for debugging.
270 """
271 out = ""
272 for single_location in self.location:
273 out += "location: %s\n" % single_location
274 out += "authors: %s\n" % self.authors
275 if self.consrtm:
276 out += "consrtm: %s\n" % self.consrtm
277 out += "title: %s\n" % self.title
278 out += "journal: %s\n" % self.journal
279 out += "medline id: %s\n" % self.medline_id
280 out += "pubmed id: %s\n" % self.pubmed_id
281 out += "comment: %s\n" % self.comment
282 return out
283
285
286 return "%s(title=%s, ...)" % (self.__class__.__name__,
287 repr(self.title))
288
289
290
292 """Specify the location of a feature along a sequence.
293
294 This attempts to deal with fuzziness of position ends, but also
295 make it easy to get the start and end in the 'normal' case (no
296 fuzziness).
297
298 You should access the start and end attributes with
299 your_location.start and your_location.end. If the start and
300 end are exact, this will return the positions, if not, we'll return
301 the appropriate Fuzzy class with info about the position and fuzziness.
302
303 Note that the start and end location numbering follow Python's scheme,
304 thus a GenBank entry of 123..150 (one based counting) becomes a location
305 of [122:150] (zero based counting).
306 """
def __init__(self, start, end):
    """Specify the start and end of a sequence feature.

    start and end arguments specify the values where the feature begins
    and ends. These can either be any of the *Position objects that
    inherit from AbstractPosition, or can just be integers specifying the
    position. In the case of integers, the values are assumed to be
    exact and are converted in ExactPosition arguments. This is meant
    to make it easy to deal with non-fuzzy ends.

    i.e. Short form:

    >>> from Bio.SeqFeature import FeatureLocation
    >>> loc = FeatureLocation(5,10)

    Explicit form:

    >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
    >>> loc = FeatureLocation(ExactPosition(5),ExactPosition(10))

    Other fuzzy positions are used similarly,

    >>> from Bio.SeqFeature import FeatureLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc2 = FeatureLocation(BeforePosition(5),AfterPosition(10))

    """
    # Position objects are stored untouched; anything else is wrapped in
    # an ExactPosition.
    if not isinstance(start, AbstractPosition):
        start = ExactPosition(start)
    self._start = start

    if not isinstance(end, AbstractPosition):
        end = ExactPosition(end)
    self._end = end
343
345 """Returns a representation of the location (with python counting).
346
347 For the simple case this uses the python splicing syntax, [122:150]
348 (zero based counting) which GenBank would call 123..150 (one based
349 counting).
350 """
351 return "[%s:%s]" % (self._start, self._end)
352
354 """A string representation of the location for debugging."""
355 return "%s(%s,%s)" \
356 % (self.__class__.__name__, repr(self.start), repr(self.end))
357
362
# Read-only views of the stored boundary objects: only a getter is given.
start = property(lambda self: self._start, None, None,
                 "Start location (possibly a fuzzy position, read only).")

end = property(lambda self: self._end, None, None,
               "End location (possibly a fuzzy position, read only).")
368
377 nofuzzy_start = property(fget=_get_nofuzzy_start,
378 doc="""Start position (integer, approximated if fuzzy, read only).
379
380 To get non-fuzzy attributes (ie. the position only) ask for
381 'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
382 the largest range of the fuzzy position. So something like:
383 (10.20)..(30.40) should return 10 for start, and 40 for end.
384 """)
385
387
def _get_nofuzzy_end(self):
    """Return the end as a plain number, approximated if fuzzy (PRIVATE).

    When start and end compare equal and the start is a BetweenPosition,
    the end's own position is returned. Otherwise the result is the larger
    of the end position and the end position plus its extension.
    """
    end_pos = self._end
    if (self._start == end_pos) and isinstance(self._start, BetweenPosition):
        return end_pos.position
    return max(end_pos.position, end_pos.position + end_pos.extension)
nofuzzy_end = property(fget=_get_nofuzzy_end,
    doc="""End position (integer, approximated if fuzzy, read only).

    To get non-fuzzy attributes (ie. the position only) ask for
    'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
    the largest range of the fuzzy position. So something like:
    (10.20)..(30.40) should return 10 for start, and 40 for end.
    """)
402
404 """Abstract base class representing a position.
405 """
406 - def __init__(self, position, extension):
407 self.position = position
408 self.extension = extension
409
411 """String representation of the location for debugging."""
412 return "%s(%s,%s)" % (self.__class__.__name__, \
413 repr(self.position), repr(self.extension))
414
def __cmp__(self, other):
    """A simple comparison function for positions.

    This is very simple-minded and just compares the position attribute
    of the features; extensions are not considered at all. This could
    potentially be expanded to try to take advantage of extensions.

    The other object must be an AbstractPosition (checked with an assert).
    """
    assert isinstance(other, AbstractPosition), \
        "We can only do comparisons between Biopython Position objects."

    mine, theirs = self.position, other.position
    return cmp(mine, theirs)
426
428
429 return self.__class__(self.position + offset, self.extension)
430
432 """Specify the specific position of a boundary.
433
434 o position - The position of the boundary.
435 o extension - An optional argument which must be zero since we don't
436 have an extension. The argument is provided so that the same number of
437 arguments can be passed to all position types.
438
439 In this case, there is no fuzziness associated with the position.
440 """
def __init__(self, position, extension=0):
    """Set up an exact position; extension must be zero.

    Raises AttributeError if a non-zero extension is supplied.
    """
    if extension != 0:
        message = "Non-zero extension %s for exact position." % extension
        raise AttributeError(message)
    AbstractPosition.__init__(self, position, 0)
446
448 """String representation of the ExactPosition location for debugging."""
449 assert self.extension == 0
450 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
451
453 return str(self.position)
454
456 """Specify the position of a boundary within some coordinates.
457
458 Arguments:
459 o position - The start position of the boundary
460 o extension - The range to which the boundary can extend.
461
462 This allows dealing with a position like ((1.4)..100). This
463 indicates that the start of the sequence is somewhere between 1
464 and 4. To represent that with this class we would set position as
465 1 and extension as 3.
466 """
467 - def __init__(self, position, extension = 0):
469
471 return "(%s.%s)" % (self.position, self.position + self.extension)
472
474 """Specify the position of a boundary between two coordinates (OBSOLETE?).
475
476 Arguments:
477 o position - The start position of the boundary.
478 o extension - The range to the other position of a boundary.
479
480 This specifies a coordinate which is found between the two positions.
481 So this allows us to deal with a position like ((1^2)..100). To
482 represent that with this class we set position as 1 and the
483 extension as 1.
484 """
485 - def __init__(self, position, extension = 0):
487
489 return "(%s^%s)" % (self.position, self.position + self.extension)
490
492 """Specify a position where the actual location occurs before it.
493
494 Arguments:
495 o position - The upper boundary of where the location can occur.
496 o extension - An optional argument which must be zero since we don't
497 have an extension. The argument is provided so that the same number of
498 arguments can be passed to all position types.
499
500 This is used to specify positions like (<10..100) where the location
501 occurs somewhere before position 10.
502 """
def __init__(self, position, extension=0):
    """Set up a 'before' position; extension must be zero.

    Arguments:
     o position - The upper boundary of where the location can occur.
     o extension - Accepted only so that all position types take the same
       arguments; it must be zero.

    Raises AttributeError if a non-zero extension is supplied (the
    exception type is kept so existing handlers continue to work).
    """
    if extension != 0:
        # Name the right position type; the text previously said "exact".
        raise AttributeError("Non-zero extension %s for before position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
508
510 """A string representation of the location for debugging."""
511 assert self.extension == 0
512 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
513
515 return "<%s" % self.position
516
518 """Specify a position where the actual location is found after it.
519
520 Arguments:
521 o position - The lower boundary of where the location can occur.
522 o extension - An optional argument which must be zero since we don't
523 have an extension. The argument is provided so that the same number of
524 arguments can be passed to all position types.
525
526 This is used to specify positions like (>10..100) where the location
527 occurs somewhere after position 10.
528 """
def __init__(self, position, extension=0):
    """Set up an 'after' position; extension must be zero.

    Arguments:
     o position - The lower boundary of where the location can occur.
     o extension - Accepted only so that all position types take the same
       arguments; it must be zero.

    Raises AttributeError if a non-zero extension is supplied (the
    exception type is kept so existing handlers continue to work).
    """
    if extension != 0:
        # Name the right position type; the text previously said "exact".
        raise AttributeError("Non-zero extension %s for after position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
534
536 """A string representation of the location for debugging."""
537 assert self.extension == 0
538 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
539
541 return ">%s" % self.position
542
544 """Specify a position where the location can be multiple positions.
545
546 This models the GenBank 'one-of(1888,1901)' function, and tries
547 to make this fit within the Biopython Position models. In our case
548 the position of the "one-of" is set as the lowest choice, and the
549 extension is the range to the highest choice.
550 """
def __init__(self, position_list):
    """Initialize with a set of possible positions.

    position_list is a list of AbstractPosition derived objects,
    specifying possible locations. It is kept as self.position_choices.

    The position of the one-of is set to the lowest choice and the
    extension to the distance up to the highest choice.

    Raises ValueError if position_list is empty, since there is then no
    lowest or highest choice to work from.
    """
    self.position_choices = position_list

    coordinates = []
    for position_choice in self.position_choices:
        assert isinstance(position_choice, AbstractPosition), \
            "Expected position objects, got %r" % position_choice
        coordinates.append(position_choice.position)
    if not coordinates:
        raise ValueError("OneOfPosition needs at least one position choice")

    smallest = min(coordinates)
    largest = max(coordinates)
    AbstractPosition.__init__(self, smallest, largest - smallest)
574
576 """String representation of the OneOfPosition location for debugging."""
577 return "%s(%s)" % (self.__class__.__name__, \
578 repr(self.position_choices))
579
581 out = "one-of("
582 for position in self.position_choices:
583 out += "%s," % position
584
585 out = out[:-1] + ")"
586 return out
587
def _shift(self, offset):
    """Return a new object built from every choice shifted by offset (PRIVATE)."""
    moved_choices = []
    for position_choice in self.position_choices:
        moved_choices.append(position_choice._shift(offset))
    return self.__class__(moved_choices)
591
593 """Simple class to hold information about a gap between positions.
594 """
def __init__(self, gap_size):
    """Initialize with a position object containing the gap information.

    The argument is stored unchanged as self.gap_size.
    """
    setattr(self, "gap_size", gap_size)
599
def __repr__(self):
    """A string representation of the position gap for debugging."""
    class_name = self.__class__.__name__
    return "%s(%s)" % (class_name, repr(self.gap_size))
603
def __str__(self):
    """Return the gap written as "gap(<gap_size>)"."""
    return "gap(%s)" % self.gap_size
607
def _test():
    """Run the Bio.SeqFeature module's doctests."""
    # print(...) with one string argument gives the same output whether
    # print is a statement or a function.
    print("Runing doctests...")
    import doctest
    doctest.testmod()
    print("Done")


if __name__ == "__main__":
    _test()
617