1
2
3
4
5
6
7
8 """Access to General Feature Format databases created with BioPerl (DEPRECATED).
9
10 This is the "old" Bio.GFF module by Michael Hoffman, which offers access to
11 a MySQL database holding GFF data loaded by BioPerl. This code has now been
12 deprecated, and will be removed (or at best, relocated) in order to free the
13 Bio.GFF namespace for a new GFF parser in Biopython (including GFF3 support).
14
15 Based on documentation for Lincoln Stein's Perl Bio::DB::GFF
16
17 >>> import os
18 >>> import Bio.GFF
19 >>> PASSWORD = os.environ['MYSQLPASS']
20 >>> DATABASE_GFF = 'wormbase_ws93'
21 >>> FASTADIR = '/local/wormbase_ws93/'
22 >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR)
23
24 """
25
26 import warnings
27 warnings.warn("The old Bio.GFF module for access to a MySQL GFF database "
28 "created with BioPerl is deprecated, and will be removed (or "
29 "possibly just moved) in a future release of Biopython. If you "
30 "want to continue to use this code, please get in contact with "
31 "the developers via the mailing lists to avoid its permanent "
32 "removal from Biopython. The plan is to re-use the Bio.GFF "
33 "namespace for a new GFF parsing module.", DeprecationWarning)
34
35 __version__ = "$Revision: 1.10 $"
36
37
38 import exceptions
39 import operator
40 import os.path
41 import sys
42 import types
43
44 from Bio import MissingExternalDependencyError
45
46 try:
47 import MySQLdb
48 except:
49 raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.")
50
51 from Bio.Alphabet import IUPAC
52 from Bio import DocSQL
53 from Bio.Seq import Seq
54 from Bio.SeqRecord import SeqRecord
55
56 import binning
57 import easy
58 import GenericTools
59
60
61 DEFAULT_ALPHABET = IUPAC.unambiguous_dna
62
64 """
65 this will only work for the simplest of easy.Location objects
66 """
77
80
86
88 """
89 Connection to GFF database
90
91 also functions as whole segment
92 """
93
102
103 - def segment(self, *args, **keywds):
104 return Segment(self, *args, **keywds)
105
107 """
108 Singleton: contain records of loaded FASTA files
109
110 >>> RetrieveSeqname._dir = 'GFF'
111 >>> RetrieveSeqname._diagnostics = 1
112 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them
113 >>> record1 = RetrieveSeqname('NC_001802.fna')
114 Loading GFF/NC_001802.fna... Done.
115 >>> record1.id
116 'gi|9629357|ref|NC_001802.1|'
117 >>> record2 = RetrieveSeqname('NC_001802.fna')
118 >>> record1.seq is record2.seq # should be same space in memory
119 1
120 """
121 __records = {}
122 _diagnostics = 0
123
136
138 """
139 strand may be:
140 +/0 = Watson
141 -/1 = Crick
142
143 I propose that we start calling these the Rosalind and Franklin strands
144
145 >>> RetrieveSeqname._dir = 'GFF'
146 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test
147 >>> feature.seq()
148 Seq('AATAAA', Alphabet())
149 >>> print feature.location()
150 NC_001802x.fna:73..78
151 >>> from Bio import SeqIO
152 >>> record = feature.record()
153 >>> records = [record]
154 >>> SeqIO.write(records, sys.stdout, 'fasta')
155 > NC_001802x.fna:73..78
156 AATAAA
157 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78"))
158 >>> writer.write(feature2.record())
159 > NC_001802x.fna:73..78
160 AATAAA
161 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)")
162 >>> feature3 = Feature(location=location3)
163 >>> writer.write(feature3.record())
164 > NC_001802x.fna:complement(73..78)
165 TTTATT
166 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631")
167 >>> feature4 = Feature(location=location4, frame=0)
168 >>> feature4.frame
169 0
170 >>> feature4.translate()[:7]
171 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
172 >>> feature4.frame = 6 # can't happen, but a useful demonstration
173 >>> feature4.translate()[:5]
174 Seq('ARASV', HasStopCodon(IUPACProtein(), '*'))
175 >>> feature4.frame = 1
176 >>> feature4.translate()[:5]
177 Seq('WVRER', HasStopCodon(IUPACProtein(), '*'))
178 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data
179 >>> feature5 = Feature(location=location5, frame=0)
180 >>> feature5.translate()[:7]
181 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
182 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351")
183 >>> feature6 = Feature(location=location6, frame=1)
184 >>> feature6.translate()
185 Seq('MGARA', HasStopCodon(IUPACProtein(), '*'))
186 """
187 - def __init__(self,
188 seqname=None,
189 start=None,
190 end=None,
191 strand="+",
192 frame=None,
193 location=None,
194 alphabet=DEFAULT_ALPHABET):
210
218
222
239
242
244 if self.target_start <= self.target_end:
245 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname)
246 else:
247
248 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
249
251 try:
252 return "%s:%s" % (self.gclass, self.gname)
253 except AttributeError:
254 return ""
255
258
261
263 return "Feature(%s)" % self.location()
264
266 """
267 row of FeatureQuery results
268 works like a Feature
269 """
279
281 """
282 SELECT fdata.fref AS seqname,
283 ftype.fsource AS source,
284 ftype.fmethod AS feature,
285 fdata.fstart AS start,
286 fdata.fstop AS end,
287 fdata.fscore AS score,
288 fdata.fstrand AS strand,
289 fdata.fphase AS frame,
290 fdata.ftarget_start AS target_start,
291 fdata.ftarget_stop AS target_end,
292 fdata.gid,
293 fgroup.gclass,
294 fgroup.gname
295 FROM fdata, ftype, fgroup
296 WHERE fdata.ftypeid = ftype.ftypeid
297 AND fdata.gid = fgroup.gid
298 """
299
300 row_class = FeatureQueryRow
301 - def __init__(self,
302 seqname=None,
303 coords=None,
304 complement=0,
305 method=None,
306 source=None,
307 object=None,
308 gid=None,
309 *args,
310 **keywds):
311
312 DocSQL.Query.__init__(self, *args, **keywds)
313
314 if seqname is not None:
315 self.statement += ' AND fref="%s"\n' % seqname
316 if coords is not None:
317 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1])
318 if coords[0] is not None:
319 self.statement += (" AND fstop >= %s\n" % coords[0])
320 if coords[1] is not None:
321 self.statement += (" AND fstart <= %s\n" % coords[1])
322 if method is not None:
323 self.statement += ' AND fmethod LIKE "%s"\n' % method
324 if source is not None:
325 self.statement += ' AND fsource LIKE "%s"\n' % source
326 if object is not None:
327 self.statement += ' AND %s\n' % object2fgroup_sql(object)
328 if gid is not None:
329 self.statement += ' AND fgroup.gid = %s\n' % gid
330
331 if complement:
332 self.statement += " ORDER BY 0-fstart, 0-fstop"
333 else:
334 self.statement += " ORDER BY fstart, fstop"
335
338
340 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
341
343 """
344 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol
345 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage
346 >>> aggregate = FeatureAggregate([feature1_1, feature1_2])
347 >>> print aggregate.location()
348 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642)
349 >>> xlate_str = aggregate.translate().tostring()
350 >>> xlate_str[:5], xlate_str[-5:]
351 ('MGARA', 'RQDED')
352
353 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)")
354 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)")
355 >>> feature2_1 = Feature(location=location1, frame=0)
356 >>> feature2_2 = Feature(location=location2, frame=0)
357 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2])
358 >>> print aggregate2.location()
359 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12))
360 >>> print aggregate2.translate()
361 Seq('TRET', HasStopCodon(IUPACProtein(), '*'))
362 >>> location1.reverse()
363 >>> location2.reverse()
364 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]])
365 >>> print aggregate3.location()
366 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)
367 >>> print aggregate3.translate()
368 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
369 >>> aggregate3[0].frame = 3
370 >>> print aggregate3.translate()
371 Seq('LSG', HasStopCodon(IUPACProtein(), '*'))
372
373 >>> aggregate4 = FeatureAggregate()
374 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0))
375 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2))
376 >>> aggregate4.seq()
377 Seq('GGTCTCTCTGGT', Alphabet())
378 >>> aggregate4.translate()
379 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
380 """
381 - def __init__(self, feature_query=None):
386
391
392 - def map(self, func):
397
400
402 attributes = ['alphabet', 'frame']
403 index = [0, -1][self.location().complement]
404 for attribute in attributes:
405 self.__dict__[attribute] = self[index].__dict__[attribute]
406 translation = Feature.translate(self)
407 try:
408 assert translation.tostring().index("*") == len(translation) - 1
409 return translation[:translation.tostring().index("*")]
410 except ValueError:
411 return translation
412
414 """
415 >>> object_split("Sequence:F02E9.2a")
416 ('Sequence', 'F02E9.2a')
417 """
418 return tuple(object.split(":"))
419
420 -def _test(*args, **keywds):
421 import doctest, sys
422 doctest.testmod(sys.modules[__name__], *args, **keywds)
423
424 if __name__ == "__main__":
425 if __debug__:
426 _test()
427