Package Bio :: Package GFF
[hide private]
[frames] | no frames]

Source Code for Package Bio.GFF

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Access to General Feature Format databases created with BioPerl (DEPRECATED). 
  9   
 10  This is the "old" Bio.GFF module by Michael Hoffman, which offers access to 
 11  a MySQL database holding GFF data loaded by BioPerl. This code has now been 
 12  deprecated, and will be removed (or at best, relocated) in order to free the 
 13  Bio.GFF namespace for a new GFF parser in Biopython (including GFF3 support). 
 14   
 15  Based on documentation for Lincoln Stein's Perl Bio::DB::GFF 
 16   
 17  >>> import os 
 18  >>> import Bio.GFF 
 19  >>> PASSWORD = os.environ['MYSQLPASS'] 
 20  >>> DATABASE_GFF = 'wormbase_ws93' 
 21  >>> FASTADIR = '/local/wormbase_ws93/' 
 22  >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR) 
 23   
 24  """ 
 25   
 26  import warnings 
 27  warnings.warn("The old Bio.GFF module for access to a MySQL GFF database " 
 28                "created with BioPerl is deprecated, and will be removed (or " 
 29                "possibly just moved) in a future release of Biopython.  If you " 
 30                "want to continue to use this code, please get in contact with " 
 31                "the developers via the mailing lists to avoid its permanent " 
 32                "removal from Biopython. The plan is to re-use the Bio.GFF " 
 33                "namespace for a new GFF parsing module.", DeprecationWarning) 
 34   
 35  __version__ = "$Revision: 1.10 $" 
 36  # $Source: /home/bartek/cvs2bzr/biopython_fastimport/cvs_repo/biopython/Bio/GFF/__init__.py,v $ 
 37   
 38  import exceptions 
 39  import operator 
 40  import os.path 
 41  import sys 
 42  import types 
 43   
 44  from Bio import MissingExternalDependencyError 
 45   
 46  try: 
 47      import MySQLdb 
 48  except: 
 49      raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.") 
 50   
 51  from Bio.Alphabet import IUPAC 
 52  from Bio import DocSQL 
 53  from Bio.Seq import Seq 
 54  from Bio.SeqRecord import SeqRecord 
 55   
 56  import binning 
 57  import easy 
 58  import GenericTools 
 59   
 60   
 61  DEFAULT_ALPHABET = IUPAC.unambiguous_dna 
 62   
63 -class Segment(object):
64 """ 65 this will only work for the simplest of easy.Location objects 66 """
67 - def __init__(self, gff, location=None):
68 self.gff = gff 69 if location is None: 70 self.seqname = None 71 self.coords = None 72 self.complement = None 73 else: 74 self.seqname = location.seqname 75 self.coords = [location.start(), location.end()] 76 self.complement = location.complement
77
78 - def features(self, *args, **keywds):
79 return FeatureQuery(self.seqname, self.coords, self.complement, connection=self.gff.db, *args, **keywds)
80
81 - def single_feature(self, *args, **keywds):
82 features = self.features(*args, **keywds) 83 all_features = GenericTools.all(features) 84 assert len(all_features) == 1 85 return all_features[0]
86
87 -class Connection(Segment):
88 """ 89 Connection to GFF database 90 91 also functions as whole segment 92 """ 93
94 - def __init__(self, *args, **keywds):
95 try: 96 RetrieveSeqname._dir = keywds['fastadir'] 97 del keywds['fastadir'] 98 except KeyError: 99 RetrieveSeqname._dir = '.' 100 self.db = MySQLdb.connect(*args, **keywds) 101 Segment.__init__(self, self)
102
103 - def segment(self, *args, **keywds):
104 return Segment(self, *args, **keywds)
105
106 -class RetrieveSeqname(GenericTools.Surrogate, SeqRecord):
107 """ 108 Singleton: contain records of loaded FASTA files 109 110 >>> RetrieveSeqname._dir = 'GFF' 111 >>> RetrieveSeqname._diagnostics = 1 112 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them 113 >>> record1 = RetrieveSeqname('NC_001802.fna') 114 Loading GFF/NC_001802.fna... Done. 115 >>> record1.id 116 'gi|9629357|ref|NC_001802.1|' 117 >>> record2 = RetrieveSeqname('NC_001802.fna') 118 >>> record1.seq is record2.seq # should be same space in memory 119 1 120 """ 121 __records = {} 122 _diagnostics = 0 123
124 - def __init__(self, seqname):
125 try: 126 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 127 except KeyError: 128 filename = os.path.join(self._dir, seqname) 129 if self._diagnostics: 130 sys.stderr.write("Loading %s..." % filename) 131 record = easy.fasta_single(filename) 132 self.__records[seqname] = record 133 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 134 if self._diagnostics: 135 print >>sys.stderr, " Done."
136
137 -class Feature(object):
138 """ 139 strand may be: 140 +/0 = Watson 141 -/1 = Crick 142 143 I propose that we start calling these the Rosalind and Franklin strands 144 145 >>> RetrieveSeqname._dir = 'GFF' 146 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test 147 >>> feature.seq() 148 Seq('AATAAA', Alphabet()) 149 >>> print feature.location() 150 NC_001802x.fna:73..78 151 >>> from Bio import SeqIO 152 >>> record = feature.record() 153 >>> records = [record] 154 >>> SeqIO.write(records, sys.stdout, 'fasta') 155 > NC_001802x.fna:73..78 156 AATAAA 157 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78")) 158 >>> writer.write(feature2.record()) 159 > NC_001802x.fna:73..78 160 AATAAA 161 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)") 162 >>> feature3 = Feature(location=location3) 163 >>> writer.write(feature3.record()) 164 > NC_001802x.fna:complement(73..78) 165 TTTATT 166 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631") 167 >>> feature4 = Feature(location=location4, frame=0) 168 >>> feature4.frame 169 0 170 >>> feature4.translate()[:7] 171 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 172 >>> feature4.frame = 6 # can't happen, but a useful demonstration 173 >>> feature4.translate()[:5] 174 Seq('ARASV', HasStopCodon(IUPACProtein(), '*')) 175 >>> feature4.frame = 1 176 >>> feature4.translate()[:5] 177 Seq('WVRER', HasStopCodon(IUPACProtein(), '*')) 178 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data 179 >>> feature5 = Feature(location=location5, frame=0) 180 >>> feature5.translate()[:7] 181 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 182 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351") 183 >>> feature6 = Feature(location=location6, frame=1) 184 >>> feature6.translate() 185 Seq('MGARA', HasStopCodon(IUPACProtein(), '*')) 186 """
187 - def __init__(self, 188 seqname=None, 189 start=None, 190 end=None, 191 strand="+", 192 frame=None, 193 location=None, 194 alphabet=DEFAULT_ALPHABET):
195 if not isinstance(seqname, (types.StringType, types.NoneType)): 196 raise exceptions.TypeError("seqname needs to be string") 197 self.frame = frame 198 self.alphabet = alphabet 199 if location is None: 200 self.seqname = seqname 201 self.start = start 202 self.end = end 203 self.strand = strand 204 return 205 else: 206 self.seqname = location.seqname 207 self.start = location.start() + 1 208 self.end = location.end() + 1 209 self.strand = location.complement
210
211 - def __hash__(self):
212 return hash((self.seqname, 213 self.start, 214 self.end, 215 self.strand, 216 self.frame, 217 self.alphabet))
218
219 - def seq(self):
220 rec = RetrieveSeqname(self.seqname) 221 return easy.record_subseq(rec, self.location(), upper=1)
222
223 - def translate(self):
224 seq = self.seq() 225 try: 226 seq = Seq(seq.tostring()[self.frame:], self.alphabet) 227 except TypeError: 228 seq.alphabet = self.alphabet 229 try: 230 return seq.translate() #default table 231 except AssertionError: 232 # if the feature was pickled then we have problems 233 import cPickle 234 if cPickle.dumps(seq.alphabet) == cPickle.dumps(DEFAULT_ALPHABET): 235 seq.alphabet = DEFAULT_ALPHABET 236 return seq.translate() 237 else: 238 raise
239
240 - def location(self):
241 return easy.LocationFromCoords(self.start-1, self.end-1, self.strand, seqname=self.seqname)
242
243 - def target_location(self):
244 if self.target_start <= self.target_end: 245 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname) 246 else: 247 # switch start and end and make it complement: 248 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
249
250 - def id(self):
251 try: 252 return "%s:%s" % (self.gclass, self.gname) 253 except AttributeError: 254 return ""
255
256 - def record(self):
257 return SeqRecord(self.seq(), self.id(), "", self.location())
258
259 - def xrange(self):
260 return xrange(self.start, self.end)
261
262 - def __str__(self):
263 return "Feature(%s)" % self.location()
264
265 -class FeatureQueryRow(DocSQL.QueryRow, Feature):
266 """ 267 row of FeatureQuery results 268 works like a Feature 269 """
270 - def __init__(self, *args, **keywds):
271 DocSQL.QueryRow.__init__(self, *args, **keywds) 272 try: 273 self.frame = int(self.frame) 274 except ValueError: 275 self.frame = '' 276 except TypeError: 277 self.frame = None 278 self.alphabet = DEFAULT_ALPHABET
279
280 -class FeatureQuery(DocSQL.Query):
281 """ 282 SELECT fdata.fref AS seqname, 283 ftype.fsource AS source, 284 ftype.fmethod AS feature, 285 fdata.fstart AS start, 286 fdata.fstop AS end, 287 fdata.fscore AS score, 288 fdata.fstrand AS strand, 289 fdata.fphase AS frame, 290 fdata.ftarget_start AS target_start, 291 fdata.ftarget_stop AS target_end, 292 fdata.gid, 293 fgroup.gclass, 294 fgroup.gname 295 FROM fdata, ftype, fgroup 296 WHERE fdata.ftypeid = ftype.ftypeid 297 AND fdata.gid = fgroup.gid 298 """ 299 300 row_class = FeatureQueryRow
301 - def __init__(self, 302 seqname=None, 303 coords=None, 304 complement=0, 305 method=None, 306 source=None, 307 object=None, # "class:name" 308 gid=None, 309 *args, 310 **keywds):
311 312 DocSQL.Query.__init__(self, *args, **keywds) 313 314 if seqname is not None: 315 self.statement += ' AND fref="%s"\n' % seqname 316 if coords is not None: 317 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1]) 318 if coords[0] is not None: 319 self.statement += (" AND fstop >= %s\n" % coords[0]) 320 if coords[1] is not None: 321 self.statement += (" AND fstart <= %s\n" % coords[1]) 322 if method is not None: 323 self.statement += ' AND fmethod LIKE "%s"\n' % method # LIKE allows SQL "%" wildcards 324 if source is not None: 325 self.statement += ' AND fsource LIKE "%s"\n' % source 326 if object is not None: 327 self.statement += ' AND %s\n' % object2fgroup_sql(object) 328 if gid is not None: 329 self.statement += ' AND fgroup.gid = %s\n' % gid 330 331 if complement: 332 self.statement += " ORDER BY 0-fstart, 0-fstop" 333 else: 334 self.statement += " ORDER BY fstart, fstop"
335
336 - def aggregate(self):
337 return FeatureAggregate(self)
338
339 -def object2fgroup_sql(object):
340 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
341
342 -class FeatureAggregate(list, Feature):
343 """ 344 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol 345 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage 346 >>> aggregate = FeatureAggregate([feature1_1, feature1_2]) 347 >>> print aggregate.location() 348 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642) 349 >>> xlate_str = aggregate.translate().tostring() 350 >>> xlate_str[:5], xlate_str[-5:] 351 ('MGARA', 'RQDED') 352 353 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)") 354 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)") 355 >>> feature2_1 = Feature(location=location1, frame=0) 356 >>> feature2_2 = Feature(location=location2, frame=0) 357 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2]) 358 >>> print aggregate2.location() 359 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)) 360 >>> print aggregate2.translate() 361 Seq('TRET', HasStopCodon(IUPACProtein(), '*')) 362 >>> location1.reverse() 363 >>> location2.reverse() 364 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]]) 365 >>> print aggregate3.location() 366 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12) 367 >>> print aggregate3.translate() 368 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 369 >>> aggregate3[0].frame = 3 370 >>> print aggregate3.translate() 371 Seq('LSG', HasStopCodon(IUPACProtein(), '*')) 372 373 >>> aggregate4 = FeatureAggregate() 374 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0)) 375 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2)) 376 >>> aggregate4.seq() 377 Seq('GGTCTCTCTGGT', Alphabet()) 378 >>> aggregate4.translate() 379 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 380 """
381 - def __init__(self, feature_query=None):
382 if feature_query is None: 383 list.__init__(self, []) 384 else: 385 list.__init__(self, map(lambda x: x, feature_query))
386
387 - def location(self):
388 loc = easy.LocationJoin(map(lambda x: x.location(), self)) 389 loc.reorient() 390 return loc
391
392 - def map(self, func):
393 mapped = map(func, self) 394 if self.location().complement: 395 mapped.reverse() 396 return reduce(operator.add, mapped)
397
398 - def seq(self):
399 return self.map(lambda x: x.seq())
400
401 - def translate(self):
402 attributes = ['alphabet', 'frame'] 403 index = [0, -1][self.location().complement] 404 for attribute in attributes: 405 self.__dict__[attribute] = self[index].__dict__[attribute] 406 translation = Feature.translate(self) 407 try: 408 assert translation.tostring().index("*") == len(translation) - 1 409 return translation[:translation.tostring().index("*")] 410 except ValueError: 411 return translation
412
413 -def object_split(object):
414 """ 415 >>> object_split("Sequence:F02E9.2a") 416 ('Sequence', 'F02E9.2a') 417 """ 418 return tuple(object.split(":"))
419
420 -def _test(*args, **keywds):
421 import doctest, sys 422 doctest.testmod(sys.modules[__name__], *args, **keywds)
423 424 if __name__ == "__main__": 425 if __debug__: 426 _test() 427