1
2
3
4
5
6
7
8 """Bio.GFF.easy: some functions to ease the use of Biopython (DEPRECATED)
9
10 This is part of the "old" Bio.GFF module by Michael Hoffman, which offered
11 access to a MySQL database holding GFF data loaded by BioPerl. This code has
12 now been deprecated, and will probably be removed in order to free the Bio.GFF
13 namespace for a new GFF parser in Biopython (including GFF3 support).
14
15 Some of the more useful ideas of Bio.GFF.easy may be reworked for Bio.GenBank,
16 using the standard SeqFeature objects used elsewhere in Biopython.
17 """
18
19 import copy
20 import re
21 import sys
22
23 import Bio
24 from Bio import GenBank
25 from Bio.Data import IUPACData
26 from Bio.Seq import Seq
27
28 from Bio import SeqIO
29 from Bio import SeqUtils
30
31 import GenericTools
32
34 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
35 - def __init__(self, feature_list, default=None):
49
50 -class Location(GenericTools.VerboseList):
51 """
52 this is really best interfaced through LocationFromString
53 fuzzy: < or >
54 join: {0 = no join, 1 = join, 2 = order}
55
56 >>> location = Location([Location([339]), Location([564])]) # zero-based
57 >>> location
58 Location(Location(339), Location(564))
59 >>> print location # one-based
60 340..565
61 >>> print location.five_prime()
62 340
63 >>> location_rev = Location([Location([339]), Location([564])], 1)
64 >>> print location_rev
65 complement(340..565)
66 >>> print location_rev.five_prime()
67 565
68 """
69 - def __init__(self, the_list, complement=0, seqname=None):
75
77 if self.join == 1:
78 label = 'join'
79 elif self.join == 2:
80 label = 'order'
81 return "%s(%s)" % (label, ",".join(map(str, self)))
82
84 if self.seqname:
85 format = "%s:%%s" % self.seqname
86 else:
87 format = "%s"
88
89 if self.complement:
90 format = format % "complement(%s)"
91
92 if self.join:
93 return format % self._joinstr()
94
95 elif isinstance(self[0], list):
96 return format % "%s..%s" % (str(self[0]), str(self[1]))
97 else:
98 if self.fuzzy:
99 format = format % self.fuzzy + "%s"
100 return format % str(self[0] + 1)
101
103 return "Location(%s)" % ", ".join(map(repr, self))
104
105 direction2index = {1: 0, -1: -1}
107 """
108 1: 5'
109 -1: 3'
110
111 >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
112 >>> loc1.direction_and_index(1)
113 (1, 0)
114 >>> loc1.direction_and_index(-1)
115 (-1, -1)
116 >>> loc1.reverse()
117 >>> print loc1
118 complement(join(1..3,complement(5..6)))
119 >>> loc1.direction_and_index(1)
120 (-1, -1)
121 """
122 if self.complement:
123 direction = direction * -1
124 index = self.direction2index[direction]
125 return direction, index
126
128 """
129 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
130 >>> loc.findside(1)
131 Location(5)
132 >>> loc.findside(-1)
133 Location(0)
134 """
135 direction, index = self.direction_and_index(direction)
136 if self.join or isinstance(self[0], list):
137 return self[index].findside(direction)
138 else:
139 return self
140
142 """
143 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
144 >>> loc.findseqname_3prime()
145 'MOOCOW'
146 """
147 return self.findseqname(-1)
148
150 """
151 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
152 >>> loc.findseqname()
153 'SEQ'
154 >>> loc.findseqname(-1)
155 'MOOCOW'
156 """
157 direction, index = self.direction_and_index(direction)
158 if self.seqname:
159 return self.seqname
160 elif self.join:
161 return self[index].findseqname(direction)
162 else:
163 raise AttributeError('no sequence name')
164
169
171 """
172 WARNING: doesn't deal with joins!!!!
173 """
174 return self.end()-self.start()
175
177 """
178 WARNING: doesn't deal with joins!!!!
179
180 >>> location1 = LocationFromString("1..50")
181 >>> location2 = LocationFromString("25..200")
182 >>> print location1.intersection(location2)
183 25..50
184 >>> print location1.intersection(location2)
185 25..50
186 """
187 if self.start() >= other.start():
188 start = self.start()
189 else:
190 start = other.start()
191 if self.end() <= other.end():
192 end = self.end()
193 else:
194 end = other.end()
195 return Location([Location([start]), Location([end])])
196
203
210
217
219 """
220 >>> fwd_location = LocationFromString('X:5830132..5831528')
221 >>> print fwd_location.sublocation(LocationFromString('1..101'))
222 X:5830132..5830232
223 >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
224 X:5831398..5831417
225 >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
226 >>> print rev_location.sublocation(LocationFromString('1..101'))
227 I:complement(8416116..8416216)
228 >>> print rev_location.sublocation(LocationFromString('100..200'))
229 I:complement(8416017..8416117)
230 """
231
232 absolute_location = copy.deepcopy(self)
233 for i in xrange(2):
234 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
235 if absolute_location.complement:
236 list.reverse(absolute_location)
237 return absolute_location
238
240 return self.add(addend)
241
242 - def add(self, addend, complement=0):
243 self_copy = copy.deepcopy(self)
244 if isinstance(addend, Location):
245 addend = addend[0]
246 if complement:
247 addend *= -1
248 self_copy[0] += addend
249 return self_copy
250
252 return self + -subtrahend
253
256
258 """
259 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
260 >>> loc1.reorient()
261 >>> print loc1
262 complement(join(I:1..9000,I:9001..10000))
263 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
264 >>> loc2.reorient()
265 >>> print loc2
266 join(I:complement(1..9000),I:9001..10000)
267 """
268 if self.join:
269 if len([x for x in self if x.complement]) == len(self):
270 self.reverse()
271 for segment in self:
272 segment.reverse()
273
275 """
276 works for single level non-complex joins
277
278 >>> LOC = LocationFromString
279 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
280 >>> print l1.bounding()
281 join(alpha:1..70)
282 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
283 >>> print l2.bounding()
284 join(alpha:1..30,alpha:complement(50..70))
285 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
286 >>> print l3.bounding()
287 join(alpha:1..45,alpha:complement(50..70),beta:6..20)
288
289 """
290 if not self.join:
291 return
292
293 seqdict = {}
294 seqkeys = []
295 for subloc in self:
296 assert subloc.seqname
297 assert not subloc.join
298 try:
299 seqdict[_hashname(subloc)].append(subloc)
300 except KeyError:
301 key = _hashname(subloc)
302 seqdict[key] = [subloc]
303 seqkeys.append(key)
304
305 res = LocationJoin()
306 for key in seqkeys:
307 locations = seqdict[key]
308 coords = []
309 for subloc in locations:
310 coords.append(subloc.start())
311 coords.append(subloc.end())
312 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
313 return res
314
317
319 """
320 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
321 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
322 >>> join.append(appendloc)
323 >>> print join
324 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
325 >>> join2 = LocationJoin()
326 >>> join2.append(LocationFromString("complement(66..99)"))
327 >>> join2.append(LocationFromString("complement(5..55)"))
328 >>> print join2
329 join(complement(66..99),complement(5..55))
330 """
331 - def __init__(self, the_list = [], complement=0, seqname=None):
337
339 """
340 >>> print LocationFromCoords(339, 564)
341 340..565
342 >>> print LocationFromCoords(339, 564, seqname="I")
343 I:340..565
344 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
345 NC_343434:complement(1000..3235)
346 """
347 - def __init__(self, start, end, strand=0, seqname=None):
353
354
355
356 re_complement = re.compile(r"^complement\((.*)\)$")
357 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$")
358 re_join = re.compile(r"^(join|order)\((.*)\)$")
359 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
360 re_fuzzy = re.compile(r"^([><])(\d+)")
362 """
363 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
364 >>> print LocationFromString("467")
365 467
366 >>> print LocationFromString("340..565")
367 340..565
368 >>> print LocationFromString("<345..500")
369 <345..500
370 >>> print LocationFromString("<1..888")
371 <1..888
372 >>> # (102.110) and 123^124 syntax unimplemented
373 >>> print LocationFromString("join(12..78,134..202)")
374 join(12..78,134..202)
375 >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
376 complement(join(2691..4571,4918..5163))
377 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
378 join(complement(4918..5163),complement(2691..4571))
379 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
380 order(complement(4918..5163),complement(2691..4571))
381 >>> print LocationFromString("NC_001802x.fna:73..78")
382 NC_001802x.fna:73..78
383 >>> print LocationFromString("J00194:100..202")
384 J00194:100..202
385
386 >>> print LocationFromString("join(117505..118584,1..609)")
387 join(117505..118584,1..609)
388 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
389 join(test3:complement(4..6),test3:complement(1..3))
390 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
391 test3:join(test1:complement(1..3),4..6)
392 """
424
426 if filename:
427 return open(filename)
428 else:
429 return sys.stdin
430
432 """
433 >>> record = fasta_single(string=\"""
434 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
435 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
436 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
437 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
438 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
439 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
440 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
441 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
442 ... SLFGNDPSSQ
443 ... \""")
444 >>> record.id
445 'gi|9629360|ref|NP_057850.1|'
446 >>> record.description
447 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
448 >>> record.seq[0:5]
449 Seq('MGARA', SingleLetterAlphabet())
450 """
451
452
453 if string:
454 import cStringIO
455 handle = cStringIO.StringIO(string)
456 else:
457 handle = open_file(filename)
458 try:
459 record = SeqIO.parse(handle, format="fasta").next()
460 except StopIteration:
461 record = None
462 return record
463
465
466
467
468
469 reader = SeqIO.parse(open_file(filename), format="fasta")
470 while True:
471 record = reader.next()
472 if record is None:
473 raise StopIteration
474 else:
475 yield record
476
478 """
479 >>> records = fasta_readrecords('GFF/multi.fna')
480 >>> records[0].id
481 'test1'
482 >>> records[2].seq
483 Seq('AAACACAC', SingleLetterAlphabet())
484 """
485 return list(SeqIO.parse(open_file(filename), format="fasta"))
486
491
493 """
494 >>> record = genbank_single("GFF/NC_001422.gbk")
495 >>> record.taxonomy
496 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
497 >>> cds = record.features[-4]
498 >>> cds.key
499 'CDS'
500 >>> location = LocationFromString(cds.location)
501 >>> print location
502 2931..3917
503 >>> subseq = record_subseq(record, location)
504 >>> subseq[0:20]
505 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
506 """
507 return GenBank.RecordParser().parse(open(filename))
508
510 """
511 >>> from Bio.SeqRecord import SeqRecord
512 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
513 ... "ref|NC_001422",
514 ... "Coliphage phiX174, complete genome",
515 ... "bases 1-11")
516 >>> record_subseq(record, LocationFromString("1..4")) # one-based
517 Seq('GAGT', Alphabet())
518 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
519 Seq('ACTC', Alphabet())
520 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
521 Seq('ACTCGAGT', Alphabet())
522 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
523 >>> print loc
524 complement(join(complement(5..7),1..4))
525 >>> record_subseq(record, loc)
526 Seq('ACTCTTT', Alphabet())
527 >>> print loc
528 complement(join(complement(5..7),1..4))
529 >>> loc.reverse()
530 >>> record_subseq(record, loc)
531 Seq('AAAGAGT', Alphabet())
532 >>> record_subseq(record, loc, upper=1)
533 Seq('AAAGAGT', Alphabet())
534 """
535 if location.join:
536 subsequence_list = []
537 if location.complement:
538 location_copy = copy.copy(location)
539 list.reverse(location_copy)
540 else:
541 location_copy = location
542 for sublocation in location_copy:
543 if location.complement:
544 sublocation_copy = copy.copy(sublocation)
545 sublocation_copy.reverse()
546 else:
547 sublocation_copy = sublocation
548 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
549 return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
550 else:
551 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
552
565
567 """
568 >>> from Bio.SeqRecord import SeqRecord
569 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
570 ... "ref|NC_001422",
571 ... "Coliphage phiX174, complete genome",
572 ... "bases 1-11")
573 >>> record_coords(record, 0, 4) # zero-based
574 Seq('GAGT', Alphabet())
575 >>> record_coords(record, 0, 4, "-") # zero-based
576 Seq('ACTC', Alphabet())
577 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
578 Seq('ACTC', Alphabet())
579 """
580
581 subseq = record_sequence(record)[start:end]
582 subseq_str = subseq.tostring()
583 subseq_str = subseq_str.upper()
584 subseq = Seq(subseq_str, subseq.alphabet)
585 if strand == '-' or strand == 1:
586 return subseq.reverse_complement()
587 else:
588 return subseq
589
591 """Run the Bio.GFF.easy module's doctests (PRIVATE).
592
593 This will try and locate the unit tests directory, and run the doctests
594 from there in order that the relative paths used in the examples work.
595 """
596 import doctest
597 import os
598 if os.path.isdir(os.path.join("..","..","Tests")):
599 print "Runing doctests..."
600 cur_dir = os.path.abspath(os.curdir)
601 os.chdir(os.path.join("..","..","Tests"))
602 doctest.testmod()
603 os.chdir(cur_dir)
604 del cur_dir
605 print "Done"
606 elif os.path.isdir(os.path.join("Tests")) :
607 print "Runing doctests..."
608 cur_dir = os.path.abspath(os.curdir)
609 os.chdir(os.path.join("Tests"))
610 doctest.testmod()
611 os.chdir(cur_dir)
612 del cur_dir
613 print "Done"
614
615 if __name__ == "__main__":
616 if __debug__:
617 _test()
618