1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
26 """Reverse the sequence. Works on string sequences.
27
28 e.g.
29 >>> reverse("ACGGT")
30 'TGGCA'
31
32 """
33 r = list(seq)
34 r.reverse()
35 return ''.join(r)
36
38 """Calculates G+C content, returns the percentage (float between 0 and 100).
39
40 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
41 when counting the G and C content. The percentage is calculated against
42 the full length, e.g.:
43
44 >>> from Bio.SeqUtils import GC
45 >>> GC("ACTGN")
46 40.0
47
48 Note that this will return zero for an empty sequence.
49 """
50 try:
51 gc = sum(map(seq.count,['G','C','g','c','S','s']))
52 return gc*100.0/len(seq)
53 except ZeroDivisionError:
54 return 0.0
55
56
58 """Calculates total G+C content plus first, second and third positions.
59
60 Returns a tuple of four floats (percentages between 0 and 100) for the
61 entire sequence, and the three codon positions. e.g.
62
63 >>> from Bio.SeqUtils import GC123
64 >>> GC123("ACTGTN")
65 (40.0, 50.0, 50.0, 0.0)
66
67 Copes with mixed case sequences, but does NOT deal with ambiguous
68 nucleotides.
69 """
70 d= {}
71 for nt in ['A','T','G','C']:
72 d[nt] = [0,0,0]
73
74 for i in range(0,len(seq),3):
75 codon = seq[i:i+3]
76 if len(codon) <3: codon += ' '
77 for pos in range(0,3):
78 for nt in ['A','T','G','C']:
79 if codon[pos] == nt or codon[pos] == nt.lower():
80 d[nt][pos] += 1
81 gc = {}
82 gcall = 0
83 nall = 0
84 for i in range(0,3):
85 try:
86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
88 except:
89 gc[i] = 0
90
91 gcall = gcall + d['G'][i] + d['C'][i]
92 nall = nall + n
93
94 gcall = 100.0*gcall/nall
95 return gcall, gc[0], gc[1], gc[2]
96
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
99
100 Returns a list of ratios (floats), controlled by the length of the sequence
101 and the size of the window.
102
103 Does NOT look at any ambiguous nucleotides.
104 """
105
106 values = []
107 for i in range(0, len(seq), window):
108 s = seq[i: i + window]
109 g = s.count('G') + s.count('g')
110 c = s.count('C') + s.count('c')
111 skew = (g-c)/float(g+c)
112 values.append(skew)
113 return values
114
115 from math import pi, sin, cos, log
116 -def xGC_skew(seq, window = 1000, zoom = 100,
117 r = 300, px = 100, py = 100):
118 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
119 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
120 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
121 yscroll = Scrollbar(orient = VERTICAL)
122 xscroll = Scrollbar(orient = HORIZONTAL)
123 canvas = Canvas(yscrollcommand = yscroll.set,
124 xscrollcommand = xscroll.set, background = 'white')
125 win = canvas.winfo_toplevel()
126 win.geometry('700x700')
127
128 yscroll.config(command = canvas.yview)
129 xscroll.config(command = canvas.xview)
130 yscroll.pack(side = RIGHT, fill = Y)
131 xscroll.pack(side = BOTTOM, fill = X)
132 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
133 canvas.update()
134
135 X0, Y0 = r + px, r + py
136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
137
138 ty = Y0
139 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
140 ty +=20
141 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
142 ty +=20
143 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
144 ty +=20
145 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
146 ty +=20
147 canvas.create_oval(x1,y1, x2, y2)
148
149 acc = 0
150 start = 0
151 for gc in GC_skew(seq, window):
152 r1 = r
153 acc+=gc
154
155 alpha = pi - (2*pi*start)/len(seq)
156 r2 = r1 - gc*zoom
157 x1 = X0 + r1 * sin(alpha)
158 y1 = Y0 + r1 * cos(alpha)
159 x2 = X0 + r2 * sin(alpha)
160 y2 = Y0 + r2 * cos(alpha)
161 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
162
163 r1 = r - 50
164 r2 = r1 - acc
165 x1 = X0 + r1 * sin(alpha)
166 y1 = Y0 + r1 * cos(alpha)
167 x2 = X0 + r2 * sin(alpha)
168 y2 = Y0 + r2 * cos(alpha)
169 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
170
171 canvas.update()
172 start += window
173
174 canvas.configure(scrollregion = canvas.bbox(ALL))
175
181
183 """Search for a DNA subseq in sequence.
184
185 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
186 searches only on forward strand
187 """
188 pattern = ''
189 for nt in subseq:
190 value = IUPACData.ambiguous_dna_values[nt]
191 if len(value) == 1:
192 pattern += value
193 else:
194 pattern += '[%s]' % value
195
196 pos = -1
197 result = [pattern]
198 l = len(seq)
199 while True:
200 pos+=1
201 s = seq[pos:]
202 m = re.search(pattern, s)
203 if not m: break
204 pos += int(m.start(0))
205 result.append(pos)
206 return result
207
208
209
210
211
212
213
214
215
216
217
218 -class ProteinX(Alphabet.ProteinAlphabet):
220
221 proteinX = ProteinX()
222
226 - def get(self, codon, stop_symbol):
231
238
239
240
242 """Turn a one letter code protein sequence into one with three letter codes.
243
244 The single input argument 'seq' should be a protein sequence using single
245 letter codes, either as a python string or as a Seq or MutableSeq object.
246
247 This function returns the amino acid sequence as a string using the three
248 letter amino acid codes. Output follows the IUPAC standard (including
249 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
250 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
251 character (including possible gap characters), is changed into 'Xaa'.
252
253 e.g.
254 >>> from Bio.SeqUtils import seq3
255 >>> seq3("MAIVMGRWKGAR*")
256 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
257
258 This function was inspired by BioPerl's seq3.
259 """
260 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
261 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
262 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
263 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
264 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
265 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
266 'U':'Sel', 'O':'Pyl', 'J':'Xle',
267 }
268
269
270 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
271
272
273
274
275
276
277
278
279
281 """Just an alias for six_frame_translations (OBSOLETE).
282
283 Use six_frame_translation directly, as this function may be deprecated
284 in a future release."""
285 return six_frame_translations(seq, genetic_code)
286
288 """Formatted string showing the 6 frame translations and GC content.
289
290 nice looking 6 frame translation with GC content - code from xbbtools
291 similar to DNA Striders six-frame translation
292
293 e.g.
294 from Bio.SeqUtils import six_frame_translations
295 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
296 """
297 from Bio.Seq import reverse_complement, translate
298 anti = reverse_complement(seq)
299 comp = anti[::-1]
300 length = len(seq)
301 frames = {}
302 for i in range(0,3):
303 frames[i+1] = translate(seq[i:], genetic_code)
304 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
305
306
307 if length > 20:
308 short = '%s ... %s' % (seq[:10], seq[-10:])
309 else:
310 short = seq
311
312 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
313 header = 'GC_Frame: %s, ' % date
314 for nt in ['a','t','g','c']:
315 header += '%s:%d ' % (nt, seq.count(nt.upper()))
316
317 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
318 res = header
319
320 for i in range(0,length,60):
321 subseq = seq[i:i+60]
322 csubseq = comp[i:i+60]
323 p = i/3
324 res = res + '%d/%d\n' % (i+1, i/3+1)
325 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
326 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
327 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
328
329 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
330 res = res + csubseq.lower() + '\n'
331
332 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
333 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
334 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
335 return res
336
337
338
339
340
341
342
343
345 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
346
347 file - a FASTA format filename to read in.
348
349 No return value, the output is written to screen.
350 """
351 dict = {}
352 txt = open(file).read()
353 entries = []
354 for entry in txt.split('>')[1:]:
355 name, seq= entry.split('\n',1)
356 name = name.split()[0].split(',')[0]
357
358 if name in dict:
359 n = 1
360 while 1:
361 n = n + 1
362 _name = name + str(n)
363 if _name not in dict:
364 name = _name
365 break
366
367 dict[name] = seq
368
369 for name, seq in dict.items():
370 print '>%s\n%s' % (name, seq)
371
373 """Simple FASTA reader, returning a list of string tuples.
374
375 The single argument 'file' should be the filename of a FASTA format file.
376 This function will open and read in the entire file, constructing a list
377 of all the records, each held as a tuple of strings (the sequence name or
378 title, and its sequence).
379
380 This function was originally intended for use on large files, where its
381 low overhead makes it very fast. However, because it returns the data as
382 a single in memory list, this can require a lot of RAM on large files.
383
384 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
385 allows you to iterate over the records one by one (avoiding having all the
386 records in memory at once). Using Bio.SeqIO also makes it easy to switch
387 between different input file formats. However, please note that rather
388 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
389 """
390
391
392
393 handle = open(file)
394 txt = "\n" + handle.read()
395 handle.close()
396 entries = []
397 for entry in txt.split('\n>')[1:]:
398 name,seq= entry.split('\n',1)
399 seq = seq.replace('\n','').replace(' ','').upper()
400 entries.append((name, seq))
401 return entries
402
404 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
405
406 file - filename of a FASTA format file
407 function - the function you wish to invoke on each record
408 *args - any extra arguments you want passed to the function
409
410 This function will iterate over each record in a FASTA file as SeqRecord
411 objects, calling your function with the record (and supplied args) as
412 arguments.
413
414 This function returns a list. For those records where your function
415 returns a value, this is taken as a sequence and used to construct a
416 FASTA format string. If your function never has a return value, this
417 means apply_on_multi_fasta will return an empty list.
418 """
419 try:
420 f = globals()[function]
421 except:
422 raise NotImplementedError("%s not implemented" % function)
423
424 handle = open(file, 'r')
425 records = SeqIO.parse(handle, "fasta")
426 results = []
427 for record in records:
428 arguments = [record.sequence]
429 for arg in args: arguments.append(arg)
430 result = f(*arguments)
431 if result:
432 results.append('>%s\n%s' % (record.name, result))
433 handle.close()
434 return results
435
437 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
438
439 file - filename of a FASTA format file
440 function - the function you wish to invoke on each record
441 *args - any extra arguments you want passed to the function
442
443 This function will use quick_FASTA_reader to load every record in the
444 FASTA file into memory as a list of tuples. For each record, it will
445 call your supplied function with the record as a tuple of the name and
446 sequence as strings (plus any supplied args).
447
448 This function returns a list. For those records where your function
449 returns a value, this is taken as a sequence and used to construct a
450 FASTA format string. If your function never has a return value, this
451 means quicker_apply_on_multi_fasta will return an empty list.
452 """
453 try:
454 f = globals()[function]
455 except:
456 raise NotImplementedError("%s not implemented" % function)
457
458 entries = quick_FASTA_reader(file)
459 results = []
460 for name, seq in entries:
461 arguments = [seq]
462 for arg in args: arguments.append(arg)
463 result = f(*arguments)
464 if result:
465 results.append('>%s\n%s' % (name, result))
466 handle.close()
467 return results
468
469
470
471
472
473
474
475
476 if __name__ == '__main__':
477 import sys, getopt
478
479 options = {'apply_on_multi_fasta':0,
480 'quick':0,
481 'uniq_ids':0,
482 }
483
484 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
485 'help', 'quick', 'uniq_ids', 'search='])
486 for arg in optlist:
487 if arg[0] in ['-h', '--help']:
488 pass
489 elif arg[0] in ['--describe']:
490
491 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
492 mol_funcs.sort()
493 print 'available functions:'
494 for f in mol_funcs: print '\t--%s' % f
495 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
496
497 sys.exit(0)
498 elif arg[0] in ['--apply_on_multi_fasta']:
499 options['apply_on_multi_fasta'] = arg[1]
500 elif arg[0] in ['--search']:
501 options['search'] = arg[1]
502 else:
503 key = re.search('-*(.+)', arg[0]).group(1)
504 options[key] = 1
505
506
507 if options.get('apply_on_multi_fasta'):
508 file = args[0]
509 function = options['apply_on_multi_fasta']
510 arguments = []
511 if options.get('search'):
512 arguments = options['search']
513 if function == 'xGC_skew':
514 arguments = 1000
515 if options.get('quick'):
516 results = quicker_apply_on_multi_fasta(file, function, arguments)
517 else:
518 results = apply_on_multi_fasta(file, function, arguments)
519 for result in results: print result
520
521 elif options.get('uniq_ids'):
522 file = args[0]
523 fasta_uniqids(file)
524
525
526