Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
25 -def reverse(seq):
26 """Reverse the sequence. Works on string sequences. 27 28 e.g. 29 >>> reverse("ACGGT") 30 'TGGCA' 31 32 """ 33 r = list(seq) 34 r.reverse() 35 return ''.join(r)
36
37 -def GC(seq):
38 """Calculates G+C content, returns the percentage (float between 0 and 100). 39 40 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 41 when counting the G and C content. The percentage is calculated against 42 the full length, e.g.: 43 44 >>> from Bio.SeqUtils import GC 45 >>> GC("ACTGN") 46 40.0 47 48 Note that this will return zero for an empty sequence. 49 """ 50 try: 51 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 52 return gc*100.0/len(seq) 53 except ZeroDivisionError: 54 return 0.0
55 56
57 -def GC123(seq):
58 """Calculates total G+C content plus first, second and third positions. 59 60 Returns a tuple of four floats (percentages between 0 and 100) for the 61 entire sequence, and the three codon positions. e.g. 62 63 >>> from Bio.SeqUtils import GC123 64 >>> GC123("ACTGTN") 65 (40.0, 50.0, 50.0, 0.0) 66 67 Copes with mixed case sequences, but does NOT deal with ambiguous 68 nucleotides. 69 """ 70 d= {} 71 for nt in ['A','T','G','C']: 72 d[nt] = [0,0,0] 73 74 for i in range(0,len(seq),3): 75 codon = seq[i:i+3] 76 if len(codon) <3: codon += ' ' 77 for pos in range(0,3): 78 for nt in ['A','T','G','C']: 79 if codon[pos] == nt or codon[pos] == nt.lower(): 80 d[nt][pos] += 1 81 gc = {} 82 gcall = 0 83 nall = 0 84 for i in range(0,3): 85 try: 86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 88 except: 89 gc[i] = 0 90 91 gcall = gcall + d['G'][i] + d['C'][i] 92 nall = nall + n 93 94 gcall = 100.0*gcall/nall 95 return gcall, gc[0], gc[1], gc[2]
96
97 -def GC_skew(seq, window = 100):
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 99 100 Returns a list of ratios (floats), controlled by the length of the sequence 101 and the size of the window. 102 103 Does NOT look at any ambiguous nucleotides. 104 """ 105 # 8/19/03: Iddo: added lowercase 106 values = [] 107 for i in range(0, len(seq), window): 108 s = seq[i: i + window] 109 g = s.count('G') + s.count('g') 110 c = s.count('C') + s.count('c') 111 skew = (g-c)/float(g+c) 112 values.append(skew) 113 return values
114 115 from math import pi, sin, cos, log
116 -def xGC_skew(seq, window = 1000, zoom = 100, 117 r = 300, px = 100, py = 100):
118 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 119 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 120 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 121 yscroll = Scrollbar(orient = VERTICAL) 122 xscroll = Scrollbar(orient = HORIZONTAL) 123 canvas = Canvas(yscrollcommand = yscroll.set, 124 xscrollcommand = xscroll.set, background = 'white') 125 win = canvas.winfo_toplevel() 126 win.geometry('700x700') 127 128 yscroll.config(command = canvas.yview) 129 xscroll.config(command = canvas.xview) 130 yscroll.pack(side = RIGHT, fill = Y) 131 xscroll.pack(side = BOTTOM, fill = X) 132 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 133 canvas.update() 134 135 X0, Y0 = r + px, r + py 136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 137 138 ty = Y0 139 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 140 ty +=20 141 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 142 ty +=20 143 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 144 ty +=20 145 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 146 ty +=20 147 canvas.create_oval(x1,y1, x2, y2) 148 149 acc = 0 150 start = 0 151 for gc in GC_skew(seq, window): 152 r1 = r 153 acc+=gc 154 # GC skew 155 alpha = pi - (2*pi*start)/len(seq) 156 r2 = r1 - gc*zoom 157 x1 = X0 + r1 * sin(alpha) 158 y1 = Y0 + r1 * cos(alpha) 159 x2 = X0 + r2 * sin(alpha) 160 y2 = Y0 + r2 * cos(alpha) 161 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 162 # accumulated GC skew 163 r1 = r - 50 164 r2 = r1 - acc 165 x1 = X0 + r1 * sin(alpha) 166 y1 = Y0 + r1 * cos(alpha) 167 x2 = X0 + r2 * sin(alpha) 168 y2 = Y0 + r2 * cos(alpha) 169 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 170 171 canvas.update() 172 start += window 173 174 canvas.configure(scrollregion = canvas.bbox(ALL))
175
176 -def molecular_weight(seq):
177 """Calculate the molecular weight of a DNA sequence.""" 178 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 179 weight_table = IUPACData.unambiguous_dna_weights 180 return sum(weight_table[x] for x in seq)
181
182 -def nt_search(seq, subseq):
183 """Search for a DNA subseq in sequence. 184 185 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 186 searches only on forward strand 187 """ 188 pattern = '' 189 for nt in subseq: 190 value = IUPACData.ambiguous_dna_values[nt] 191 if len(value) == 1: 192 pattern += value 193 else: 194 pattern += '[%s]' % value 195 196 pos = -1 197 result = [pattern] 198 l = len(seq) 199 while True: 200 pos+=1 201 s = seq[pos:] 202 m = re.search(pattern, s) 203 if not m: break 204 pos += int(m.start(0)) 205 result.append(pos) 206 return result
207 208 # }}} 209 210 ###################################### 211 # Protein 212 ###################### 213 # {{{ 214 215 # temporary hack for exception free translation of "dirty" DNA 216 # should be moved to ??? 217
218 -class ProteinX(Alphabet.ProteinAlphabet):
219 letters = IUPACData.extended_protein_letters + "X"
220 221 proteinX = ProteinX() 222
223 -class MissingTable:
224 - def __init__(self, table):
225 self._table = table
226 - def get(self, codon, stop_symbol):
227 try: 228 return self._table.get(codon, stop_symbol) 229 except CodonTable.TranslationError: 230 return 'X'
231
232 -def makeTableX(table):
233 assert table.protein_alphabet == IUPAC.extended_protein 234 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 235 MissingTable(table.forward_table), 236 table.back_table, table.start_codons, 237 table.stop_codons)
238 239 # end of hacks 240
241 -def seq3(seq):
242 """Turn a one letter code protein sequence into one with three letter codes. 243 244 The single input argument 'seq' should be a protein sequence using single 245 letter codes, either as a python string or as a Seq or MutableSeq object. 246 247 This function returns the amino acid sequence as a string using the three 248 letter amino acid codes. Output follows the IUPAC standard (including 249 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 250 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 251 character (including possible gap characters), is changed into 'Xaa'. 252 253 e.g. 254 >>> from Bio.SeqUtils import seq3 255 >>> seq3("MAIVMGRWKGAR*") 256 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 257 258 This function was inspired by BioPerl's seq3. 259 """ 260 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 261 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 262 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 263 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 264 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 265 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 266 'U':'Sel', 'O':'Pyl', 'J':'Xle', 267 } 268 #We use a default of 'Xaa' for undefined letters 269 #Note this will map '-' to 'Xaa' which may be undesirable! 270 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
271 272 273 # }}} 274 275 ###################################### 276 # Mixed ??? 277 ###################### 278 # {{{ 279
280 -def GC_Frame(seq, genetic_code = 1):
281 """Just an alias for six_frame_translations (OBSOLETE). 282 283 Use six_frame_translation directly, as this function may be deprecated 284 in a future release.""" 285 return six_frame_translations(seq, genetic_code)
286
287 -def six_frame_translations(seq, genetic_code = 1):
288 """Formatted string showing the 6 frame translations and GC content. 289 290 nice looking 6 frame translation with GC content - code from xbbtools 291 similar to DNA Striders six-frame translation 292 293 e.g. 294 from Bio.SeqUtils import six_frame_translations 295 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 296 """ 297 from Bio.Seq import reverse_complement, translate 298 anti = reverse_complement(seq) 299 comp = anti[::-1] 300 length = len(seq) 301 frames = {} 302 for i in range(0,3): 303 frames[i+1] = translate(seq[i:], genetic_code) 304 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 305 306 # create header 307 if length > 20: 308 short = '%s ... %s' % (seq[:10], seq[-10:]) 309 else: 310 short = seq 311 #TODO? Remove the date as this would spoil any unit test... 312 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 313 header = 'GC_Frame: %s, ' % date 314 for nt in ['a','t','g','c']: 315 header += '%s:%d ' % (nt, seq.count(nt.upper())) 316 317 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 318 res = header 319 320 for i in range(0,length,60): 321 subseq = seq[i:i+60] 322 csubseq = comp[i:i+60] 323 p = i/3 324 res = res + '%d/%d\n' % (i+1, i/3+1) 325 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 326 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 327 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 328 # seq 329 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 330 res = res + csubseq.lower() + '\n' 331 # - frames 332 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 333 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 334 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 335 return res
336 337 # }}} 338 339 ###################################### 340 # FASTA file utilities 341 ###################### 342 # {{{ 343
344 -def fasta_uniqids(file):
345 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). 346 347 file - a FASTA format filename to read in. 348 349 No return value, the output is written to screen. 350 """ 351 dict = {} 352 txt = open(file).read() 353 entries = [] 354 for entry in txt.split('>')[1:]: 355 name, seq= entry.split('\n',1) 356 name = name.split()[0].split(',')[0] 357 358 if name in dict: 359 n = 1 360 while 1: 361 n = n + 1 362 _name = name + str(n) 363 if _name not in dict: 364 name = _name 365 break 366 367 dict[name] = seq 368 369 for name, seq in dict.items(): 370 print '>%s\n%s' % (name, seq)
371
372 -def quick_FASTA_reader(file):
373 """Simple FASTA reader, returning a list of string tuples. 374 375 The single argument 'file' should be the filename of a FASTA format file. 376 This function will open and read in the entire file, constructing a list 377 of all the records, each held as a tuple of strings (the sequence name or 378 title, and its sequence). 379 380 This function was originally intended for use on large files, where its 381 low overhead makes it very fast. However, because it returns the data as 382 a single in memory list, this can require a lot of RAM on large files. 383 384 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 385 allows you to iterate over the records one by one (avoiding having all the 386 records in memory at once). Using Bio.SeqIO also makes it easy to switch 387 between different input file formats. However, please note that rather 388 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 389 """ 390 #Want to split on "\n>" not just ">" in case there are any extra ">" 391 #in the name/description. So, in order to make sure we also split on 392 #the first entry, prepend a "\n" to the start of the file. 393 handle = open(file) 394 txt = "\n" + handle.read() 395 handle.close() 396 entries = [] 397 for entry in txt.split('\n>')[1:]: 398 name,seq= entry.split('\n',1) 399 seq = seq.replace('\n','').replace(' ','').upper() 400 entries.append((name, seq)) 401 return entries
402
403 -def apply_on_multi_fasta(file, function, *args):
404 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 405 406 file - filename of a FASTA format file 407 function - the function you wish to invoke on each record 408 *args - any extra arguments you want passed to the function 409 410 This function will iterate over each record in a FASTA file as SeqRecord 411 objects, calling your function with the record (and supplied args) as 412 arguments. 413 414 This function returns a list. For those records where your function 415 returns a value, this is taken as a sequence and used to construct a 416 FASTA format string. If your function never has a return value, this 417 means apply_on_multi_fasta will return an empty list. 418 """ 419 try: 420 f = globals()[function] 421 except: 422 raise NotImplementedError("%s not implemented" % function) 423 424 handle = open(file, 'r') 425 records = SeqIO.parse(handle, "fasta") 426 results = [] 427 for record in records: 428 arguments = [record.sequence] 429 for arg in args: arguments.append(arg) 430 result = f(*arguments) 431 if result: 432 results.append('>%s\n%s' % (record.name, result)) 433 handle.close() 434 return results
435
436 -def quicker_apply_on_multi_fasta(file, function, *args):
437 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 438 439 file - filename of a FASTA format file 440 function - the function you wish to invoke on each record 441 *args - any extra arguments you want passed to the function 442 443 This function will use quick_FASTA_reader to load every record in the 444 FASTA file into memory as a list of tuples. For each record, it will 445 call your supplied function with the record as a tuple of the name and 446 sequence as strings (plus any supplied args). 447 448 This function returns a list. For those records where your function 449 returns a value, this is taken as a sequence and used to construct a 450 FASTA format string. If your function never has a return value, this 451 means quicker_apply_on_multi_fasta will return an empty list. 452 """ 453 try: 454 f = globals()[function] 455 except: 456 raise NotImplementedError("%s not implemented" % function) 457 458 entries = quick_FASTA_reader(file) 459 results = [] 460 for name, seq in entries: 461 arguments = [seq] 462 for arg in args: arguments.append(arg) 463 result = f(*arguments) 464 if result: 465 results.append('>%s\n%s' % (name, result)) 466 handle.close() 467 return results
468 469 # }}} 470 471 ###################################### 472 # Main 473 ##################### 474 # {{{ 475 476 if __name__ == '__main__': 477 import sys, getopt 478 # crude command line options to use most functions directly on a FASTA file 479 options = {'apply_on_multi_fasta':0, 480 'quick':0, 481 'uniq_ids':0, 482 } 483 484 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 485 'help', 'quick', 'uniq_ids', 'search=']) 486 for arg in optlist: 487 if arg[0] in ['-h', '--help']: 488 pass 489 elif arg[0] in ['--describe']: 490 # get all new functions from this file 491 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 492 mol_funcs.sort() 493 print 'available functions:' 494 for f in mol_funcs: print '\t--%s' % f 495 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 496 497 sys.exit(0) 498 elif arg[0] in ['--apply_on_multi_fasta']: 499 options['apply_on_multi_fasta'] = arg[1] 500 elif arg[0] in ['--search']: 501 options['search'] = arg[1] 502 else: 503 key = re.search('-*(.+)', arg[0]).group(1) 504 options[key] = 1 505 506 507 if options.get('apply_on_multi_fasta'): 508 file = args[0] 509 function = options['apply_on_multi_fasta'] 510 arguments = [] 511 if options.get('search'): 512 arguments = options['search'] 513 if function == 'xGC_skew': 514 arguments = 1000 515 if options.get('quick'): 516 results = quicker_apply_on_multi_fasta(file, function, arguments) 517 else: 518 results = apply_on_multi_fasta(file, function, arguments) 519 for result in results: print result 520 521 elif options.get('uniq_ids'): 522 file = args[0] 523 fasta_uniqids(file) 524 525 # }}} 526