1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 """Convert a serie of Rebase files into a Restriction_Dictionary.py module.
30
31 The Rebase files are in the emboss format:
32
33 emboss_e.### -> contains informations about the restriction sites.
34 emboss_r.### -> contains general informations about the enzymes.
35 emboss_s.### -> contains informations about the suppliers.
36
37 ### is a 3 digit number. The first digit is the year and the two last the month.
38 """
39
40 import sre
41 import os
42 import itertools
43 import time
44 import sys
45 import site
46 import shutil
47
48 from Bio.Seq import Seq
49 from Bio.Alphabet import IUPAC
50
51 import Bio.Restriction.Restriction
52 from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\
53 TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\
54 Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available
55
56 import Bio.Restriction.RanaConfig as config
57 from Bio.Restriction._Update.Update import RebaseUpdate
58 from Bio.Restriction.Restriction import *
59 from Bio.Restriction.DNAUtils import antiparallel
60
61 DNA=Seq
62 dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T',
63 'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT',
64 'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT',
65 'N':'ACGT',
66 'a': 'a', 'c': 'c', 'g': 'g', 't': 't',
67 'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt',
68 'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt',
69 'n':'acgt'}
70
71
72 complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R',
73 'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H',
74 'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c',
75 't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k',
76 'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'}
77 enzymedict = {}
78 suppliersdict = {}
79 classdict = {}
80 typedict = {}
81
82
84 """Exception for dealing with overhang."""
85 pass
86
88 """BaseExpand(base) -> string.
89
90 given a degenerated base, returns its meaning in IUPAC alphabet.
91
92 i.e:
93 b= 'A' -> 'A'
94 b= 'N' -> 'ACGT'
95 etc..."""
96 base = base.upper()
97 return dna_alphabet[base]
98
100 """regex(site) -> string.
101
102 Construct a regular expression from a DNA sequence.
103 i.e.:
104 site = 'ABCGN' -> 'A[CGT]CG.'"""
105 reg_ex = site
106 for base in reg_ex:
107 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't'):
108 pass
109 if base in ('N', 'n'):
110 reg_ex = '.'.join(reg_ex.split('N'))
111 reg_ex = '.'.join(reg_ex.split('n'))
112 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V'):
113 expand = '['+ str(BaseExpand(base))+']'
114 reg_ex = expand.join(reg_ex.split(base))
115 return reg_ex
116
118 """Antiparallel(sequence) -> string.
119
120 returns a string which represents the reverse complementary strand of
121 a DNA sequence."""
122 return antiparallel(sequence.tostring())
123
125 """is_palindrom(sequence) -> bool.
126
127 True is the sequence is a palindrom.
128 sequence is a DNA object."""
129 return sequence == DNA(Antiparallel(sequence))
130
132 """LocalTime() -> string.
133
134 LocalTime calculate the extension for emboss file for the current year and
135 month."""
136 t = time.gmtime()
137 year = str(t.tm_year)[-1]
138 month = str(t.tm_mon)
139 if len(month) == 1 : month = '0'+month
140 return year+month
141
142
144 """construct the attributes of the enzyme corresponding to 'name'."""
146 cls.opt_temp = 37
147 cls.inact_temp = 65
148 cls.substrat = 'DNA'
149 target = enzymedict[name]
150 cls.site = target[0]
151 cls.size = target[1]
152 cls.suppl = tuple(target[9])
153 cls.freq = target[11]
154 cls.ovhg = target[13]
155 cls.ovhgseq = target[14]
156 cls.bases = ()
157
158
159
160
161
162
163
164 if target[10] : cls.bases += ('Palindromic',)
165 else : cls.bases += ('NonPalindromic',)
166
167
168
169
170
171
172
173 if not target[2]:
174
175
176
177 cls.bases += ('NoCut','Unknown', 'NotDefined')
178 cls.fst5 = None
179 cls.fst3 = None
180 cls.scd5 = None
181 cls.scd3 = None
182 cls.ovhg = None
183 cls.ovhgseq = None
184 else:
185
186
187
188 if target[2] == 2:
189 cls.bases += ('OneCut',)
190 cls.fst5 = target[4]
191 cls.fst3 = target[5]
192 cls.scd5 = None
193 cls.scd3 = None
194 else:
195 cls.bases += ('TwoCuts',)
196 cls.fst5 = target[4]
197 cls.fst3 = target[5]
198 cls.scd5 = target[6]
199 cls.scd3 = target[7]
200
201
202
203
204
205
206
207
208
209
210
211
212
213 if target[3]:
214
215
216
217
218 cls.bases += ('Blunt', 'Defined')
219 cls.ovhg = 0
220 elif isinstance(cls.ovhg, int):
221
222
223
224 if cls.ovhg > 0:
225
226
227
228
229 cls.bases += ('Ov3', 'Ambiguous')
230 elif cls.ovhg < 0:
231
232
233
234
235 cls.bases += ('Ov5', 'Ambiguous')
236 else:
237
238
239
240 if cls.fst5 - (cls.fst3 + cls.size) < 0:
241 cls.bases += ('Ov5', 'Defined')
242 cls.ovhg = - len(cls.ovhg)
243 else:
244 cls.bases += ('Ov3', 'Defined')
245 cls.ovhg = + len(cls.ovhg)
246
247
248
249
250
251
252
253
254 if target[8]:
255 cls.bases += ('Meth_Dep', )
256 cls.compsite = target[12]
257 else:
258 cls.bases += ('Meth_Undep',)
259 cls.compsite = target[12]
260
261
262
263
264 if cls.suppl:
265 cls.bases += ('Commercially_available', )
266 else:
267 cls.bases += ('Not_available', )
268 cls.bases += ('AbstractCut', 'RestrictionType')
269 cls.__name__ = name
270 cls.results = None
271 cls.dna = None
272 cls.__bases__ = cls.bases
273 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site)
274 if not target[2] and cls.suppl:
275 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl])
276 print 'WARNING : It seems that %s is both commercially available\
277 \n\tand its characteristics are unknown. \
278 \n\tThis seems counter-intuitive.\
279 \n\tThere is certainly an error either in ranacompiler or\
280 \n\tin this REBASE release.\
281 \n\tThe supplier is : %s.' % (name, supp)
282 return
283
284
286
287 """Build the different types possible for Restriction Enzymes"""
288
290 """TypeCompiler() -> new TypeCompiler instance."""
291 pass
292
294 """TC.buildtype() -> generator.
295
296 build the new types that will be needed for constructing the
297 restriction enzymes."""
298 baT = (AbstractCut, RestrictionType)
299 cuT = (NoCut, OneCut, TwoCuts)
300 meT = (Meth_Dep, Meth_Undep)
301 paT = (Palindromic, NonPalindromic)
302 ovT = (Unknown, Blunt, Ov5, Ov3)
303 deT = (NotDefined, Defined, Ambiguous)
304 coT = (Commercially_available, Not_available)
305 All = (baT, cuT, meT, paT, ovT, deT, coT)
306
307
308
309
310
311
312 types = [(p,c,o,d,m,co,baT[0],baT[1])
313 for p in paT for c in cuT for o in ovT
314 for d in deT for m in meT for co in coT]
315 n= 1
316 for ty in types:
317 dct = {}
318 for t in ty:
319 dct.update(t.__dict__)
320
321
322
323
324
325
326 dct['results'] = []
327 dct['substrat'] = 'DNA'
328 dct['dna'] = None
329 if t == NoCut:
330 dct.update({'fst5':None,'fst3':None,
331 'scd5':None,'scd3':None,
332 'ovhg':None,'ovhgseq':None})
333 elif t == OneCut:
334 dct.update({'scd5':None, 'scd3':None})
335 class klass(type):
336 def __new__(cls):
337 return type.__new__(cls, 'type%i'%n,ty,dct)
338 def __init__(cls):
339 super(klass, cls).__init__('type%i'%n,ty,dct)
340 yield klass()
341 n+=1
342
343 start = '\n\
344 #!/usr/bin/env python\n\
345 #\n\
346 # Restriction Analysis Libraries.\n\
347 # Copyright (C) 2004. Frederic Sohm.\n\
348 #\n\
349 # This code is part of the Biopython distribution and governed by its\n\
350 # license. Please see the LICENSE file that should have been included\n\
351 # as part of this package.\n\
352 #\n\
353 # This file is automatically generated - do not edit it by hand! Instead,\n\
354 # use the tool Scripts/Restriction/ranacompiler.py which in turn uses\n\
355 # Bio/Restriction/_Update/RestrictionCompiler.py\n\
356 #\n\
357 # The following dictionaries used to be defined in one go, but that does\n\
358 # not work on Jython due to JVM limitations. Therefore we break this up\n\
359 # into steps, using temporary functions to avoid the JVM limits.\n\
360 \n\n'
361
363
364 - def __init__(self, e_mail='', ftp_proxy=''):
365 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance.
366
367 If the emboss files used for the construction need to be updated this
368 class will download them if the ftp connection is correctly set.
369 either in RanaConfig.py or given at run time.
370
371 e_mail is the e-mail address used as password for the anonymous
372 ftp connection.
373
374 proxy is the ftp_proxy to use if any."""
375 self.rebase_pass = e_mail or config.Rebase_password
376 self.proxy = ftp_proxy or config.ftp_proxy
377
379 """DB.build_dict() -> None.
380
381 Construct the dictionary and build the files containing the new
382 dictionaries."""
383
384
385
386 emboss_e, emboss_r, emboss_s = self.lastrebasefile()
387
388
389
390 self.information_mixer(emboss_r, emboss_e, emboss_s)
391 emboss_r.close()
392 emboss_e.close()
393 emboss_s.close()
394
395
396
397 tdct = {}
398 for klass in TypeCompiler().buildtype():
399 exec klass.__name__ +'= klass'
400 exec "tdct['"+klass.__name__+"'] = klass"
401
402
403
404
405
406
407
408 for name in enzymedict:
409
410
411
412 cls = newenzyme(name)
413
414
415
416 bases = cls.bases
417 clsbases = tuple([eval(x) for x in bases])
418 typestuff = ''
419 for n, t in tdct.iteritems():
420
421
422
423
424 if t.__bases__ == clsbases:
425 typestuff = t
426 typename = t.__name__
427 continue
428
429
430
431 dct = dict(cls.__dict__)
432 del dct['bases']
433 del dct['__bases__']
434 del dct['__name__']
435 classdict[name] = dct
436
437 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat',
438 'ovhg', 'ovhgseq','results', 'dna']
439 if typename in typedict:
440 typedict[typename][1].append(name)
441 else:
442 enzlst= []
443 tydct = dict(typestuff.__dict__)
444 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr])
445 enzlst.append(name)
446 typedict[typename] = (bases, enzlst)
447 for letter in cls.__dict__['suppl']:
448 supplier = suppliersdict[letter]
449 suppliersdict[letter][1].append(name)
450 if not classdict or not suppliersdict or not typedict:
451 print 'One of the new dictionaries is empty.'
452 print 'Check the integrity of the emboss file before continuing.'
453 print 'Update aborted.'
454 sys.exit()
455
456
457
458 print '\nThe new database contains %i enzymes.\n' % len(classdict)
459
460
461
462
463
464 update = os.getcwd()
465 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w')
466 print 'Writing the dictionary containing the new Restriction classes.\t',
467 results.write(start)
468 results.write('rest_dict = {}\n')
469 for name in sorted(classdict) :
470 results.write("def _temp():\n")
471 results.write(" return {\n")
472 for key, value in classdict[name].iteritems() :
473 results.write(" %s : %s,\n" % (repr(key), repr(value)))
474 results.write(" }\n")
475 results.write("rest_dict[%s] = _temp()\n" % repr(name))
476 results.write("\n")
477 print 'OK.\n'
478 print 'Writing the dictionary containing the suppliers datas.\t\t',
479 results.write('suppliers = {}\n')
480 for name in sorted(suppliersdict) :
481 results.write("def _temp():\n")
482 results.write(" return (\n")
483 for value in suppliersdict[name] :
484 results.write(" %s,\n" % repr(value))
485 results.write(" )\n")
486 results.write("suppliers[%s] = _temp()\n" % repr(name))
487 results.write("\n")
488 print 'OK.\n'
489 print 'Writing the dictionary containing the Restriction types.\t',
490 results.write('typedict = {}\n')
491 for name in sorted(typedict) :
492 results.write("def _temp():\n")
493 results.write(" return (\n")
494 for value in typedict[name] :
495 results.write(" %s,\n" % repr(value))
496 results.write(" )\n")
497 results.write("typedict[%s] = _temp()\n" % repr(name))
498 results.write("\n")
499
500
501
502 results.write("del _temp\n")
503 results.write("\n")
504 print 'OK.\n'
505 results.close()
506 return
507
509 """DB.install_dict() -> None.
510
511 Install the newly created dictionary in the site-packages folder.
512
513 May need super user privilege on some architectures."""
514 print '\n ' +'*'*78 + ' \n'
515 print '\n\t\tInstalling Restriction_Dictionary.py'
516 try:
517 import Bio.Restriction.Restriction_Dictionary as rd
518 except ImportError:
519 print '\
520 \n Unable to locate the previous Restriction_Dictionary.py module\
521 \n Aborting installation.'
522 sys.exit()
523
524
525
526 old = os.path.join(os.path.split(rd.__file__)[0],
527 'Restriction_Dictionary.py')
528
529 update_folder = os.getcwd()
530 shutil.copyfile(old, os.path.join(update_folder,
531 'Restriction_Dictionary.old'))
532
533
534
535 new = os.path.join(update_folder, 'Restriction_Dictionary.py')
536 try:
537 execfile(new)
538 print '\
539 \n\tThe new file seems ok. Proceeding with the installation.'
540 except SyntaxError:
541 print '\
542 \n The new dictionary file is corrupted. Aborting the installation.'
543 return
544 try:
545 shutil.copyfile(new, old)
546 print'\n\t Everything ok. If you need it a version of the old\
547 \n\t dictionary have been saved in the Updates folder under\
548 \n\t the name Restriction_Dictionary.old.'
549 print '\n ' +'*'*78 + ' \n'
550 except IOError:
551 print '\n ' +'*'*78 + ' \n'
552 print '\
553 \n\t WARNING : Impossible to install the new dictionary.\
554 \n\t Are you sure you have write permission to the folder :\n\
555 \n\t %s ?\n\n' % os.path.split(old)[0]
556 return self.no_install()
557 return
558
560 """BD.no_install() -> None.
561
562 build the new dictionary but do not install the dictionary."""
563 print '\n ' +'*'*78 + '\n'
564
565 try:
566 import Bio.Restriction.Restriction_Dictionary as rd
567 except ImportError:
568 print '\
569 \n Unable to locate the previous Restriction_Dictionary.py module\
570 \n Aborting installation.'
571 sys.exit()
572
573
574
575 old = os.path.join(os.path.split(rd.__file__)[0],
576 'Restriction_Dictionary.py')
577 update = os.getcwd()
578 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old'))
579 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0]
580 print "\t\tCompilation of the new dictionary : OK.\
581 \n\t\tInstallation : No.\n\
582 \n You will find the newly created 'Restriction_Dictionary.py' file\
583 \n in the folder : \n\
584 \n\t%s\n\
585 \n Make a copy of 'Restriction_Dictionary.py' and place it with \
586 \n the other Restriction libraries.\n\
587 \n note : \
588 \n This folder should be :\n\
589 \n\t%s\n" % places
590 print '\n ' +'*'*78 + '\n'
591 return
592
593
595 """BD.lastrebasefile() -> None.
596
597 Check the emboss files are up to date and download them if they are not.
598 """
599 embossnames = ('emboss_e', 'emboss_r', 'emboss_s')
600
601
602
603 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames]
604 update_needed = False
605
606 dircontent = os.listdir(os.getcwd())
607 base = os.getcwd()
608 for name in emboss_now:
609 if name in dircontent:
610 pass
611 else:
612 update_needed = True
613
614 if not update_needed:
615
616
617
618 print '\n Using the files : %s'% ', '.join(emboss_now)
619 return tuple([open(os.path.join(base, n)) for n in emboss_now])
620 else:
621
622
623
624 print '\n The rebase files are more than one month old.\
625 \n Would you like to update them before proceeding?(y/n)'
626 r = raw_input(' update [n] >>> ')
627 if r in ['y', 'yes', 'Y', 'Yes']:
628 updt = RebaseUpdate(self.rebase_pass, self.proxy)
629 updt.openRebase()
630 updt.getfiles()
631 updt.close()
632 print '\n Update complete. Creating the dictionaries.\n'
633 print '\n Using the files : %s'% ', '.join(emboss_now)
634 return tuple([open(os.path.join(base, n)) for n in emboss_now])
635 else:
636
637
638
639
640 class NotFoundError(Exception):
641 pass
642 for name in embossnames:
643 try:
644 for file in dircontent:
645 if file.startswith(name):
646 break
647 else:
648 pass
649 raise NotFoundError
650 except NotFoundError:
651 print "\nNo %s file found. Upgrade is impossible.\n"%name
652 sys.exit()
653 continue
654 pass
655
656
657
658 last = [0]
659 for file in dircontent:
660 fs = file.split('.')
661 try:
662 if fs[0] in embossnames and int(fs[1]) > int(last[-1]):
663 if last[0] : last.append(fs[1])
664 else : last[0] = fs[1]
665 else:
666 continue
667 except ValueError:
668 continue
669 last.sort()
670 last = last[::-1]
671 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0]
672 for number in last:
673 files = [(name, name+'.%s'%number) for name in embossnames]
674 strmess = '\nLast EMBOSS files found are :\n'
675 try:
676 for name,file in files:
677 if os.path.isfile(os.path.join(base, file)):
678 strmess += '\t%s.\n'%file
679 else:
680 raise ValueError
681 print strmess
682 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r')
683 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r')
684 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r')
685 return emboss_e, emboss_r, emboss_s
686 except ValueError:
687 continue
688
690 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:]
691 name = line[0].replace("-","_")
692 site = line[1]
693 dna = DNA(site)
694 size = line[2]
695
696
697
698 fst5 = line[5]
699 fst3 = line[6]
700 scd5 = line[7]
701 scd3 = line[8]
702
703
704
705
706 ovhg1 = fst5 - fst3
707 ovhg2 = scd5 - scd3
708
709
710
711
712
713
714
715 if fst5 < 0 : fst5 += 1
716 if fst3 < 0 : fst3 += 1
717 if scd5 < 0 : scd5 += 1
718 if scd3 < 0 : scd3 += 1
719
720 if ovhg2 != 0 and ovhg1 != ovhg2:
721
722
723
724
725
726
727
728
729
730 print '\
731 \nWARNING : %s cut twice with different overhang length each time.\
732 \n\tUnable to deal with this behaviour. \
733 \n\tThis enzyme will not be included in the database. Sorry.' %name
734 print '\tChecking :',
735 raise OverhangError
736 if 0 <= fst5 <= size and 0 <= fst3 <= size:
737
738
739
740 if fst5 < fst3:
741
742
743
744 ovhg1 = ovhgseq = site[fst5:fst3]
745 elif fst5 > fst3:
746
747
748
749 ovhg1 = ovhgseq = site[fst3:fst5]
750 else:
751
752
753
754 ovhg1 = ovhgseq = ''
755 for base in 'NRYWMSKHDBV':
756 if base in ovhg1:
757
758
759
760 ovhgseq = ovhg1
761 if fst5 < fst3 : ovhg1 = - len(ovhg1)
762 else : ovhg1 = len(ovhg1)
763 break
764 else:
765 continue
766 elif 0 <= fst5 <= size:
767
768
769
770 if fst5 < fst3:
771
772
773
774 ovhgseq = site[fst5:] + (fst3 - size) * 'N'
775 elif fst5 > fst3:
776
777
778
779 ovhgseq = abs(fst3) * 'N' + site[:fst5]
780 else:
781
782
783
784 ovhg1 = ovhgseq = ''
785 elif 0 <= fst3 <= size:
786
787
788
789 if fst5 < fst3:
790
791
792
793 ovhgseq = abs(fst5) * 'N' + site[:fst3]
794 elif fst5 > fst3:
795
796
797
798 ovhgseq = site[fst3:] + (fst5 - size) * 'N'
799 else:
800
801
802
803 raise ValueError('Error in #1')
804 elif fst3 < 0 and size < fst5:
805
806
807
808 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N'
809 elif fst5 < 0 and size <fst3:
810
811
812
813 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N'
814 else:
815
816
817
818 ovhgseq = 'N' * abs(ovhg1)
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838 for x in (5, 7):
839 if line[x] < 0 : line[x] += 1
840 for x in (6, 8):
841 if line[x] > 0 : line[x] -= size
842 elif line[x] < 0 : line[x] = line[x] - size + 1
843
844
845
846
847
848
849
850 rg = ''
851 if is_palindrom(dna):
852 line.append(True)
853 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
854 else:
855 line.append(False)
856 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
857 antisense = ''.join(['(?P<', name, '_as>',
858 regex(Antiparallel(dna)), ')'])
859 rg = sense + '|' + antisense
860
861
862
863 f = [4/len(dna_alphabet[l]) for l in site.upper()]
864 freq = reduce(lambda x, y : x*y, f)
865 line.append(freq)
866
867
868
869
870 line.append(rg)
871 line.append(ovhg1)
872 line.append(ovhgseq)
873 return line
874
876
877
878
879 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
880
882
883
884
885 take = itertools.takewhile
886 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])]
887 index += len(block)+1
888 return block, index
889
890 - def get(self, block):
891
892
893
894
895
896
897
898 bl3 = block[3].strip()
899 if not bl3 : bl3 = False
900 return (block[0].strip(), bl3, block[5].strip())
901
976