1
2
3
4
5
6
7 """Functions to calculate assorted sequence checksums."""
8
9
10
11
12 from binascii import crc32 as _crc32
13
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    Objects that expose ``tostring()`` are converted with it first; anything
    else is used as given. Byte strings are hashed unchanged. Any other
    value is converted with ``str()`` and encoded as UTF-8, because
    ``binascii.crc32`` only accepts bytes-like input on Python 3.
    """
    try:
        data = seq.tostring()
    except AttributeError:
        # No tostring(): treat the argument itself as the sequence data.
        data = seq
    if not isinstance(data, (bytes, bytearray)):
        data = str(data).encode("utf-8")
    return _crc32(data)
22
def _init_table_h():
    """Build the 256-entry lookup table used by crc64().

    Entry ``i`` is the high 32-bit word left after pushing byte value ``i``
    through eight shift/XOR rounds.
    """
    table = []
    for byte in range(256):
        low = byte
        high = 0
        for _ in range(8):
            shifted_out = low & 1
            low >>= 1
            if high & 1:
                # The bit leaving the high word enters the top of the low word.
                low |= 1 << 31
            high >>= 1
            if shifted_out:
                high ^= 0xD8000000
        table.append(high)
    return table


# Computed once at import time; crc64() indexes it once per character.
_table_h = _init_table_h()


def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The checksum is kept as two 32-bit halves while iterating over the
    characters of ``s``. The result is the string "CRC-" followed by
    sixteen upper-case hexadecimal digits, high half first.
    """
    low = 0
    high = 0
    for char in s:
        # All three values are derived from the halves as they were
        # before this character, so the index must be taken first.
        index = (low ^ ord(char)) & 0xFF
        low = (low >> 8) | ((high & 0xFF) << 24)
        high = (high >> 8) ^ _table_h[index]
    return "CRC-%08X%08X" % (high, low)
53
54
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.

    Non-string arguments are converted with ``tostring()`` when they
    provide it; byte strings are decoded as ASCII and anything else goes
    through ``str()``.
    """
    if not isinstance(seq, str):
        try:
            seq = seq.tostring()
        except AttributeError:
            if isinstance(seq, (bytes, bytearray)):
                seq = seq.decode("ascii")
            else:
                seq = str(seq)
    # The character at 0-based offset k is weighted by k % 57 + 1, i.e. the
    # weight counts 1..57 and then starts over at 1.
    checksum = sum(
        (offset % 57 + 1) * ord(char.upper())
        for offset, char in enumerate(seq)
    )
    return checksum % 10000
72
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier):
    the base64 encoding of the SHA-1 digest of the upper-cased sequence,
    with the trailing "=" padding removed.

    Non-string arguments are converted with ``tostring()`` when they
    provide it, otherwise with ``str()``. Text is encoded as UTF-8 before
    hashing; byte strings are hashed as given (after upper-casing).

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    import hashlib

    if not isinstance(seq, (str, bytes, bytearray)):
        try:
            seq = seq.tostring()
        except AttributeError:
            seq = str(seq)
    if isinstance(seq, (bytes, bytearray)):
        data = bytes(seq).upper()
    else:
        # hashlib only accepts bytes-like input on Python 3.
        data = seq.upper().encode("utf-8")
    encoded = base64.b64encode(hashlib.sha1(data).digest())
    if not isinstance(encoded, str):
        # b64encode returns bytes on Python 3; callers expect text.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
106
if __name__ == "__main__":
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Quick self test")

    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # The two chains differ, yet they are expected to share one CRC64 value.
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # Their SEGUIDs, by contrast, are expected to be distinct.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")
125