Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is rather while doing this will work: 
 12   
 13  from Bio import SeqIO 
 14  records = SeqIO.parse(in_handle, in_format) 
 15  count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write: 
 18   
 19  from Bio import SeqIO 
 20  count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import SeqIO 
 28  #NOTE - Lots of lazy imports further on... 
 29   
30 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
31 """Fast GenBank to FASTA (PRIVATE).""" 32 #We don't need to parse the features... 33 from Bio.GenBank.Scanner import GenBankScanner 34 records = GenBankScanner().parse_records(in_handle, do_features=False) 35 #For FASTA output we can ignore the alphabet too 36 return SeqIO.write(records, out_handle, "fasta")
37
38 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
39 """Fast EMBL to FASTA (PRIVATE).""" 40 #We don't need to parse the features... 41 from Bio.GenBank.Scanner import EmblScanner 42 records = EmblScanner().parse_records(in_handle, do_features=False) 43 #For FASTA output we can ignore the alphabet too 44 return SeqIO.write(records, out_handle, "fasta")
45
46 -def _fastq_generic(in_handle, out_handle, mapping):
47 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 48 from Bio.SeqIO.QualityIO import FastqGeneralIterator 49 #For real speed, don't even make SeqRecord and Seq objects! 50 count = 0 51 null = chr(0) 52 for title, seq, old_qual in FastqGeneralIterator(in_handle): 53 count += 1 54 #map the qual... 55 qual = old_qual.translate(mapping) 56 if null in qual: 57 raise ValueError("Invalid character in quality string") 58 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 59 return count
60
61 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
62 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 63 from Bio.SeqIO.QualityIO import FastqGeneralIterator 64 #For real speed, don't even make SeqRecord and Seq objects! 65 count = 0 66 null = chr(0) 67 for title, seq, old_qual in FastqGeneralIterator(in_handle): 68 count += 1 69 #map the qual... 70 qual = old_qual.translate(mapping) 71 if null in qual: 72 raise ValueError("Invalid character in quality string") 73 if truncate_char in qual: 74 qual = qual.replace(truncate_char, chr(126)) 75 import warnings 76 warnings.warn(truncate_msg) 77 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 78 return count
79
80 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
81 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 82 83 Useful for removing line wrapping and the redundant second identifier 84 on the plus lines. Will check also check the quality string is valid. 85 86 Avoids creating SeqRecord and Seq objects in order to speed up this 87 conversion. 88 """ 89 #Map unexpected chars to null 90 mapping = "".join([chr(0) for ascii in range(0,33)] \ 91 +[chr(ascii) for ascii in range(33,127)] \ 92 +[chr(0) for ascii in range(127,256)]) 93 assert len(mapping)==256 94 return _fastq_generic(in_handle, out_handle, mapping)
95
96 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
97 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 98 99 Useful for removing line wrapping and the redundant second identifier 100 on the plus lines. Will check also check the quality string is valid. 101 Avoids creating SeqRecord and Seq objects in order to speed up this 102 conversion. 103 """ 104 #Map unexpected chars to null 105 mapping = "".join([chr(0) for ascii in range(0,59)] \ 106 +[chr(ascii) for ascii in range(59,127)] \ 107 +[chr(0) for ascii in range(127,256)]) 108 assert len(mapping)==256 109 return _fastq_generic(in_handle, out_handle, mapping)
110
111 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
112 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 113 114 Useful for removing line wrapping and the redundant second identifier 115 on the plus lines. Will check also check the quality string is valid. 116 Avoids creating SeqRecord and Seq objects in order to speed up this 117 conversion. 118 """ 119 #Map unexpected chars to null 120 mapping = "".join([chr(0) for ascii in range(0,64)] \ 121 +[chr(ascii) for ascii in range(64,127)] \ 122 +[chr(0) for ascii in range(127,256)]) 123 assert len(mapping)==256 124 return _fastq_generic(in_handle, out_handle, mapping)
125
126 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
127 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 128 129 Avoids creating SeqRecord and Seq objects in order to speed up this 130 conversion. 131 """ 132 #Map unexpected chars to null 133 mapping = "".join([chr(0) for ascii in range(0,64)] \ 134 +[chr(33+q) for q in range(0,62+1)] \ 135 +[chr(0) for ascii in range(127,256)]) 136 assert len(mapping)==256 137 return _fastq_generic(in_handle, out_handle, mapping)
138
139 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
140 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 141 142 Avoids creating SeqRecord and Seq objects in order to speed up this 143 conversion. Will issue a warning if the scores had to be truncated at 62 144 (maximum possible in the Illumina 1.3+ FASTQ format) 145 """ 146 #Map unexpected chars to null 147 trunc_char = chr(1) 148 mapping = "".join([chr(0) for ascii in range(0,33)] \ 149 +[chr(64+q) for q in range(0,62+1)] \ 150 +[trunc_char for ascii in range(96,127)] \ 151 +[chr(0) for ascii in range(127,256)]) 152 assert len(mapping)==256 153 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 154 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
155
156 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
157 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 158 159 Avoids creating SeqRecord and Seq objects in order to speed up this 160 conversion. 161 """ 162 #Map unexpected chars to null 163 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 164 mapping = "".join([chr(0) for ascii in range(0,59)] \ 165 +[chr(33+int(round(phred_quality_from_solexa(q)))) \ 166 for q in range(-5,62+1)]\ 167 +[chr(0) for ascii in range(127,256)]) 168 assert len(mapping)==256 169 return _fastq_generic(in_handle, out_handle, mapping)
170
171 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
172 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 173 174 Avoids creating SeqRecord and Seq objects in order to speed up this 175 conversion. Will issue a warning if the scores had to be truncated at 62 176 (maximum possible in the Solexa FASTQ format) 177 """ 178 #Map unexpected chars to null 179 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 180 trunc_char = chr(1) 181 mapping = "".join([chr(0) for ascii in range(0,33)] \ 182 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 183 for q in range(0,62+1)] \ 184 +[trunc_char for ascii in range(96,127)] \ 185 +[chr(0) for ascii in range(127,256)]) 186 assert len(mapping)==256 187 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 188 "Data loss - max Solexa quality 62 in Solexa FASTQ")
189 190
191 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
192 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 193 194 Avoids creating SeqRecord and Seq objects in order to speed up this 195 conversion. 196 """ 197 #Map unexpected chars to null 198 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 199 mapping = "".join([chr(0) for ascii in range(0,59)] \ 200 +[chr(64+int(round(phred_quality_from_solexa(q)))) \ 201 for q in range(-5,62+1)]\ 202 +[chr(0) for ascii in range(127,256)]) 203 assert len(mapping)==256 204 return _fastq_generic(in_handle, out_handle, mapping)
205
206 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
207 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 208 209 Avoids creating SeqRecord and Seq objects in order to speed up this 210 conversion. 211 """ 212 #Map unexpected chars to null 213 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 214 trunc_char = chr(1) 215 mapping = "".join([chr(0) for ascii in range(0,64)] \ 216 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 217 for q in range(0,62+1)] \ 218 +[chr(0) for ascii in range(127,256)]) 219 assert len(mapping)==256 220 return _fastq_generic(in_handle, out_handle, mapping)
221 222
223 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
224 """Fast FASTQ to FASTA conversion (PRIVATE). 225 226 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 227 Seq objects in order to speed up this conversion. 228 229 NOTE - This does NOT check the characters used in the FASTQ quality string are valid! 230 """ 231 from Bio.SeqIO.QualityIO import FastqGeneralIterator 232 #For real speed, don't even make SeqRecord and Seq objects! 233 count = 0 234 for title, seq, qual in FastqGeneralIterator(in_handle): 235 count += 1 236 out_handle.write(">%s\n" % title) 237 #Do line wrapping 238 for i in range(0, len(seq), 60): 239 out_handle.write(seq[i:i+60] + "\n") 240 return count
241
242 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
243 """Fast FASTQ to simple tabbed conversion (PRIVATE). 244 245 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 246 Seq objects in order to speed up this conversion. 247 248 NOTE - This does NOT check the characters used in the FASTQ quality string are valid! 249 """ 250 from Bio.SeqIO.QualityIO import FastqGeneralIterator 251 #For real speed, don't even make SeqRecord and Seq objects! 252 count = 0 253 for title, seq, qual in FastqGeneralIterator(in_handle): 254 count += 1 255 out_handle.write("%s\t%s\n" % (title.split(None,1)[0], seq)) 256 return count
257 258 #TODO? - Handling aliases explicitly would let us shorten this list: 259 _converter = { 260 ("genbank", "fasta") : _genbank_convert_fasta, 261 ("gb", "fasta") : _genbank_convert_fasta, 262 ("embl", "fasta") : _embl_convert_fasta, 263 ("fastq", "fasta") : _fastq_convert_fasta, 264 ("fastq-sanger", "fasta") : _fastq_convert_fasta, 265 ("fastq-solexa", "fasta") : _fastq_convert_fasta, 266 ("fastq-illumina", "fasta") : _fastq_convert_fasta, 267 ("fastq", "tab") : _fastq_convert_tab, 268 ("fastq-sanger", "tab") : _fastq_convert_tab, 269 ("fastq-solexa", "tab") : _fastq_convert_tab, 270 ("fastq-illumina", "tab") : _fastq_convert_tab, 271 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger, 272 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger, 273 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger, 274 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger, 275 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 276 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 277 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger, 278 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger, 279 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 280 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 281 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa, 282 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa, 283 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 284 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 285 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina, 286 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina, 287 } 288
289 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
290 """SeqIO conversion function (PRIVATE).""" 291 try: 292 f = _converter[(in_format, out_format)] 293 except KeyError: 294 f = None 295 if f: 296 return f(in_handle, out_handle, alphabet) 297 else: 298 records = SeqIO.parse(in_handle, in_format, alphabet) 299 return SeqIO.write(records, out_handle, out_format)
300