1
2
3
4
5 """Optimised sequence conversion code (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.convert(...) function which is the
9 public interface for this.
10
11 The idea here is rather while doing this will work:
12
13 from Bio import SeqIO
14 records = SeqIO.parse(in_handle, in_format)
15 count = SeqIO.write(records, out_handle, out_format)
16
17 it is shorter to write:
18
19 from Bio import SeqIO
20 count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
21
22 Also, the convert function can take a number of special case optimisations. This
23 means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
24 All these file format specific optimisations are handled by this (private) module.
25 """
26
27 from Bio import SeqIO
28
29
37
45
60
61 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
62 """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
63 from Bio.SeqIO.QualityIO import FastqGeneralIterator
64
65 count = 0
66 null = chr(0)
67 for title, seq, old_qual in FastqGeneralIterator(in_handle):
68 count += 1
69
70 qual = old_qual.translate(mapping)
71 if null in qual:
72 raise ValueError("Invalid character in quality string")
73 if truncate_char in qual:
74 qual = qual.replace(truncate_char, chr(126))
75 import warnings
76 warnings.warn(truncate_msg)
77 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
78 return count
79
81 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
82
83 Useful for removing line wrapping and the redundant second identifier
84 on the plus lines. Will check also check the quality string is valid.
85
86 Avoids creating SeqRecord and Seq objects in order to speed up this
87 conversion.
88 """
89
90 mapping = "".join([chr(0) for ascii in range(0,33)] \
91 +[chr(ascii) for ascii in range(33,127)] \
92 +[chr(0) for ascii in range(127,256)])
93 assert len(mapping)==256
94 return _fastq_generic(in_handle, out_handle, mapping)
95
97 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
98
99 Useful for removing line wrapping and the redundant second identifier
100 on the plus lines. Will check also check the quality string is valid.
101 Avoids creating SeqRecord and Seq objects in order to speed up this
102 conversion.
103 """
104
105 mapping = "".join([chr(0) for ascii in range(0,59)] \
106 +[chr(ascii) for ascii in range(59,127)] \
107 +[chr(0) for ascii in range(127,256)])
108 assert len(mapping)==256
109 return _fastq_generic(in_handle, out_handle, mapping)
110
112 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
113
114 Useful for removing line wrapping and the redundant second identifier
115 on the plus lines. Will check also check the quality string is valid.
116 Avoids creating SeqRecord and Seq objects in order to speed up this
117 conversion.
118 """
119
120 mapping = "".join([chr(0) for ascii in range(0,64)] \
121 +[chr(ascii) for ascii in range(64,127)] \
122 +[chr(0) for ascii in range(127,256)])
123 assert len(mapping)==256
124 return _fastq_generic(in_handle, out_handle, mapping)
125
127 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
128
129 Avoids creating SeqRecord and Seq objects in order to speed up this
130 conversion.
131 """
132
133 mapping = "".join([chr(0) for ascii in range(0,64)] \
134 +[chr(33+q) for q in range(0,62+1)] \
135 +[chr(0) for ascii in range(127,256)])
136 assert len(mapping)==256
137 return _fastq_generic(in_handle, out_handle, mapping)
138
140 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
141
142 Avoids creating SeqRecord and Seq objects in order to speed up this
143 conversion. Will issue a warning if the scores had to be truncated at 62
144 (maximum possible in the Illumina 1.3+ FASTQ format)
145 """
146
147 trunc_char = chr(1)
148 mapping = "".join([chr(0) for ascii in range(0,33)] \
149 +[chr(64+q) for q in range(0,62+1)] \
150 +[trunc_char for ascii in range(96,127)] \
151 +[chr(0) for ascii in range(127,256)])
152 assert len(mapping)==256
153 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
154 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
155
170
172 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
173
174 Avoids creating SeqRecord and Seq objects in order to speed up this
175 conversion. Will issue a warning if the scores had to be truncated at 62
176 (maximum possible in the Solexa FASTQ format)
177 """
178
179 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
180 trunc_char = chr(1)
181 mapping = "".join([chr(0) for ascii in range(0,33)] \
182 +[chr(64+int(round(solexa_quality_from_phred(q)))) \
183 for q in range(0,62+1)] \
184 +[trunc_char for ascii in range(96,127)] \
185 +[chr(0) for ascii in range(127,256)])
186 assert len(mapping)==256
187 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
188 "Data loss - max Solexa quality 62 in Solexa FASTQ")
189
190
205
221
222
224 """Fast FASTQ to FASTA conversion (PRIVATE).
225
226 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
227 Seq objects in order to speed up this conversion.
228
229 NOTE - This does NOT check the characters used in the FASTQ quality string are valid!
230 """
231 from Bio.SeqIO.QualityIO import FastqGeneralIterator
232
233 count = 0
234 for title, seq, qual in FastqGeneralIterator(in_handle):
235 count += 1
236 out_handle.write(">%s\n" % title)
237
238 for i in range(0, len(seq), 60):
239 out_handle.write(seq[i:i+60] + "\n")
240 return count
241
243 """Fast FASTQ to simple tabbed conversion (PRIVATE).
244
245 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
246 Seq objects in order to speed up this conversion.
247
248 NOTE - This does NOT check the characters used in the FASTQ quality string are valid!
249 """
250 from Bio.SeqIO.QualityIO import FastqGeneralIterator
251
252 count = 0
253 for title, seq, qual in FastqGeneralIterator(in_handle):
254 count += 1
255 out_handle.write("%s\t%s\n" % (title.split(None,1)[0], seq))
256 return count
257
258
259 _converter = {
260 ("genbank", "fasta") : _genbank_convert_fasta,
261 ("gb", "fasta") : _genbank_convert_fasta,
262 ("embl", "fasta") : _embl_convert_fasta,
263 ("fastq", "fasta") : _fastq_convert_fasta,
264 ("fastq-sanger", "fasta") : _fastq_convert_fasta,
265 ("fastq-solexa", "fasta") : _fastq_convert_fasta,
266 ("fastq-illumina", "fasta") : _fastq_convert_fasta,
267 ("fastq", "tab") : _fastq_convert_tab,
268 ("fastq-sanger", "tab") : _fastq_convert_tab,
269 ("fastq-solexa", "tab") : _fastq_convert_tab,
270 ("fastq-illumina", "tab") : _fastq_convert_tab,
271 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger,
272 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger,
273 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger,
274 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger,
275 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
276 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
277 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger,
278 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger,
279 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
280 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
281 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa,
282 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa,
283 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
284 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
285 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina,
286 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina,
287 }
288
289 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
290 """SeqIO conversion function (PRIVATE)."""
291 try:
292 f = _converter[(in_format, out_format)]
293 except KeyError:
294 f = None
295 if f:
296 return f(in_handle, out_handle, alphabet)
297 else:
298 records = SeqIO.parse(in_handle, in_format, alphabet)
299 return SeqIO.write(records, out_handle, out_format)
300