1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "phd" file format.
9
10 PHD files are output by PHRED and used by PHRAP and CONSED.
11
12 You are expected to use this module via the Bio.SeqIO functions, under the
13 format name "phd". See also the underlying Bio.Sequencing.Phd module.
14
15 For example, using Bio.SeqIO we can read in one of the example PHRED files
16 from the Biopython unit tests:
17
18 >>> from Bio import SeqIO
19 >>> for record in SeqIO.parse(open("Phd/phd1"), "phd"):
20 ... print record.id
21 ... print record.seq[:10], "..."
22 ... print record.letter_annotations["phred_quality"][:10], "..."
23 34_222_(80-A03-19).b.ab1
24 ctccgtcgga ...
25 [9, 9, 10, 19, 22, 37, 28, 28, 24, 22] ...
26 425_103_(81-A03-19).g.ab1
27 cgggatccca ...
28 [14, 17, 22, 10, 10, 10, 15, 8, 8, 9] ...
29 425_7_(71-A03-19).b.ab1
30 acataaatca ...
31 [10, 10, 10, 10, 8, 8, 6, 6, 6, 6] ...
32
33 Since PHRED files contain quality scores, you can save them as FASTQ or as
34 QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
35 method of the SeqRecord object:
36
37 >>> print record[:50].format("fastq")
38 @425_7_(71-A03-19).b.ab1
39 acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
40 +
41 ++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
42 <BLANKLINE>
43
44 Or,
45
46 >>> print record[:50].format("qual")
47 >425_7_(71-A03-19).b.ab1
48 10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
49 10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
50 <BLANKLINE>
51
52 Note these examples only show the first 50 bases to keep the output short.
53 """
54
55 from Bio.SeqRecord import SeqRecord
56 from Bio.Sequencing import Phd
57 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
58 from Bio.SeqIO import QualityIO
59
60
62 """Returns SeqRecord objects from a PHD file.
63
64 This uses the Bio.Sequencing.Phd module to do the hard work.
65 """
66 phd_records = Phd.parse(handle)
67 for phd_record in phd_records:
68
69
70
71
72
73 name = phd_record.file_name.split(None,1)[0]
74 seq_record = SeqRecord(phd_record.seq,
75 id = name, name = name,
76 description= phd_record.file_name)
77
78 seq_record.annotations = phd_record.comments
79
80 seq_record.letter_annotations["phred_quality"] = \
81 [int(site[1]) for site in phd_record.sites]
82 try:
83 seq_record.letter_annotations["peak_location"] = \
84 [int(site[2]) for site in phd_record.sites]
85 except IndexError:
86
87
88 pass
89 yield seq_record
90
91
93 """Class to write Phd format files"""
94
97
99 """Write a single Phd record to the file."""
100 assert record.seq, "No sequence present in SeqRecord"
101
102
103 phred_qualities = QualityIO._get_phred_quality(record)
104 peak_locations = record.letter_annotations.get("peak_location", None)
105 assert len(record.seq) == len(phred_qualities), "Number of " + \
106 "phd quality scores does not match length of sequence"
107 if peak_locations:
108 assert len(record.seq) == len(peak_locations), "Number " + \
109 "of peak location scores does not match length of sequence"
110 if None in phred_qualities:
111 raise ValueError("A quality value of None was found")
112 if record.description.startswith("%s " % record.id):
113 title = record.description
114 else:
115 title = "%s %s" % (record.id, record.description)
116 self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n" \
117 % self.clean(title))
118 for annot in [k.lower() for k in Phd.CKEYWORDS]:
119 value = None
120 if annot == "trim":
121 if record.annotations.get("trim", None):
122 value = "%s %s %.4f" % record.annotations["trim"]
123 elif annot == "trace_peak_area_ratio":
124 if record.annotations.get("trace_peak_area_ratio", None):
125 value = "%.4f" % record.annotations["trace_peak_area_ratio"]
126 else:
127 value = record.annotations.get(annot, None)
128 if value or value == 0:
129 self.handle.write("%s: %s\n" % (annot.upper(), value))
130
131 self.handle.write("END_COMMENT\nBEGIN_DNA\n")
132 for i, site in enumerate(record.seq):
133 if peak_locations:
134 self.handle.write("%s %i %i\n" % (
135 site,
136 round(phred_qualities[i]),
137 peak_locations[i])
138 )
139 else:
140 self.handle.write("%s %i\n" % (
141 site,
142 round(phred_qualities[i]))
143 )
144
145 self.handle.write("END_DNA\nEND_SEQUENCE\n")
146
148 """Run the Bio.SeqIO.PhdIO module's doctests.
149
150 This will try and locate the unit tests directory, and run the doctests
151 from there in order that the relative paths used in the examples work.
152 """
153 import doctest
154 import os
155 if os.path.isdir(os.path.join("..","..","Tests")):
156 print "Runing doctests..."
157 cur_dir = os.path.abspath(os.curdir)
158 os.chdir(os.path.join("..","..","Tests"))
159 assert os.path.isfile("Phd/phd1")
160 doctest.testmod()
161 os.chdir(cur_dir)
162 del cur_dir
163 print "Done"
164
165 if __name__ == "__main__":
166 _test()
167