Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Note 
 12  ==== 
 13  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 14  a dot/period (".") in a sequence is interpreted as meaning the same 
 15  character as in the first sequence.  The PHYLIP 3.6 documentation says: 
 16   
 17     "a period was also previously allowed but it is no longer allowed, 
 18     because it sometimes is used in different senses in other programs" 
 19   
 20  At the time of writing, we do nothing special with a dot/period. 
 21  """ 
 22      
 23  from Bio.Alphabet import single_letter_alphabet 
 24  from Bio.Align.Generic import Alignment 
 25  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 26   
27 -class PhylipWriter(SequentialAlignmentWriter):
28 """Phylip alignment writer."""
29 - def write_alignment(self, alignment):
30 """Use this to write (another) single alignment to an open file. 31 32 This code will write interlaced alignments (when the sequences are 33 longer than 50 characters). 34 35 Note that record identifiers are strictly truncated at 10 characters. 36 37 For more information on the file format, please see: 38 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 39 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 40 """ 41 truncate=10 42 records = alignment.get_all_seqs() 43 handle = self.handle 44 45 if len(records)==0: 46 raise ValueError("Must have at least one sequence") 47 length_of_seqs = alignment.get_alignment_length() 48 for record in records: 49 if length_of_seqs != len(record.seq): 50 raise ValueError("Sequences must all be the same length") 51 if length_of_seqs <= 0: 52 raise ValueError("Non-empty sequences are required") 53 54 if len(records) > len(set([r.id[:truncate] for r in records])): 55 raise ValueError("Repeated identifier, possibly due to truncation") 56 57 58 # From experimentation, the use of tabs is not understood by the 59 # EMBOSS suite. The nature of the expected white space is not 60 # defined in the PHYLIP documentation, simply "These are in free 61 # format, separated by blanks". We'll use spaces to keep EMBOSS 62 # happy. 63 handle.write(" %i %s\n" % (len(records), length_of_seqs)) 64 block=0 65 while True: 66 for record in records: 67 if block==0: 68 #Write name (truncated/padded to 10 characters) 69 """ 70 Quoting the PHYLIP version 3.6 documentation: 71 72 The name should be ten characters in length, filled out to 73 the full ten characters by blanks if shorter. Any printable 74 ASCII/ISO character is allowed in the name, except for 75 parentheses ("(" and ")"), square brackets ("[" and "]"), 76 colon (":"), semicolon (";") and comma (","). If you forget 77 to extend the names to ten characters in length by blanks, 78 the program [i.e. PHYLIP] will get out of synchronization 79 with the contents of the data file, and an error message will 80 result. 81 82 Note that Tab characters count as only one character in the 83 species names. Their inclusion can cause trouble. 84 """ 85 name = record.id.strip() 86 #Either remove the banned characters, or map them to something 87 #else like an underscore "_" or pipe "|" character... 88 for char in "[](),": 89 name = name.replace(char,"") 90 for char in ":;": 91 name = name.replace(char,"|") 92 93 #Now truncate and right pad to expected length. 94 handle.write(name[:truncate].ljust(truncate)) 95 else: 96 #write 10 space indent 97 handle.write(" "*truncate) 98 #Write five chunks of ten letters per line... 99 for chunk in range(0,5): 100 i = block*50 + chunk*10 101 seq_segment = record.seq.tostring()[i:i+10] 102 #TODO - Force any gaps to be '-' character? Look at the alphabet... 103 #TODO - How to cope with '?' or '.' in the sequence? 104 handle.write(" %s" % seq_segment) 105 if i+10 > length_of_seqs : break 106 handle.write("\n") 107 block=block+1 108 if block*50 > length_of_seqs : break 109 handle.write("\n")
110
111 -class PhylipIterator(AlignmentIterator):
112 """Reads a Phylip alignment file returning an Alignment object iterator. 113 114 Record identifiers are limited to at most 10 characters. 115 116 It only copes with interlaced phylip files! Sequential files won't work 117 where the sequences are split over multiple lines. 118 119 For more information on the file format, please see: 120 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 121 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 122 """ 123
124 - def _is_header(self, line):
125 line = line.strip() 126 parts = filter(None, line.split()) 127 if len(parts)!=2: 128 return False # First line should have two integers 129 try: 130 number_of_seqs = int(parts[0]) 131 length_of_seqs = int(parts[1]) 132 return True 133 except ValueError: 134 return False # First line should have two integers
135
136 - def next(self):
137 handle = self.handle 138 139 try: 140 #Header we saved from when we were parsing 141 #the previous alignment. 142 line = self._header 143 del self._header 144 except AttributeError: 145 line = handle.readline() 146 147 if not line: return 148 line = line.strip() 149 parts = filter(None, line.split()) 150 if len(parts)!=2: 151 raise ValueError("First line should have two integers") 152 try: 153 number_of_seqs = int(parts[0]) 154 length_of_seqs = int(parts[1]) 155 except ValueError: 156 raise ValueError("First line should have two integers") 157 158 assert self._is_header(line) 159 160 if self.records_per_alignment is not None \ 161 and self.records_per_alignment != number_of_seqs: 162 raise ValueError("Found %i records in this alignment, told to expect %i" \ 163 % (number_of_seqs, self.records_per_alignment)) 164 165 ids = [] 166 seqs = [] 167 168 #Expects STRICT truncation/padding to 10 characters 169 #Does not require any white space between name and seq. 170 for i in range(0,number_of_seqs): 171 line = handle.readline().rstrip() 172 ids.append(line[:10].strip()) #first ten characters 173 seqs.append([line[10:].strip().replace(" ","")]) 174 175 #Look for further blocks 176 line="" 177 while True: 178 #Skip any blank lines between blocks... 179 while ""==line.strip(): 180 line = handle.readline() 181 if not line : break #end of file 182 if not line : break #end of file 183 184 if self._is_header(line): 185 #Looks like the start of a concatenated alignment 186 self._header = line 187 break 188 189 #print "New block..." 190 for i in range(0,number_of_seqs): 191 seqs[i].append(line.strip().replace(" ","")) 192 line = handle.readline() 193 if (not line) and i+1 < number_of_seqs: 194 raise ValueError("End of file mid-block") 195 if not line : break #end of file 196 197 alignment = Alignment(self.alphabet) 198 for i in range(0,number_of_seqs): 199 seq = "".join(seqs[i]) 200 if len(seq)!=length_of_seqs: 201 raise ValueError("Sequence %i length %i, expected length %i" \ 202 % (i+1, len(seq), length_of_seqs)) 203 alignment.add_sequence(ids[i], seq) 204 205 record = alignment.get_all_seqs()[-1] 206 assert ids[i] == record.id or ids[i] == record.description 207 record.id = ids[i] 208 record.name = ids[i] 209 record.description = ids[i] 210 return alignment
211 212 if __name__=="__main__": 213 print "Running short mini-test" 214 215 phylip_text=""" 8 286 216 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 217 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 218 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 219 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 220 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 221 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 222 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 223 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 224 225 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 226 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 227 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 228 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 229 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 230 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 231 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 232 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 233 234 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 235 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 236 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 237 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 238 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 239 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 240 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 241 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 242 243 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 244 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 245 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 246 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 247 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 248 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 249 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 250 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 251 252 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 253 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 254 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 255 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 256 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 257 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 258 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 259 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 260 261 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 262 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 263 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 264 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 265 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 266 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 267 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 268 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 269 """ 270 271 from cStringIO import StringIO 272 handle = StringIO(phylip_text) 273 count=0 274 for alignment in PhylipIterator(handle): 275 for record in alignment.get_all_seqs(): 276 count=count+1 277 print record.id 278 #print record.seq.tostring() 279 assert count == 8 280 281 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 282 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 283 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 284 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 285 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper() 286 assert record.seq.tostring().replace("-","") == expected 287 288 #From here: 289 #http://atgc.lirmm.fr/phyml/usersguide.html 290 phylip_text2="""5 60 291 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 292 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 293 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 294 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 295 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 296 297 GAAATGGTCAATATTACAAGGT 298 GAAATGGTCAACATTAAAAGAT 299 GAAATCGTCAATATTAAAAGGT 300 GAAATGGTCAATCTTAAAAGGT 301 GAAATGGTCAATATTAAAAGGT""" 302 303 phylip_text3="""5 60 304 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 305 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 306 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 307 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 308 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 309 310 handle = StringIO(phylip_text2) 311 list2 = list(PhylipIterator(handle)) 312 handle.close() 313 assert len(list2)==1 314 assert len(list2[0].get_all_seqs())==5 315 316 handle = StringIO(phylip_text3) 317 list3 = list(PhylipIterator(handle)) 318 handle.close() 319 assert len(list3)==1 320 assert len(list3[0].get_all_seqs())==5 321 322 for i in range(0,5): 323 list2[0].get_all_seqs()[i].id == list3[0].get_all_seqs()[i].id 324 list2[0].get_all_seqs()[i].seq.tostring() == list3[0].get_all_seqs()[i].seq.tostring() 325 326 #From here: 327 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 328 #Note the lack of any white space between names 2 and 3 and their seqs. 329 phylip_text4=""" 5 42 330 Turkey AAGCTNGGGC ATTTCAGGGT 331 Salmo gairAAGCCTTGGC AGTGCAGGGT 332 H. SapiensACCGGTTGGC CGTTCAGGGT 333 Chimp AAACCCTTGC CGTTACGCTT 334 Gorilla AAACCCTTGC CGGTACGCTT 335 336 GAGCCCGGGC AATACAGGGT AT 337 GAGCCGTGGC CGGGCACGGT AT 338 ACAGGTTGGC CGTTCAGGGT AA 339 AAACCGAGGC CGGGACACTC AT 340 AAACCATTGC CGGTACGCTT AA""" 341 342 #From here: 343 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 344 phylip_text5=""" 5 42 345 Turkey AAGCTNGGGC ATTTCAGGGT 346 GAGCCCGGGC AATACAGGGT AT 347 Salmo gairAAGCCTTGGC AGTGCAGGGT 348 GAGCCGTGGC CGGGCACGGT AT 349 H. SapiensACCGGTTGGC CGTTCAGGGT 350 ACAGGTTGGC CGTTCAGGGT AA 351 Chimp AAACCCTTGC CGTTACGCTT 352 AAACCGAGGC CGGGACACTC AT 353 Gorilla AAACCCTTGC CGGTACGCTT 354 AAACCATTGC CGGTACGCTT AA""" 355 356 phylip_text5a=""" 5 42 357 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 358 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 359 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 360 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 361 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 362 363 handle = StringIO(phylip_text4) 364 list4 = list(PhylipIterator(handle)) 365 handle.close() 366 assert len(list4)==1 367 assert len(list4[0].get_all_seqs())==5 368 369 handle = StringIO(phylip_text5) 370 try: 371 list5 = list(PhylipIterator(handle)) 372 assert len(list5)==1 373 assert len(list5[0].get_all_seqs())==5 374 print "That should have failed..." 375 except ValueError: 376 print "Evil multiline non-interlaced example failed as expected" 377 handle.close() 378 379 handle = StringIO(phylip_text5a) 380 list5 = list(PhylipIterator(handle)) 381 handle.close() 382 assert len(list5)==1 383 assert len(list4[0].get_all_seqs())==5 384 385 print "Concatenation" 386 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 387 assert len(list(PhylipIterator(handle))) == 2 388 389 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 390 assert len(list(PhylipIterator(handle))) == 3 391 392 print "OK" 393 394 print "Checking write/read" 395 handle = StringIO() 396 PhylipWriter(handle).write_file(list5) 397 handle.seek(0) 398 list6 = list(PhylipIterator(handle)) 399 assert len(list5) == len(list6) 400 for a1,a2 in zip(list5, list6): 401 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs()) 402 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()): 403 assert r1.id == r2.id 404 assert r1.seq.tostring() == r2.seq.tostring() 405 print "Done" 406