Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  import re 
 27  from Bio import SeqIO 
 28   
29 -class _IndexedSeqFileDict(dict):
30 """Read only dictionary interface to a sequential sequence file. 31 32 Keeps the keys in memory, reads the file to access entries as 33 SeqRecord objects using Bio.SeqIO for parsing them. This approach 34 is memory limited, but will work even with millions of sequences. 35 36 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys 37 (record identifiers by default) are not allowed. If this happens, 38 a ValueError exception is raised. 39 40 By default the SeqRecord's id string is used as the dictionary 41 key. This can be changed by suppling an optional key_function, 42 a callback function which will be given the record id and must 43 return the desired key. For example, this allows you to parse 44 NCBI style FASTA identifiers, and extract the GI number to use 45 as the dictionary key. 46 47 Note that this dictionary is essentially read only. You cannot 48 add or change values, pop values, nor clear the dictionary. 49 """
50 - def __init__(self, filename, alphabet, key_function, mode="rU"):
51 #Use key_function=None for default value 52 dict.__init__(self) #init as empty dict! 53 self._handle = open(filename, mode) 54 self._alphabet = alphabet 55 self._format = "" 56 self._key_function = key_function
57 #Now scan it in a subclassed method, and set the format! 58
59 - def __repr__(self):
60 return "SeqIO.index('%s', '%s', alphabet=%s, key_function=%s)" \ 61 % (self._handle.name, self._format, 62 repr(self._alphabet), self._key_function)
63
64 - def __str__(self):
65 if self: 66 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0]) 67 else: 68 return "{}"
69
70 - def _record_key(self, identifier, seek_position):
71 """Used by subclasses to record file offsets for identifiers (PRIVATE). 72 73 This will apply the key_function (if given) to map the record id 74 string to the desired key. 75 76 This will raise a ValueError if a key (record id string) occurs 77 more than once. 78 """ 79 if self._key_function: 80 key = self._key_function(identifier) 81 else: 82 key = identifier 83 if key in self: 84 raise ValueError("Duplicate key '%s'" % key) 85 else: 86 dict.__setitem__(self, key, seek_position)
87
88 - def values(self):
89 """Would be a list of the SeqRecord objects, but not implemented. 90 91 In general you can be indexing very very large files, with millions 92 of sequences. Loading all these into memory at once as SeqRecord 93 objects would (probably) use up all the RAM. Therefore we simply 94 don't support this dictionary method. 95 """ 96 raise NotImplementedError("Due to memory concerns, when indexing a " 97 "sequence file you cannot access all the " 98 "records at once.")
99
100 - def items(self):
101 """Would be a list of the (key, SeqRecord) tuples, but not implemented. 102 103 In general you can be indexing very very large files, with millions 104 of sequences. Loading all these into memory at once as SeqRecord 105 objects would (probably) use up all the RAM. Therefore we simply 106 don't support this dictionary method. 107 """ 108 raise NotImplementedError("Due to memory concerns, when indexing a " 109 "sequence file you cannot access all the " 110 "records at once.")
111
112 - def iteritems(self):
113 """Iterate over the (key, SeqRecord) items.""" 114 for key in self.__iter__(): 115 yield key, self.__getitem__(key)
116
117 - def __getitem__(self, key):
118 """x.__getitem__(y) <==> x[y]""" 119 #For non-trivial file formats this must be over-ridden in the subclass 120 handle = self._handle 121 handle.seek(dict.__getitem__(self, key)) 122 record = SeqIO.parse(handle, self._format, self._alphabet).next() 123 if self._key_function: 124 assert self._key_function(record.id) == key, \ 125 "Requested key %s, found record.id %s which has key %s" \ 126 % (repr(key), repr(record.id), 127 repr(self._key_function(record.id))) 128 else: 129 assert record.id == key, \ 130 "Requested key %s, found record.id %s" \ 131 % (repr(key), repr(record.id)) 132 return record
133
134 - def get(self, k, d=None):
135 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 136 try: 137 return self.__getitem__(k) 138 except KeyError: 139 return d
140
141 - def __setitem__(self, key, value):
142 """Would allow setting or replacing records, but not implemented.""" 143 raise NotImplementedError("An indexed a sequence file is read only.")
144
145 - def update(self, **kwargs):
146 """Would allow adding more values, but not implemented.""" 147 raise NotImplementedError("An indexed a sequence file is read only.")
148 149
150 - def pop(self, key, default=None):
151 """Would remove specified record, but not implemented.""" 152 raise NotImplementedError("An indexed a sequence file is read only.")
153
154 - def popitem(self):
155 """Would remove and return a SeqRecord, but not implemented.""" 156 raise NotImplementedError("An indexed a sequence file is read only.")
157 158
159 - def clear(self):
160 """Would clear dictionary, but not implemented.""" 161 raise NotImplementedError("An indexed a sequence file is read only.")
162
163 - def fromkeys(self, keys, value=None):
164 """A dictionary method which we don't implement.""" 165 raise NotImplementedError("An indexed a sequence file doesn't " 166 "support this.")
167
168 - def copy(self):
169 """A dictionary method which we don't implement.""" 170 raise NotImplementedError("An indexed a sequence file doesn't " 171 "support this.")
172 173 174 175 #################### 176 # Special indexers # 177 #################### 178 179 # Anything where the records cannot be read simply by parsing from 180 # the record start. For example, anything requiring information from 181 # a file header - e.g. SFF files where we would need to know the 182 # number of flows. 183 184 ################### 185 # Simple indexers # 186 ################### 187
188 -class _SequentialSeqFileDict(_IndexedSeqFileDict):
189 """Subclass for easy cases (PRIVATE)."""
190 - def __init__(self, filename, alphabet, key_function, format, marker):
191 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 192 self._format = format 193 handle = self._handle 194 marker_re = re.compile("^%s" % marker) 195 marker_offset = len(marker) 196 while True: 197 offset = handle.tell() 198 line = handle.readline() 199 if not line : break #End of file 200 if marker_re.match(line): 201 #Here we can assume the record.id is the first word after the 202 #marker. This is generally fine... but not for GenBank, EMBL, Swiss 203 self._record_key(line[marker_offset:].strip().split(None,1)[0], offset)
204
205 -class FastaDict(_SequentialSeqFileDict):
206 """Indexed dictionary like access to a FASTA file."""
207 - def __init__(self, filename, alphabet, key_function):
208 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 209 "fasta", ">")
210
211 -class QualDict(_SequentialSeqFileDict):
212 """Indexed dictionary like access to a QUAL file."""
213 - def __init__(self, filename, alphabet, key_function):
214 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 215 "qual", ">")
216
217 -class PirDict(_SequentialSeqFileDict):
218 """Indexed dictionary like access to a PIR/NBRF file."""
219 - def __init__(self, filename, alphabet, key_function):
220 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 221 "pir", ">..;")
222
223 -class PhdDict(_SequentialSeqFileDict):
224 """Indexed dictionary like access to a PHD (PHRED) file."""
225 - def __init__(self, filename, alphabet, key_function):
226 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 227 "phd", "BEGIN_SEQUENCE")
228
229 -class AceDict(_SequentialSeqFileDict):
230 """Indexed dictionary like access to an ACE file."""
231 - def __init__(self, filename, alphabet, key_function):
232 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 233 "ace", "CO ")
234 235 236 ####################################### 237 # Fiddly indexers: GenBank, EMBL, ... # 238 ####################################### 239
240 -class GenBankDict(_IndexedSeqFileDict):
241 """Indexed dictionary like access to a GenBank file."""
242 - def __init__(self, filename, alphabet, key_function):
243 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 244 self._format = "genbank" 245 handle = self._handle 246 marker_re = re.compile("^LOCUS ") 247 while True: 248 offset = handle.tell() 249 line = handle.readline() 250 if not line : break #End of file 251 if marker_re.match(line): 252 #We cannot assume the record.id is the first word after LOCUS, 253 #normally the first entry on the VERSION or ACCESSION line is used. 254 key = None 255 done = False 256 while not done: 257 line = handle.readline() 258 if line.startswith("ACCESSION "): 259 key = line.rstrip().split()[1] 260 elif line.startswith("VERSION "): 261 version_id = line.rstrip().split()[1] 262 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 263 #This should mimics the GenBank parser... 264 key = version_id 265 done = True 266 break 267 elif line.startswith("FEATURES ") \ 268 or line.startswith("ORIGIN ") \ 269 or line.startswith("//") \ 270 or marker_re.match(line) \ 271 or not line: 272 done = True 273 break 274 if not key: 275 raise ValueError("Did not find ACCESSION/VERSION lines") 276 self._record_key(key, offset)
277
278 -class EmblDict(_IndexedSeqFileDict):
279 """Indexed dictionary like access to an EMBL file."""
280 - def __init__(self, filename, alphabet, key_function):
281 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 282 self._format = "embl" 283 handle = self._handle 284 marker_re = re.compile("^ID ") 285 while True: 286 offset = handle.tell() 287 line = handle.readline() 288 if not line : break #End of file 289 if marker_re.match(line): 290 #We cannot assume the record.id is the first word after ID, 291 #normally the SV line is used. 292 parts = line[3:].rstrip().split(";") 293 if parts[1].strip().startswith("SV "): 294 #The SV bit gives the version 295 key = "%s.%s" % (parts[0].strip(),parts[1].strip().split()[1]) 296 else: 297 key = parts[0].strip() 298 while True: 299 line = handle.readline() 300 if line.startswith("SV "): 301 key = line.rstrip().split()[1] 302 break 303 elif line.startswith("FH ") \ 304 or line.startswith("FT ") \ 305 or line.startswith("SQ ") \ 306 or line.startswith("//") \ 307 or marker_re.match(line) \ 308 or not line: 309 break 310 self._record_key(key, offset)
311
312 -class SwissDict(_IndexedSeqFileDict):
313 """Indexed dictionary like access to a SwissProt file."""
314 - def __init__(self, filename, alphabet, key_function):
315 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 316 self._format = "swiss" 317 handle = self._handle 318 marker_re = re.compile("^ID ") 319 while True: 320 offset = handle.tell() 321 line = handle.readline() 322 if not line : break #End of file 323 if marker_re.match(line): 324 #We cannot assume the record.id is the first word after ID, 325 #normally the following AC line is used. 326 line = handle.readline() 327 assert line.startswith("AC ") 328 key = line[3:].strip().split(";")[0].strip() 329 self._record_key(key, offset)
330
331 -class IntelliGeneticsDict(_IndexedSeqFileDict):
332 """Indexed dictionary like access to a IntelliGenetics file."""
333 - def __init__(self, filename, alphabet, key_function):
334 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 335 self._format = "ig" 336 handle = self._handle 337 marker_re = re.compile("^;") 338 while True: 339 offset = handle.tell() 340 line = handle.readline() 341 if not line : break #End of file 342 if marker_re.match(line): 343 #Now look for the first line which doesn't start ";" 344 while True: 345 line = handle.readline() 346 if not line: 347 raise ValueError("Premature end of file?") 348 if line[0] != ";" and line.strip(): 349 key = line.split()[0] 350 self._record_key(key, offset) 351 break
352
353 -class TabDict(_IndexedSeqFileDict):
354 """Indexed dictionary like access to a simple tabbed file."""
355 - def __init__(self, filename, alphabet, key_function):
356 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 357 self._format = "tab" 358 handle = self._handle 359 while True: 360 offset = handle.tell() 361 line = handle.readline() 362 if not line : break #End of file 363 try: 364 key, rest = line.split("\t") 365 except ValueError, err: 366 if not line.strip(): 367 #Ignore blank lines 368 continue 369 else: 370 raise err 371 else: 372 self._record_key(key, offset)
373 374 ########################## 375 # Now the FASTQ indexers # 376 ########################## 377
378 -class _FastqSeqFileDict(_IndexedSeqFileDict):
379 """Subclass for easy cases (PRIVATE). 380 381 With FASTQ the records all start with a "@" line, but so too can some 382 quality lines. Note this will cope with line-wrapped FASTQ files. 383 """
384 - def __init__(self, filename, alphabet, key_function, fastq_format):
385 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 386 self._format = fastq_format 387 handle = self._handle 388 pos = handle.tell() 389 line = handle.readline() 390 if not line: 391 #Empty file! 392 return 393 if line[0] != "@": 394 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 395 while line: 396 #assert line[0]=="@" 397 #This record seems OK (so far) 398 self._record_key(line[1:].rstrip().split(None,1)[0],pos) 399 #Find the seq line(s) 400 seq_len = 0 401 while line: 402 line = handle.readline() 403 if line.startswith("+") : break 404 seq_len += len(line.strip()) 405 if not line: 406 raise ValueError("Premature end of file in seq section") 407 #assert line[0]=="+" 408 #Find the qual line(s) 409 qual_len = 0 410 while line: 411 if seq_len == qual_len: 412 #Should be end of record... 413 pos = handle.tell() 414 line = handle.readline() 415 if line and line[0]!="@": 416 ValueError("Problem with line %s" % repr(line)) 417 break 418 else: 419 line = handle.readline() 420 qual_len += len(line.strip()) 421 if seq_len != qual_len: 422 raise ValueError("Problem with quality section")
423 #print "EOF" 424
425 -class FastqSangerDict(_FastqSeqFileDict):
426 """Indexed dictionary like access to a standard Sanger FASTQ file."""
427 - def __init__(self, filename, alphabet, key_function):
428 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 429 "fastq-sanger")
430
431 -class FastqSolexaDict(_FastqSeqFileDict):
432 """Indexed dictionary like access to a Solexa (or early Illumina) FASTQ file."""
433 - def __init__(self, filename, alphabet, key_function):
434 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 435 "fastq-solexa")
436
437 -class FastqIlluminaDict(_FastqSeqFileDict):
438 """Indexed dictionary like access to a Illumina 1.3+ FASTQ file."""
439 - def __init__(self, filename, alphabet, key_function):
440 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 441 "fastq-illumina")
442 443 ############################################################################### 444 445 _FormatToIndexedDict = {"ace" : AceDict, 446 "embl" : EmblDict, 447 "fasta" : FastaDict, 448 "fastq" : FastqSangerDict, 449 "fastq-sanger" : FastqSangerDict, #alias of the above 450 "fastq-solexa" : FastqSolexaDict, 451 "fastq-illumina" : FastqIlluminaDict, 452 "genbank" : GenBankDict, 453 "gb" : GenBankDict, #alias of the above 454 "ig" : IntelliGeneticsDict, 455 "phd" : PhdDict, 456 "pir" : PirDict, 457 "swiss" : SwissDict, 458 "tab" : TabDict, 459 "qual" : QualDict 460 } 461