001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.archivers.dump;
020
021import org.apache.commons.compress.archivers.ArchiveException;
022import org.apache.commons.compress.archivers.ArchiveInputStream;
023import org.apache.commons.compress.archivers.zip.ZipEncoding;
024import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
025
026import java.io.EOFException;
027import java.io.IOException;
028import java.io.InputStream;
029
030import java.util.Arrays;
031import java.util.Comparator;
032import java.util.HashMap;
033import java.util.Map;
034import java.util.PriorityQueue;
035import java.util.Queue;
036import java.util.Stack;
037
038/**
039 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream.
040 * Methods are provided to position at each successive entry in
041 * the archive, and the read each entry as a normal input stream
042 * using read().
043 *
044 * There doesn't seem to exist a hint on the encoding of string values
045 * in any piece documentation.  Given the main purpose of dump/restore
046 * is backing up a system it seems very likely the format uses the
047 * current default encoding of the system.
048 *
049 * @NotThreadSafe
050 */
051public class DumpArchiveInputStream extends ArchiveInputStream {
052    private DumpArchiveSummary summary;
053    private DumpArchiveEntry active;
054    private boolean isClosed;
055    private boolean hasHitEOF;
056    private long entrySize;
057    private long entryOffset;
058    private int readIdx;
059    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
060    private byte[] blockBuffer;
061    private int recordOffset;
062    private long filepos;
063    protected TapeInputStream raw;
064
065    // map of ino -> dirent entry. We can use this to reconstruct full paths.
066    private final Map<Integer, Dirent> names = new HashMap<Integer, Dirent>();
067
068    // map of ino -> (directory) entry when we're missing one or more elements in the path.
069    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<Integer, DumpArchiveEntry>();
070
071    // queue of (directory) entries where we now have the full path.
072    private Queue<DumpArchiveEntry> queue;
073
074    /**
075     * The encoding to use for filenames and labels.
076     */
077    private final ZipEncoding zipEncoding;
078
079    // the provided encoding (for unit tests)
080    final String encoding;
081
082    /**
083     * Constructor using the platform's default encoding for file
084     * names.
085     *
086     * @param is stream to read from
087     * @throws ArchiveException on error
088     */
089    public DumpArchiveInputStream(InputStream is) throws ArchiveException {
090        this(is, null);
091    }
092
093    /**
094     * Constructor.
095     *
096     * @param is stream to read from
097     * @param encoding the encoding to use for file names, use null
098     * for the platform's default encoding
099     * @since 1.6
100     * @throws ArchiveException on error
101     */
102    public DumpArchiveInputStream(InputStream is, String encoding)
103        throws ArchiveException {
104        this.raw = new TapeInputStream(is);
105        this.hasHitEOF = false;
106        this.encoding = encoding;
107        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
108
109        try {
110            // read header, verify it's a dump archive.
111            byte[] headerBytes = raw.readRecord();
112
113            if (!DumpArchiveUtil.verify(headerBytes)) {
114                throw new UnrecognizedFormatException();
115            }
116
117            // get summary information
118            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);
119
120            // reset buffer with actual block size.
121            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());
122
123            // allocate our read buffer.
124            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];
125
126            // skip past CLRI and BITS segments since we don't handle them yet.
127            readCLRI();
128            readBITS();
129        } catch (IOException ex) {
130            throw new ArchiveException(ex.getMessage(), ex);
131        }
132
133        // put in a dummy record for the root node.
134        Dirent root = new Dirent(2, 2, 4, ".");
135        names.put(2, root);
136
137        // use priority based on queue to ensure parent directories are
138        // released first.
139        queue = new PriorityQueue<DumpArchiveEntry>(10,
140                new Comparator<DumpArchiveEntry>() {
141                    public int compare(DumpArchiveEntry p, DumpArchiveEntry q) {
142                        if (p.getOriginalName() == null || q.getOriginalName() == null) {
143                            return Integer.MAX_VALUE;
144                        }
145
146                        return p.getOriginalName().compareTo(q.getOriginalName());
147                    }
148                });
149    }
150
151    @Deprecated
152    @Override
153    public int getCount() {
154        return (int) getBytesRead();
155    }
156
157    @Override
158    public long getBytesRead() {
159        return raw.getBytesRead();
160    }
161
162    /**
163     * Return the archive summary information.
164     * @return the summary
165     */
166    public DumpArchiveSummary getSummary() {
167        return summary;
168    }
169
170    /**
171     * Read CLRI (deleted inode) segment.
172     */
173    private void readCLRI() throws IOException {
174        byte[] buffer = raw.readRecord();
175
176        if (!DumpArchiveUtil.verify(buffer)) {
177            throw new InvalidFormatException();
178        }
179
180        active = DumpArchiveEntry.parse(buffer);
181
182        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
183            throw new InvalidFormatException();
184        }
185
186        // we don't do anything with this yet.
187        if (raw.skip(DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
188            == -1) {
189            throw new EOFException();
190        }
191        readIdx = active.getHeaderCount();
192    }
193
194    /**
195     * Read BITS segment.
196     */
197    private void readBITS() throws IOException {
198        byte[] buffer = raw.readRecord();
199
200        if (!DumpArchiveUtil.verify(buffer)) {
201            throw new InvalidFormatException();
202        }
203
204        active = DumpArchiveEntry.parse(buffer);
205
206        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
207            throw new InvalidFormatException();
208        }
209
210        // we don't do anything with this yet.
211        if (raw.skip(DumpArchiveConstants.TP_SIZE * active.getHeaderCount())
212            == -1) {
213            throw new EOFException();
214        }
215        readIdx = active.getHeaderCount();
216    }
217
218    /**
219     * Read the next entry.
220     * @return the next entry
221     * @throws IOException on error
222     */
223    public DumpArchiveEntry getNextDumpEntry() throws IOException {
224        return getNextEntry();
225    }
226
227    @Override
228    public DumpArchiveEntry getNextEntry() throws IOException {
229        DumpArchiveEntry entry = null;
230        String path = null;
231
232        // is there anything in the queue?
233        if (!queue.isEmpty()) {
234            return queue.remove();
235        }
236
237        while (entry == null) {
238            if (hasHitEOF) {
239                return null;
240            }
241
242            // skip any remaining records in this segment for prior file.
243            // we might still have holes... easiest to do it
244            // block by block. We may want to revisit this if
245            // the unnecessary decompression time adds up.
246            while (readIdx < active.getHeaderCount()) {
247                if (!active.isSparseRecord(readIdx++)
248                    && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
249                    throw new EOFException();
250                }
251            }
252
253            readIdx = 0;
254            filepos = raw.getBytesRead();
255
256            byte[] headerBytes = raw.readRecord();
257
258            if (!DumpArchiveUtil.verify(headerBytes)) {
259                throw new InvalidFormatException();
260            }
261
262            active = DumpArchiveEntry.parse(headerBytes);
263
264            // skip any remaining segments for prior file.
265            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
266                if (raw.skip(DumpArchiveConstants.TP_SIZE
267                             * (active.getHeaderCount()
268                                - active.getHeaderHoles())) == -1) {
269                    throw new EOFException();
270                }
271
272                filepos = raw.getBytesRead();
273                headerBytes = raw.readRecord();
274
275                if (!DumpArchiveUtil.verify(headerBytes)) {
276                    throw new InvalidFormatException();
277                }
278
279                active = DumpArchiveEntry.parse(headerBytes);
280            }
281
282            // check if this is an end-of-volume marker.
283            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
284                hasHitEOF = true;
285
286                return null;
287            }
288
289            entry = active;
290
291            if (entry.isDirectory()) {
292                readDirectoryEntry(active);
293
294                // now we create an empty InputStream.
295                entryOffset = 0;
296                entrySize = 0;
297                readIdx = active.getHeaderCount();
298            } else {
299                entryOffset = 0;
300                entrySize = active.getEntrySize();
301                readIdx = 0;
302            }
303
304            recordOffset = readBuf.length;
305
306            path = getPath(entry);
307
308            if (path == null) {
309                entry = null;
310            }
311        }
312
313        entry.setName(path);
314        entry.setSimpleName(names.get(entry.getIno()).getName());
315        entry.setOffset(filepos);
316
317        return entry;
318    }
319
320    /**
321     * Read directory entry.
322     */
323    private void readDirectoryEntry(DumpArchiveEntry entry)
324        throws IOException {
325        long size = entry.getEntrySize();
326        boolean first = true;
327
328        while (first ||
329                DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
330            // read the header that we just peeked at.
331            if (!first) {
332                raw.readRecord();
333            }
334
335            if (!names.containsKey(entry.getIno()) &&
336                    DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
337                pending.put(entry.getIno(), entry);
338            }
339
340            int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();
341
342            if (blockBuffer.length < datalen) {
343                blockBuffer = new byte[datalen];
344            }
345
346            if (raw.read(blockBuffer, 0, datalen) != datalen) {
347                throw new EOFException();
348            }
349
350            int reclen = 0;
351
352            for (int i = 0; i < datalen - 8 && i < size - 8;
353                    i += reclen) {
354                int ino = DumpArchiveUtil.convert32(blockBuffer, i);
355                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
356
357                byte type = blockBuffer[i + 6];
358
359                String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);
360
361                if (".".equals(name) || "..".equals(name)) {
362                    // do nothing...
363                    continue;
364                }
365
366                Dirent d = new Dirent(ino, entry.getIno(), type, name);
367
368                /*
369                if ((type == 4) && names.containsKey(ino)) {
370                    System.out.println("we already have ino: " +
371                                       names.get(ino));
372                }
373                */
374
375                names.put(ino, d);
376
377                // check whether this allows us to fill anything in the pending list.
378                for (Map.Entry<Integer, DumpArchiveEntry> e : pending.entrySet()) {
379                    String path = getPath(e.getValue());
380
381                    if (path != null) {
382                        e.getValue().setName(path);
383                        e.getValue()
384                         .setSimpleName(names.get(e.getKey()).getName());
385                        queue.add(e.getValue());
386                    }
387                }
388
389                // remove anything that we found. (We can't do it earlier
390                // because of concurrent modification exceptions.)
391                for (DumpArchiveEntry e : queue) {
392                    pending.remove(e.getIno());
393                }
394            }
395
396            byte[] peekBytes = raw.peek();
397
398            if (!DumpArchiveUtil.verify(peekBytes)) {
399                throw new InvalidFormatException();
400            }
401
402            entry = DumpArchiveEntry.parse(peekBytes);
403            first = false;
404            size -= DumpArchiveConstants.TP_SIZE;
405        }
406    }
407
408    /**
409     * Get full path for specified archive entry, or null if there's a gap.
410     *
411     * @param entry
412     * @return  full path for specified archive entry, or null if there's a gap.
413     */
414    private String getPath(DumpArchiveEntry entry) {
415        // build the stack of elements. It's possible that we're 
416        // still missing an intermediate value and if so we
417        Stack<String> elements = new Stack<String>();
418        Dirent dirent = null;
419
420        for (int i = entry.getIno();; i = dirent.getParentIno()) {
421            if (!names.containsKey(i)) {
422                elements.clear();
423                break;
424            }
425
426            dirent = names.get(i);
427            elements.push(dirent.getName());
428
429            if (dirent.getIno() == dirent.getParentIno()) {
430                break;
431            }
432        }
433
434        // if an element is missing defer the work and read next entry.
435        if (elements.isEmpty()) {
436            pending.put(entry.getIno(), entry);
437
438            return null;
439        }
440
441        // generate full path from stack of elements.
442        StringBuilder sb = new StringBuilder(elements.pop());
443
444        while (!elements.isEmpty()) {
445            sb.append('/');
446            sb.append(elements.pop());
447        }
448
449        return sb.toString();
450    }
451
452    /**
453     * Reads bytes from the current dump archive entry.
454     *
455     * This method is aware of the boundaries of the current
456     * entry in the archive and will deal with them as if they
457     * were this stream's start and EOF.
458     *
459     * @param buf The buffer into which to place bytes read.
460     * @param off The offset at which to place bytes read.
461     * @param len The number of bytes to read.
462     * @return The number of bytes read, or -1 at EOF.
463     * @throws IOException on error
464     */
465    @Override
466    public int read(byte[] buf, int off, int len) throws IOException {
467        int totalRead = 0;
468
469        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
470            return -1;
471        }
472
473        if (active == null) {
474            throw new IllegalStateException("No current dump entry");
475        }
476
477        if (len + entryOffset > entrySize) {
478            len = (int) (entrySize - entryOffset);
479        }
480
481        while (len > 0) {
482            int sz = len > readBuf.length - recordOffset
483                ? readBuf.length - recordOffset : len;
484
485            // copy any data we have
486            if (recordOffset + sz <= readBuf.length) {
487                System.arraycopy(readBuf, recordOffset, buf, off, sz);
488                totalRead += sz;
489                recordOffset += sz;
490                len -= sz;
491                off += sz;
492            }
493
494            // load next block if necessary.
495            if (len > 0) {
496                if (readIdx >= 512) {
497                    byte[] headerBytes = raw.readRecord();
498
499                    if (!DumpArchiveUtil.verify(headerBytes)) {
500                        throw new InvalidFormatException();
501                    }
502
503                    active = DumpArchiveEntry.parse(headerBytes);
504                    readIdx = 0;
505                }
506
507                if (!active.isSparseRecord(readIdx++)) {
508                    int r = raw.read(readBuf, 0, readBuf.length);
509                    if (r != readBuf.length) {
510                        throw new EOFException();
511                    }
512                } else {
513                    Arrays.fill(readBuf, (byte) 0);
514                }
515
516                recordOffset = 0;
517            }
518        }
519
520        entryOffset += totalRead;
521
522        return totalRead;
523    }
524
525    /**
526     * Closes the stream for this entry.
527     */
528    @Override
529    public void close() throws IOException {
530        if (!isClosed) {
531            isClosed = true;
532            raw.close();
533        }
534    }
535
536    /**
537     * Look at the first few bytes of the file to decide if it's a dump
538     * archive. With 32 bytes we can look at the magic value, with a full
539     * 1k we can verify the checksum.
540     * @param buffer data to match
541     * @param length length of data
542     * @return whether the buffer seems to contain dump data
543     */
544    public static boolean matches(byte[] buffer, int length) {
545        // do we have enough of the header?
546        if (length < 32) {
547            return false;
548        }
549
550        // this is the best test
551        if (length >= DumpArchiveConstants.TP_SIZE) {
552            return DumpArchiveUtil.verify(buffer);
553        }
554
555        // this will work in a pinch.
556        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer,
557            24);
558    }
559
560}