001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Collections;
025import java.util.Comparator;
026import java.util.List;
027
028import org.apache.commons.io.ByteOrderMark;
029
030/**
031 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
032 *
033 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
034 * first byte in the stream.
035 *
036 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
037 * <ul>
038 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
039 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
040 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
041 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
042 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
043 * </ul>
044 *
045 *
046 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
047 *
048 * <pre>
049 * BOMInputStream bomIn = new BOMInputStream(in);
050 * if (bomIn.hasBOM()) {
051 *     // has a UTF-8 BOM
052 * }
053 * </pre>
054 *
055 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
056 *
057 * <pre>
058 * boolean include = true;
059 * BOMInputStream bomIn = new BOMInputStream(in, include);
060 * if (bomIn.hasBOM()) {
061 *     // has a UTF-8 BOM
062 * }
063 * </pre>
064 *
065 * <h3>Example 3 - Detect Multiple BOMs</h3>
066 *
067 * <pre>
068 * BOMInputStream bomIn = new BOMInputStream(in,
069 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
070 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
071 *   );
072 * if (bomIn.hasBOM() == false) {
073 *     // No BOM found
074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
075 *     // has a UTF-16LE BOM
076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
077 *     // has a UTF-16BE BOM
078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
079 *     // has a UTF-32LE BOM
080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
081 *     // has a UTF-32BE BOM
082 * }
083 * </pre>
084 *
085 * @see org.apache.commons.io.ByteOrderMark
086 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
087 * @version $Id$
088 * @since 2.0
089 */
090public class BOMInputStream extends ProxyInputStream {
091    private final boolean include;
092    /**
093     * BOMs are sorted from longest to shortest.
094     */
095    private final List<ByteOrderMark> boms;
096    private ByteOrderMark byteOrderMark;
097    private int[] firstBytes;
098    private int fbLength;
099    private int fbIndex;
100    private int markFbIndex;
101    private boolean markedAtStart;
102
103    /**
104     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
105     *
106     * @param delegate
107     *            the InputStream to delegate to
108     */
109    public BOMInputStream(final InputStream delegate) {
110        this(delegate, false, ByteOrderMark.UTF_8);
111    }
112
113    /**
114     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
115     *
116     * @param delegate
117     *            the InputStream to delegate to
118     * @param include
119     *            true to include the UTF-8 BOM or false to exclude it
120     */
121    public BOMInputStream(final InputStream delegate, final boolean include) {
122        this(delegate, include, ByteOrderMark.UTF_8);
123    }
124
125    /**
126     * Constructs a new BOM InputStream that excludes the specified BOMs.
127     *
128     * @param delegate
129     *            the InputStream to delegate to
130     * @param boms
131     *            The BOMs to detect and exclude
132     */
133    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
134        this(delegate, false, boms);
135    }
136
137    /**
138     * Compares ByteOrderMark objects in descending length order.
139     */
140    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
141
142        @Override
143        public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
144            final int len1 = bom1.length();
145            final int len2 = bom2.length();
146            if (len1 > len2) {
147                return EOF;
148            }
149            if (len2 > len1) {
150                return 1;
151            }
152            return 0;
153        }
154    };
155
156    /**
157     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
158     *
159     * @param delegate
160     *            the InputStream to delegate to
161     * @param include
162     *            true to include the specified BOMs or false to exclude them
163     * @param boms
164     *            The BOMs to detect and optionally exclude
165     */
166    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
167        super(delegate);
168        if (boms == null || boms.length == 0) {
169            throw new IllegalArgumentException("No BOMs specified");
170        }
171        this.include = include;
172        final List<ByteOrderMark> list = Arrays.asList(boms);
173        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
174        Collections.sort(list, ByteOrderMarkLengthComparator);
175        this.boms = list;
176
177    }
178
179    /**
180     * Indicates whether the stream contains one of the specified BOMs.
181     *
182     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
183     * @throws IOException
184     *             if an error reading the first bytes of the stream occurs
185     */
186    public boolean hasBOM() throws IOException {
187        return getBOM() != null;
188    }
189
190    /**
191     * Indicates whether the stream contains the specified BOM.
192     *
193     * @param bom
194     *            The BOM to check for
195     * @return true if the stream has the specified BOM, otherwise false if it does not
196     * @throws IllegalArgumentException
197     *             if the BOM is not one the stream is configured to detect
198     * @throws IOException
199     *             if an error reading the first bytes of the stream occurs
200     */
201    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
202        if (!boms.contains(bom)) {
203            throw new IllegalArgumentException("Stream not configure to detect " + bom);
204        }
205        getBOM();
206        return byteOrderMark != null && byteOrderMark.equals(bom);
207    }
208
209    /**
210     * Return the BOM (Byte Order Mark).
211     *
212     * @return The BOM or null if none
213     * @throws IOException
214     *             if an error reading the first bytes of the stream occurs
215     */
216    public ByteOrderMark getBOM() throws IOException {
217        if (firstBytes == null) {
218            fbLength = 0;
219            // BOMs are sorted from longest to shortest
220            final int maxBomSize = boms.get(0).length();
221            firstBytes = new int[maxBomSize];
222            // Read first maxBomSize bytes
223            for (int i = 0; i < firstBytes.length; i++) {
224                firstBytes[i] = in.read();
225                fbLength++;
226                if (firstBytes[i] < 0) {
227                    break;
228                }
229            }
230            // match BOM in firstBytes
231            byteOrderMark = find();
232            if (byteOrderMark != null) {
233                if (!include) {
234                    if (byteOrderMark.length() < firstBytes.length) {
235                        fbIndex = byteOrderMark.length();
236                    } else {
237                        fbLength = 0;
238                    }
239                }
240            }
241        }
242        return byteOrderMark;
243    }
244
245    /**
246     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
247     *
248     * @return The BOM charset Name or null if no BOM found
249     * @throws IOException
250     *             if an error reading the first bytes of the stream occurs
251     *
252     */
253    public String getBOMCharsetName() throws IOException {
254        getBOM();
255        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
256    }
257
258    /**
259     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
260     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
261     * processed already.
262     *
263     * @return the byte read (excluding BOM) or -1 if the end of stream
264     * @throws IOException
265     *             if an I/O error occurs
266     */
267    private int readFirstBytes() throws IOException {
268        getBOM();
269        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
270    }
271
272    /**
273     * Find a BOM with the specified bytes.
274     *
275     * @return The matched BOM or null if none matched
276     */
277    private ByteOrderMark find() {
278        for (final ByteOrderMark bom : boms) {
279            if (matches(bom)) {
280                return bom;
281            }
282        }
283        return null;
284    }
285
286    /**
287     * Check if the bytes match a BOM.
288     *
289     * @param bom
290     *            The BOM
291     * @return true if the bytes match the bom, otherwise false
292     */
293    private boolean matches(final ByteOrderMark bom) {
294        // if (bom.length() != fbLength) {
295        // return false;
296        // }
297        // firstBytes may be bigger than the BOM bytes
298        for (int i = 0; i < bom.length(); i++) {
299            if (bom.get(i) != firstBytes[i]) {
300                return false;
301            }
302        }
303        return true;
304    }
305
306    // ----------------------------------------------------------------------------
307    // Implementation of InputStream
308    // ----------------------------------------------------------------------------
309
310    /**
311     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
312     *
313     * @return the byte read (excluding BOM) or -1 if the end of stream
314     * @throws IOException
315     *             if an I/O error occurs
316     */
317    @Override
318    public int read() throws IOException {
319        final int b = readFirstBytes();
320        return b >= 0 ? b : in.read();
321    }
322
323    /**
324     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
325     *
326     * @param buf
327     *            the buffer to read the bytes into
328     * @param off
329     *            The start offset
330     * @param len
331     *            The number of bytes to read (excluding BOM)
332     * @return the number of bytes read or -1 if the end of stream
333     * @throws IOException
334     *             if an I/O error occurs
335     */
336    @Override
337    public int read(final byte[] buf, int off, int len) throws IOException {
338        int firstCount = 0;
339        int b = 0;
340        while (len > 0 && b >= 0) {
341            b = readFirstBytes();
342            if (b >= 0) {
343                buf[off++] = (byte) (b & 0xFF);
344                len--;
345                firstCount++;
346            }
347        }
348        final int secondCount = in.read(buf, off, len);
349        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
350    }
351
352    /**
353     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
354     *
355     * @param buf
356     *            the buffer to read the bytes into
357     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
358     * @throws IOException
359     *             if an I/O error occurs
360     */
361    @Override
362    public int read(final byte[] buf) throws IOException {
363        return read(buf, 0, buf.length);
364    }
365
366    /**
367     * Invokes the delegate's <code>mark(int)</code> method.
368     *
369     * @param readlimit
370     *            read ahead limit
371     */
372    @Override
373    public synchronized void mark(final int readlimit) {
374        markFbIndex = fbIndex;
375        markedAtStart = firstBytes == null;
376        in.mark(readlimit);
377    }
378
379    /**
380     * Invokes the delegate's <code>reset()</code> method.
381     *
382     * @throws IOException
383     *             if an I/O error occurs
384     */
385    @Override
386    public synchronized void reset() throws IOException {
387        fbIndex = markFbIndex;
388        if (markedAtStart) {
389            firstBytes = null;
390        }
391
392        in.reset();
393    }
394
395    /**
396     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM.
397     *
398     * @param n
399     *            the number of bytes to skip
400     * @return the number of bytes to skipped or -1 if the end of stream
401     * @throws IOException
402     *             if an I/O error occurs
403     */
404    @Override
405    public long skip(final long n) throws IOException {
406        int skipped = 0;
407        while ((n > skipped) && (readFirstBytes() >= 0)) {
408            skipped++;
409        }
410        return in.skip(n - skipped) + skipped;
411    }
412}