1 files changed, 411 insertions, 0 deletions
diff --git a/velocity-engine-core/src/main/java/org/apache/velocity/io/UnicodeInputStream.java b/velocity-engine-core/src/main/java/org/apache/velocity/io/UnicodeInputStream.java
new file mode 100644
index 00000000..43abfef5
--- /dev/null
+++ b/velocity-engine-core/src/main/java/org/apache/velocity/io/UnicodeInputStream.java
@@ -0,0 +1,411 @@
+package org.apache.velocity.io;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import java.util.Locale;
+
+/**
+ * This is an input stream that is unicode BOM aware. This allows you to e.g. read
+ * Windows Notepad Unicode files as Velocity templates.
+ *
+ * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
+ * the input stream reader.
+ *
+ * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
+ * the caller must provide synchronization.
+ *
+ * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
+ * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
+ * @version $Id$
+ * @since 1.5
+ */
+public class UnicodeInputStream
+    extends InputStream
+{
+
+    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
+    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });
+
+    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
+    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });
+
+    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
+    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });
+
+    /**
+     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
+     *
+     */
+    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });
+
+    /**
+     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
+     *
+     */
+    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });
+
+    /** The maximum amount of bytes to read for a BOM */
+    private static final int MAX_BOM_SIZE = 4;
+
+    /** Buffer for BOM reading */
+    private byte [] buf = new byte[MAX_BOM_SIZE];
+
+    /** Buffer pointer. */
+    private int pos = 0;
+
+    /** The stream encoding as read from the BOM or null. */
+    private final String encoding;
+
+    /** True if the BOM itself should be skipped and not read. */
+    private final boolean skipBOM;
+
+    private final PushbackInputStream inputStream;
+
+    /**
+     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
+     *
+     * @param  inputStream The input stream to use for reading.
+     * @throws IllegalStateException
+     * @throws IOException
+     */
+    public UnicodeInputStream(final InputStream inputStream)
+            throws IllegalStateException, IOException
+    {
+        this(inputStream, true);
+    }
+
+    /**
+     * Creates a new UnicodeInputStream object.
+     *
+     * @param  inputStream The input stream to use for reading.
+     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
+     * @throws IllegalStateException
+     * @throws IOException
+     */
+    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
+            throws IllegalStateException, IOException
+    {
+        super();
+
+        this.skipBOM = skipBOM;
+        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);
+
+        try
+        {
+            this.encoding = readEncoding();
+        }
+        catch (IOException ioe)
+        {
+            throw new IllegalStateException("Could not read BOM from Stream", ioe);
+        }
+    }
+
+    /**
+     * Returns true if the input stream discards the BOM.
+     *
+     * @return  True if the input stream discards the BOM.
+     */
+    public boolean isSkipBOM()
+    {
+        return skipBOM;
+    }
+
+    /**
+     * Read encoding based on BOM.
+     *
+     * @return  The encoding based on the BOM.
+     *
+     * @throws  IllegalStateException  When a problem reading the BOM occured.
+     */
+    public String getEncodingFromStream()
+    {
+        return encoding;
+    }
+
+    /**
+     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
+     * is undefined.
+     *
+     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
+     * @throws IOException
+     */
+    protected String readEncoding()
+        throws IOException
+    {
+        pos = 0;
+
+        UnicodeBOM encoding = null;
+
+        // read first byte.
+        if (readByte())
+        {
+            // Build a list of matches
+            //
+            // 00 00 FE FF --> UTF 32 BE
+            // EF BB BF    --> UTF 8
+            // FE FF       --> UTF 16 BE
+            // FF FE       --> UTF 16 LE
+            // FF FE 00 00 --> UTF 32 LE
+
+            switch (buf[0])
+            {
+            case (byte)0x00: // UTF32 BE
+                encoding = match(UTF32BE_BOM, null);
+                break;
+            case (byte)0xef: // UTF8
+                encoding = match(UTF8_BOM, null);
+                break;
+            case (byte)0xfe: // UTF16 BE
+                encoding = match(UTF16BE_BOM, null);
+                break;
+            case (byte)0xff: // UTF16/32 LE
+                encoding = match(UTF16LE_BOM, null);
+
+                if (encoding != null)
+                {
+                    encoding = match(UTF32LE_BOM, encoding);
+                }
+                break;
+
+            default:
+                encoding = null;
+                break;
+            }
+        }
+
+        pushback(encoding);
+
+        return (encoding != null) ? encoding.getEncoding() : null;
+    }
+
+    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
+        throws IOException
+    {
+        byte [] bom = matchEncoding.getBytes();
+
+        for (int i = 0; i < bom.length; i++)
+        {
+            if (pos <= i) // Byte has not yet been read
+            {
+                if (!readByte())
+                {
+                    return noMatchEncoding;
+                }
+            }
+
+            if (bom[i] != buf[i])
+            {
+                return noMatchEncoding;
+            }
+        }
+
+        return matchEncoding;
+    }
+
+    private boolean readByte()
+            throws IOException
+    {
+        int res = inputStream.read();
+        if (res == -1)
+        {
+            return false;
+        }
+
+        if (pos >= buf.length)
+        {
+            throw new IOException("BOM read error");
+        }
+
+        buf[pos++] = (byte) res;
+        return true;
+    }
+
+    private void pushback(final UnicodeBOM matchBOM)
+        throws IOException
+    {
+        int count = pos; // By default, all bytes are pushed back.
+        int start = 0;
+
+        if (matchBOM != null && skipBOM)
+        {
+            // We have a match (some bytes are part of the BOM)
+            // and we want to skip the BOM. Push back only the bytes
+            // after the BOM.
+            start = matchBOM.getBytes().length;
+            count = (pos - start);
+
+            if (count < 0)
+            {
+                throw new IllegalStateException("Match has more bytes than available!");
+            }
+        }
+
+        inputStream.unread(buf, start, count);
+    }
+
+    /**
+     * @throws IOException
+     * @see java.io.InputStream#close()
+     */
+    @Override
+    public void close()
+        throws IOException
+    {
+        inputStream.close();
+    }
+
+    /**
+     * @throws IOException
+     * @see java.io.InputStream#available()
+     */
+    @Override
+    public int available()
+        throws IOException
+    {
+        return inputStream.available();
+    }
+
+    /**
+     * @param readlimit
+     * @see java.io.InputStream#mark(int)
+     */
+    @Override
+    public void mark(final int readlimit)
+    {
+        inputStream.mark(readlimit);
+    }
+
+    /**
+     * @return mark supported
+     * @see java.io.InputStream#markSupported()
+     */
+    @Override
+    public boolean markSupported()
+    {
+        return inputStream.markSupported();
+    }
+
+    /**
+     * @return read char
+     * @see java.io.InputStream#read()
+     */
+    @Override
+    public int read()
+        throws IOException
+    {
+        return inputStream.read();
+    }
+
+    /**
+     * @param b buffer
+     * @return read chars count
+     * @see java.io.InputStream#read(byte[])
+     */
+    @Override
+    public int read(final byte [] b)
+        throws IOException
+    {
+        return inputStream.read(b);
+    }
+
+    /**
+     * @param b buffer
+     * @param off offset
+     * @param len length
+     * @return reac char
+     * @see java.io.InputStream#read(byte[], int, int)
+     */
+    @Override
+    public int read(final byte [] b, final int off, final int len)
+        throws IOException
+    {
+        return inputStream.read(b, off, len);
+    }
+
+    /**
+     * @see java.io.InputStream#reset()
+     */
+    @Override
+    public void reset()
+        throws IOException
+    {
+        inputStream.reset();
+    }
+
+    /**
+     * @param n
+     * @return skipped count
+     * @see java.io.InputStream#skip(long)
+     */
+    @Override
+    public long skip(final long n)
+        throws IOException
+    {
+        return inputStream.skip(n);
+    }
+
+
+    /**
+     * Helper function to compare encodings
+     * @param left
+     * @param right
+     * @return true for same encoding
+     */
+    public static boolean sameEncoding(String left, String right)
+    {
+        left = left.toUpperCase(Locale.ROOT).replace("-", "").replace("_","");
+        right = right.toUpperCase(Locale.ROOT).replace("-", "").replace("_","");
+        return left.equals(right);
+    }
+
+    /**
+     * Helper class to bundle encoding and BOM marker.
+     *
+     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
+     * @version $Id$
+     */
+    static final class UnicodeBOM
+    {
+        private final String encoding;
+
+        private final byte [] bytes;
+
+        private UnicodeBOM(final String encoding, final byte [] bytes)
+        {
+            this.encoding = encoding;
+            this.bytes = bytes;
+        }
+
+        String getEncoding()
+        {
+            return encoding;
+        }
+
+        byte [] getBytes()
+        {
+            return bytes;
+        }
+    }
+}