v1/src/common/org/apache/xmlbeans/impl/common/SniffedXmlInputStream.java - xmlbeans - Git at Google

 /*   Copyright 2004 The Apache Software Foundation
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
  *   You may obtain a copy of the License at
  *
  *       http://www.apache.org/licenses/LICENSE-2.0
  *
  *   Unless required by applicable law or agreed to in writing, software
  *   distributed under the License is distributed on an "AS IS" BASIS,
  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *   See the License for the specific language governing permissions and
  *  limitations under the License.
  */

 package org.apache.xmlbeans.impl.common;

 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.ByteArrayInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.Charset;

 /**
  * A buffered input stream that, while it is being constructed, peeks at the
  * first bytes of the wrapped stream to guess the character encoding of the
  * XML document it carries.
  * <p>
  * All peeking is done between {@code mark} and {@code reset}, so the bytes
  * that were examined (including any byte order mark) are still returned by
  * later reads. The guess is exposed through {@link #getXmlEncoding()}.
  */
 public class SniffedXmlInputStream extends BufferedInputStream
 {
     // We don't sniff more than 192 bytes.
     public static int MAX_SNIFFED_BYTES = 192;

     // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
     // with the common charsets.

     private static final Charset dummy1 = Charset.forName("UTF-8");
     private static final Charset dummy2 = Charset.forName("UTF-16");
     private static final Charset dummy3 = Charset.forName("UTF-16BE");
     private static final Charset dummy4 = Charset.forName("UTF-16LE");
     private static final Charset dummy5 = Charset.forName("ISO-8859-1");
     private static final Charset dummy6 = Charset.forName("US-ASCII");
     private static final Charset dummy7 = Charset.forName("Cp1252");

     // Characters skipped between the tokens of a pseudo-attribute.
     private static final char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };

     // Characters that terminate a pseudo-attribute name.
     private static final char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' };

     // The encoding name chosen by the constructor; never null once constructed.
     private final String _encoding;

     /**
      * Wraps {@code stream} and sniffs its encoding.
      * <p>
      * The order of checks is: the first four bytes (byte order marks and
      * well-known byte patterns), then an {@code encoding} pseudo-attribute in
      * an XML declaration, and finally a fallback to UTF-8.
      *
      * @param stream the stream to wrap
      * @throws IOException if reading or resetting the stream fails
      */
     public SniffedXmlInputStream(InputStream stream) throws IOException
     {
         super(stream);

         // read byte order marks and detect EBCDIC etc
         String encoding = sniffFourBytes();

         if ("IBM037".equals(encoding))
         {
             // First four bytes suggest EBCDIC with <?xm at start; the
             // declaration may name a more specific encoding.
             String declared = sniffForXmlDecl(encoding);
             if (declared != null)
             {
                 encoding = declared;
             }
         }

         if (encoding == null)
         {
             // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
             // assuming we can read it as UTF-8.
             encoding = sniffForXmlDecl("UTF-8");
         }

         if (encoding == null)
         {
             // The XML spec says these two things:

             // (1) "In the absence of external character encoding information
             // (such as MIME headers), parsed entities which are stored in an
             // encoding other than UTF-8 or UTF-16 must begin with a text
             // declaration (see 4.3.1 The Text Declaration) containing an
             // encoding declaration:"

             // (2) "In the absence of information provided by an external
             // transport protocol (e.g. HTTP or MIME), it is an error
             // for an entity including an encoding declaration to be
             // presented to the XML processor in an encoding other than
             // that named in the declaration, or for an entity which begins
             // with neither a Byte Order Mark nor an encoding declaration
             // to use an encoding other than UTF-8."

             // Since we're using a sniffed stream, we do not have external
             // character encoding information.

             // Since we're here, we also don't have a recognized byte order
             // mark or an explicit encoding declaration that can be read in
             // either ASCII or EBCDIC style.

             // Therefore, we must use UTF-8.

             encoding = "UTF-8";
         }

         _encoding = encoding;
     }

     /**
      * Returns the encoding name that was determined when this stream was
      * constructed.
      *
      * @return the sniffed encoding name
      */
     public String getXmlEncoding()
     {
         return _encoding;
     }

     /**
      * Reads from this stream until {@code len} bytes have been stored or the
      * end of the stream is reached.
      *
      * @param buf     destination buffer
      * @param startAt index in {@code buf} of the first byte to store
      * @param len     maximum number of bytes to read
      * @return the number of bytes actually stored, possibly fewer than
      *         {@code len} if the stream ended
      * @throws IOException if the underlying read fails
      */
     private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
     {
         int total = 0;
         while (total < len)
         {
             int count = read(buf, startAt + total, len - total);
             if (count < 0)
             {
                 break;
             }
             total += count;
         }
         return total;
     }

     /**
      * Peeks at the first four bytes of the stream and maps byte order marks
      * and well-known leading byte patterns to an encoding name. The stream
      * position is restored before returning.
      *
      * @return an encoding name, or null if fewer than four bytes are
      *         available or the bytes match no known pattern (which includes
      *         the ASCII pattern {@code <?xm}, left for declaration sniffing)
      * @throws IOException if reading or resetting the stream fails
      */
     private String sniffFourBytes() throws IOException
     {
         mark(4);
         try
         {
             byte[] buf = new byte[4];
             if (readAsMuchAsPossible(buf, 0, 4) < 4)
             {
                 return null;
             }

             // Mask every byte before shifting so that bytes >= 0x80 cannot
             // smear sign bits over the other positions.
             int result = ((buf[0] & 0xFF) << 24)
                        | ((buf[1] & 0xFF) << 16)
                        | ((buf[2] & 0xFF) << 8)
                        |  (buf[3] & 0xFF);

             if (result == 0x0000FEFF)
                 return "UCS-4";
             else if (result == 0xFFFE0000)
                 return "UCS-4";
             else if (result == 0x0000003C)
                 return "UCS-4BE";
             else if (result == 0x3C000000)
                 return "UCS-4LE";
             else if (result == 0x003C003F)
                 return "UTF-16BE";
             else if (result == 0x3C003F00)
                 return "UTF-16LE";
             else if (result == 0x3C3F786D)
                 return null; // looks like US-ASCII with <?xml: sniff
             else if (result == 0x4C6FA794)
                 return "IBM037"; // Sniff for EBCDIC codepage
             else if ((result & 0xFFFF0000) == 0xFEFF0000)
                 return "UTF-16";
             else if ((result & 0xFFFF0000) == 0xFFFE0000)
                 return "UTF-16";
             else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
                 return "UTF-8";
             else
                 return null;
         }
         finally
         {
             reset();
         }
     }

     /**
      * Peeks at up to {@link #MAX_SNIFFED_BYTES} bytes, decodes them with the
      * given charset and looks for an {@code encoding} pseudo-attribute in an
      * XML declaration. The stream position is restored before returning.
      *
      * @param encoding name of the charset used to decode the sniffed bytes
      * @return the declared encoding, or null if none was found
      * @throws IOException if reading or resetting the stream fails
      */
     private String sniffForXmlDecl(String encoding) throws IOException
     {
         // MAX_SNIFFED_BYTES is a mutable static: read it once so the mark
         // limit and the buffer size are guaranteed to agree.
         int maxBytes = MAX_SNIFFED_BYTES;
         mark(maxBytes);
         try
         {
             byte[] bytebuf = new byte[maxBytes];
             int bytelimit = readAsMuchAsPossible(bytebuf, 0, maxBytes);

             // BUGBUG in JDK: Charset.forName is not threadsafe.
             Charset charset = Charset.forName(encoding);
             Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);

             // Sized by the byte count, as in the original sniffing loop.
             char[] buf = new char[bytelimit];
             int limit = 0;
             while (limit < bytelimit)
             {
                 int count = reader.read(buf, limit, bytelimit - limit);
                 if (count < 0)
                 {
                     break;
                 }
                 limit += count;
             }

             return extractXmlDeclEncoding(buf, 0, limit);
         }
         finally
         {
             reset();
         }
     }

     /**
      * Searches {@code buf[offset .. offset+size)} for {@code <?xml} and scans
      * the pseudo-attributes that follow it for one named {@code encoding}.
      *
      * @param buf    decoded characters to search
      * @param offset index of the first character to consider
      * @param size   number of characters to consider
      * @return the value of the {@code encoding} pseudo-attribute, or null if
      *         there is no {@code <?xml}, no such attribute, or the text is
      *         malformed or truncated before the attribute is complete
      */
     /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
     {
         int limit = offset + size;
         int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
         if (xmlpi >= 0)
         {
             int i = xmlpi + 5;
             ScannedAttribute attr = new ScannedAttribute();
             while (i < limit)
             {
                 i = scanAttribute(buf, i, limit, attr);
                 if (i < 0)
                 {
                     return null;
                 }
                 if (attr.name.equals("encoding"))
                 {
                     return attr.value;
                 }
             }
         }
         return null;
     }

     /**
      * Returns the index of the first occurrence of {@code s} that lies
      * entirely inside {@code buf[startAt .. limit)}, or -1 if there is none.
      */
     private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
     {
         assert(s.length() > 0);
         char[] lookFor = s.toCharArray();

         char firstchar = lookFor[0];
         // Last position at which a complete match still fits before limit.
         // The comparison is inclusive so a match that ends exactly at limit
         // is found.
         int lastStart = limit - lookFor.length;
         searching: for (; startAt <= lastStart; startAt++)
         {
             if (buf[startAt] == firstchar)
             {
                 for (int i = 1; i < lookFor.length; i++)
                 {
                     if (buf[startAt + i] != lookFor[i])
                     {
                         continue searching;
                     }
                 }
                 return startAt;
             }
         }

         return -1;
     }

     /**
      * Returns the index of the first character in {@code buf[startAt .. limit)}
      * that is NOT one of {@code lookFor}, or -1 if every character matches
      * (or the range is empty).
      */
     private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
     {
         searching: for (; startAt < limit; startAt++)
         {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++)
             {
                 if (thischar == lookFor[i])
                 {
                     continue searching;
                 }
             }
             return startAt;
         }
         return -1;
     }

     /**
      * Returns the index of the first character in {@code buf[startAt .. limit)}
      * that is one of {@code lookFor}, or -1 if there is none.
      */
     private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
     {
         for (; startAt < limit; startAt++)
         {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++)
             {
                 if (thischar == lookFor[i])
                 {
                     return startAt;
                 }
             }
         }
         return -1;
     }

     /**
      * Returns the index of the first occurrence of {@code lookFor} in
      * {@code buf[startAt .. limit)}, or -1 if there is none.
      */
     private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
     {
         for (; startAt < limit; startAt++)
         {
             if (buf[startAt] == lookFor)
             {
                 return startAt;
             }
         }
         return -1;
     }

     /** Mutable holder for the result of one {@code scanAttribute} call. */
     private static class ScannedAttribute
     {
         public String name;
         public String value;
     }

     /**
      * Scans one {@code name = "value"} (or single-quoted) pseudo-attribute
      * starting at {@code startAt}, skipping leading whitespace.
      *
      * @param buf     characters to scan
      * @param startAt index at which to start scanning
      * @param limit   index one past the last character that may be examined
      * @param attr    receives the name and value on success; left untouched
      *                on failure
      * @return the index just past the closing quote, or -1 if the text at
      *         {@code startAt} is not a complete pseudo-attribute
      */
     private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
     {
         int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
         if (nameStart < 0)
         {
             return -1;
         }
         int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
         if (nameEnd < 0)
         {
             return -1;
         }
         int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
         if (equals < 0)
         {
             return -1;
         }
         if (buf[equals] != '=')
         {
             return -1;
         }
         int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
         // The sniffed text may end right after '=' (or after trailing
         // whitespace); without this check buf[-1] would be read.
         if (valQuote < 0)
         {
             return -1;
         }
         if (buf[valQuote] != '\'' && buf[valQuote] != '\"')
         {
             return -1;
         }
         int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
         if (valEndquote < 0)
         {
             return -1;
         }
         attr.name = new String(buf, nameStart, nameEnd - nameStart);
         attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
         return valEndquote + 1;
     }
 }
	/* Copyright 2004 The Apache Software Foundation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.xmlbeans.impl.common;

	import java.io.InputStream;
	import java.io.BufferedInputStream;
	import java.io.IOException;
	import java.io.ByteArrayInputStream;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.nio.charset.Charset;

	/**
	 * A buffered input stream that, while it is being constructed, peeks at the
	 * first bytes of the wrapped stream to guess the character encoding of the
	 * XML document it carries.
	 * <p>
	 * All peeking is done between {@code mark} and {@code reset}, so the bytes
	 * that were examined (including any byte order mark) are still returned by
	 * later reads. The guess is exposed through {@link #getXmlEncoding()}.
	 */
	public class SniffedXmlInputStream extends BufferedInputStream
	{
	    // We don't sniff more than 192 bytes.
	    public static int MAX_SNIFFED_BYTES = 192;

	    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
	    // with the common charsets.

	    private static final Charset dummy1 = Charset.forName("UTF-8");
	    private static final Charset dummy2 = Charset.forName("UTF-16");
	    private static final Charset dummy3 = Charset.forName("UTF-16BE");
	    private static final Charset dummy4 = Charset.forName("UTF-16LE");
	    private static final Charset dummy5 = Charset.forName("ISO-8859-1");
	    private static final Charset dummy6 = Charset.forName("US-ASCII");
	    private static final Charset dummy7 = Charset.forName("Cp1252");

	    // Characters skipped between the tokens of a pseudo-attribute.
	    private static final char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };

	    // Characters that terminate a pseudo-attribute name.
	    private static final char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' };

	    // The encoding name chosen by the constructor; never null once constructed.
	    private final String _encoding;

	    /**
	     * Wraps {@code stream} and sniffs its encoding.
	     * <p>
	     * The order of checks is: the first four bytes (byte order marks and
	     * well-known byte patterns), then an {@code encoding} pseudo-attribute in
	     * an XML declaration, and finally a fallback to UTF-8.
	     *
	     * @param stream the stream to wrap
	     * @throws IOException if reading or resetting the stream fails
	     */
	    public SniffedXmlInputStream(InputStream stream) throws IOException
	    {
	        super(stream);

	        // read byte order marks and detect EBCDIC etc
	        String encoding = sniffFourBytes();

	        if ("IBM037".equals(encoding))
	        {
	            // First four bytes suggest EBCDIC with <?xm at start; the
	            // declaration may name a more specific encoding.
	            String declared = sniffForXmlDecl(encoding);
	            if (declared != null)
	            {
	                encoding = declared;
	            }
	        }

	        if (encoding == null)
	        {
	            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
	            // assuming we can read it as UTF-8.
	            encoding = sniffForXmlDecl("UTF-8");
	        }

	        if (encoding == null)
	        {
	            // The XML spec says these two things:

	            // (1) "In the absence of external character encoding information
	            // (such as MIME headers), parsed entities which are stored in an
	            // encoding other than UTF-8 or UTF-16 must begin with a text
	            // declaration (see 4.3.1 The Text Declaration) containing an
	            // encoding declaration:"

	            // (2) "In the absence of information provided by an external
	            // transport protocol (e.g. HTTP or MIME), it is an error
	            // for an entity including an encoding declaration to be
	            // presented to the XML processor in an encoding other than
	            // that named in the declaration, or for an entity which begins
	            // with neither a Byte Order Mark nor an encoding declaration
	            // to use an encoding other than UTF-8."

	            // Since we're using a sniffed stream, we do not have external
	            // character encoding information.

	            // Since we're here, we also don't have a recognized byte order
	            // mark or an explicit encoding declaration that can be read in
	            // either ASCII or EBCDIC style.

	            // Therefore, we must use UTF-8.

	            encoding = "UTF-8";
	        }

	        _encoding = encoding;
	    }

	    /**
	     * Returns the encoding name that was determined when this stream was
	     * constructed.
	     *
	     * @return the sniffed encoding name
	     */
	    public String getXmlEncoding()
	    {
	        return _encoding;
	    }

	    /**
	     * Reads from this stream until {@code len} bytes have been stored or the
	     * end of the stream is reached.
	     *
	     * @param buf     destination buffer
	     * @param startAt index in {@code buf} of the first byte to store
	     * @param len     maximum number of bytes to read
	     * @return the number of bytes actually stored, possibly fewer than
	     *         {@code len} if the stream ended
	     * @throws IOException if the underlying read fails
	     */
	    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
	    {
	        int total = 0;
	        while (total < len)
	        {
	            int count = read(buf, startAt + total, len - total);
	            if (count < 0)
	            {
	                break;
	            }
	            total += count;
	        }
	        return total;
	    }

	    /**
	     * Peeks at the first four bytes of the stream and maps byte order marks
	     * and well-known leading byte patterns to an encoding name. The stream
	     * position is restored before returning.
	     *
	     * @return an encoding name, or null if fewer than four bytes are
	     *         available or the bytes match no known pattern (which includes
	     *         the ASCII pattern {@code <?xm}, left for declaration sniffing)
	     * @throws IOException if reading or resetting the stream fails
	     */
	    private String sniffFourBytes() throws IOException
	    {
	        mark(4);
	        try
	        {
	            byte[] buf = new byte[4];
	            if (readAsMuchAsPossible(buf, 0, 4) < 4)
	            {
	                return null;
	            }

	            // Mask every byte before shifting so that bytes >= 0x80 cannot
	            // smear sign bits over the other positions.
	            int result = ((buf[0] & 0xFF) << 24)
	                       | ((buf[1] & 0xFF) << 16)
	                       | ((buf[2] & 0xFF) << 8)
	                       |  (buf[3] & 0xFF);

	            if (result == 0x0000FEFF)
	                return "UCS-4";
	            else if (result == 0xFFFE0000)
	                return "UCS-4";
	            else if (result == 0x0000003C)
	                return "UCS-4BE";
	            else if (result == 0x3C000000)
	                return "UCS-4LE";
	            else if (result == 0x003C003F)
	                return "UTF-16BE";
	            else if (result == 0x3C003F00)
	                return "UTF-16LE";
	            else if (result == 0x3C3F786D)
	                return null; // looks like US-ASCII with <?xml: sniff
	            else if (result == 0x4C6FA794)
	                return "IBM037"; // Sniff for EBCDIC codepage
	            else if ((result & 0xFFFF0000) == 0xFEFF0000)
	                return "UTF-16";
	            else if ((result & 0xFFFF0000) == 0xFFFE0000)
	                return "UTF-16";
	            else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
	                return "UTF-8";
	            else
	                return null;
	        }
	        finally
	        {
	            reset();
	        }
	    }

	    /**
	     * Peeks at up to {@link #MAX_SNIFFED_BYTES} bytes, decodes them with the
	     * given charset and looks for an {@code encoding} pseudo-attribute in an
	     * XML declaration. The stream position is restored before returning.
	     *
	     * @param encoding name of the charset used to decode the sniffed bytes
	     * @return the declared encoding, or null if none was found
	     * @throws IOException if reading or resetting the stream fails
	     */
	    private String sniffForXmlDecl(String encoding) throws IOException
	    {
	        // MAX_SNIFFED_BYTES is a mutable static: read it once so the mark
	        // limit and the buffer size are guaranteed to agree.
	        int maxBytes = MAX_SNIFFED_BYTES;
	        mark(maxBytes);
	        try
	        {
	            byte[] bytebuf = new byte[maxBytes];
	            int bytelimit = readAsMuchAsPossible(bytebuf, 0, maxBytes);

	            // BUGBUG in JDK: Charset.forName is not threadsafe.
	            Charset charset = Charset.forName(encoding);
	            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);

	            // Sized by the byte count, as in the original sniffing loop.
	            char[] buf = new char[bytelimit];
	            int limit = 0;
	            while (limit < bytelimit)
	            {
	                int count = reader.read(buf, limit, bytelimit - limit);
	                if (count < 0)
	                {
	                    break;
	                }
	                limit += count;
	            }

	            return extractXmlDeclEncoding(buf, 0, limit);
	        }
	        finally
	        {
	            reset();
	        }
	    }

	    /**
	     * Searches {@code buf[offset .. offset+size)} for {@code <?xml} and scans
	     * the pseudo-attributes that follow it for one named {@code encoding}.
	     *
	     * @param buf    decoded characters to search
	     * @param offset index of the first character to consider
	     * @param size   number of characters to consider
	     * @return the value of the {@code encoding} pseudo-attribute, or null if
	     *         there is no {@code <?xml}, no such attribute, or the text is
	     *         malformed or truncated before the attribute is complete
	     */
	    /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
	    {
	        int limit = offset + size;
	        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
	        if (xmlpi >= 0)
	        {
	            int i = xmlpi + 5;
	            ScannedAttribute attr = new ScannedAttribute();
	            while (i < limit)
	            {
	                i = scanAttribute(buf, i, limit, attr);
	                if (i < 0)
	                {
	                    return null;
	                }
	                if (attr.name.equals("encoding"))
	                {
	                    return attr.value;
	                }
	            }
	        }
	        return null;
	    }

	    /**
	     * Returns the index of the first occurrence of {@code s} that lies
	     * entirely inside {@code buf[startAt .. limit)}, or -1 if there is none.
	     */
	    private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
	    {
	        assert(s.length() > 0);
	        char[] lookFor = s.toCharArray();

	        char firstchar = lookFor[0];
	        // Last position at which a complete match still fits before limit.
	        // The comparison is inclusive so a match that ends exactly at limit
	        // is found.
	        int lastStart = limit - lookFor.length;
	        searching: for (; startAt <= lastStart; startAt++)
	        {
	            if (buf[startAt] == firstchar)
	            {
	                for (int i = 1; i < lookFor.length; i++)
	                {
	                    if (buf[startAt + i] != lookFor[i])
	                    {
	                        continue searching;
	                    }
	                }
	                return startAt;
	            }
	        }

	        return -1;
	    }

	    /**
	     * Returns the index of the first character in {@code buf[startAt .. limit)}
	     * that is NOT one of {@code lookFor}, or -1 if every character matches
	     * (or the range is empty).
	     */
	    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
	    {
	        searching: for (; startAt < limit; startAt++)
	        {
	            int thischar = buf[startAt];
	            for (int i = 0; i < lookFor.length; i++)
	            {
	                if (thischar == lookFor[i])
	                {
	                    continue searching;
	                }
	            }
	            return startAt;
	        }
	        return -1;
	    }

	    /**
	     * Returns the index of the first character in {@code buf[startAt .. limit)}
	     * that is one of {@code lookFor}, or -1 if there is none.
	     */
	    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
	    {
	        for (; startAt < limit; startAt++)
	        {
	            int thischar = buf[startAt];
	            for (int i = 0; i < lookFor.length; i++)
	            {
	                if (thischar == lookFor[i])
	                {
	                    return startAt;
	                }
	            }
	        }
	        return -1;
	    }

	    /**
	     * Returns the index of the first occurrence of {@code lookFor} in
	     * {@code buf[startAt .. limit)}, or -1 if there is none.
	     */
	    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
	    {
	        for (; startAt < limit; startAt++)
	        {
	            if (buf[startAt] == lookFor)
	            {
	                return startAt;
	            }
	        }
	        return -1;
	    }

	    /** Mutable holder for the result of one {@code scanAttribute} call. */
	    private static class ScannedAttribute
	    {
	        public String name;
	        public String value;
	    }

	    /**
	     * Scans one {@code name = "value"} (or single-quoted) pseudo-attribute
	     * starting at {@code startAt}, skipping leading whitespace.
	     *
	     * @param buf     characters to scan
	     * @param startAt index at which to start scanning
	     * @param limit   index one past the last character that may be examined
	     * @param attr    receives the name and value on success; left untouched
	     *                on failure
	     * @return the index just past the closing quote, or -1 if the text at
	     *         {@code startAt} is not a complete pseudo-attribute
	     */
	    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
	    {
	        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
	        if (nameStart < 0)
	        {
	            return -1;
	        }
	        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
	        if (nameEnd < 0)
	        {
	            return -1;
	        }
	        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
	        if (equals < 0)
	        {
	            return -1;
	        }
	        if (buf[equals] != '=')
	        {
	            return -1;
	        }
	        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
	        // The sniffed text may end right after '=' (or after trailing
	        // whitespace); without this check buf[-1] would be read.
	        if (valQuote < 0)
	        {
	            return -1;
	        }
	        if (buf[valQuote] != '\'' && buf[valQuote] != '\"')
	        {
	            return -1;
	        }
	        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
	        if (valEndquote < 0)
	        {
	            return -1;
	        }
	        attr.name = new String(buf, nameStart, nameEnd - nameStart);
	        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
	        return valEndquote + 1;
	    }
	}