src/main/java/org/apache/xmlbeans/impl/common/SniffedXmlInputStream.java - xmlbeans - Git at Google

 /*   Copyright 2004 The Apache Software Foundation
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
  *   You may obtain a copy of the License at
  *
  *       http://www.apache.org/licenses/LICENSE-2.0
  *
  *   Unless required by applicable law or agreed to in writing, software
  *   distributed under the License is distributed on an "AS IS" BASIS,
  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *   See the License for the specific language governing permissions and
  *  limitations under the License.
  */

 package org.apache.xmlbeans.impl.common;

 import java.io.*;
 import java.nio.charset.Charset;

 /**
  * A {@link BufferedInputStream} that inspects the leading bytes of an XML
  * document to determine its character encoding, then rewinds so that the
  * consumer still sees the stream from its first byte.
  *
  * <p>Detection order: a four-byte signature check (byte order marks and
  * well-known encodings of {@code <?xm}), then the {@code encoding}
  * pseudo-attribute of an XML declaration found within the first
  * {@link #MAX_SNIFFED_BYTES} bytes, and finally a fallback to UTF-8.
  */
 public class SniffedXmlInputStream extends BufferedInputStream {
     /** We don't sniff more than 192 bytes. */
     public static final int MAX_SNIFFED_BYTES = 192;

     // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
     // with the common charsets.
     private static final Charset dummy1 = Charset.forName("UTF-8");
     private static final Charset dummy2 = Charset.forName("UTF-16");
     private static final Charset dummy3 = Charset.forName("UTF-16BE");
     private static final Charset dummy4 = Charset.forName("UTF-16LE");
     private static final Charset dummy5 = Charset.forName("ISO-8859-1");
     private static final Charset dummy6 = Charset.forName("US-ASCII");
     private static final Charset dummy7 = Charset.forName("Cp1252");

     /** Characters skipped between the tokens of a pseudo-attribute. */
     private static final char[] WHITESPACE = {' ', '\r', '\t', '\n'};

     /** Characters that terminate a pseudo-attribute name. */
     private static final char[] NOTNAME = {'=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"'};

     /** Encoding name chosen by the constructor; always non-null once constructed. */
     private final String _encoding;

     /**
      * Wraps {@code stream} and sniffs its encoding. The sniffed bytes are
      * pushed back via mark/reset, so subsequent reads start at the first byte.
      *
      * @param stream the raw XML byte stream
      * @throws IOException if reading or resetting the underlying stream fails
      */
     public SniffedXmlInputStream(InputStream stream) throws IOException {
         super(stream);

         // read byte order marks and detect EBCDIC etc
         String detected = sniffFourBytes();

         if ("IBM037".equals(detected)) {
             // First four bytes suggest EBCDIC with <?xm at start: the
             // declaration may name a more specific code page.
             String declared = sniffForXmlDecl(detected);
             if (declared != null) {
                 detected = declared;
             }
         }

         if (detected == null) {
             // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
             // assuming we can read it as UTF-8.
             detected = sniffForXmlDecl("UTF-8");
         }

         if (detected == null) {
             // The XML spec says these two things:

             // (1) "In the absence of external character encoding information
             // (such as MIME headers), parsed entities which are stored in an
             // encoding other than UTF-8 or UTF-16 must begin with a text
             // declaration (see 4.3.1 The Text Declaration) containing an
             // encoding declaration:"

             // (2) "In the absence of information provided by an external
             // transport protocol (e.g. HTTP or MIME), it is an error
             // for an entity including an encoding declaration to be
             // presented to the XML processor in an encoding other than
             // that named in the declaration, or for an entity which begins
             // with neither a Byte Order Mark nor an encoding declaration
             // to use an encoding other than UTF-8."

             // Since we're using a sniffed stream, we do not have external
             // character encoding information.

             // Since we're here, we also don't have a recognized byte order
             // mark or an explicit encoding declaration that can be read in
             // either ASCII or EBCDIC style.

             // Therefore, we must use UTF-8.
             detected = "UTF-8";
         }

         _encoding = detected;
     }

     /**
      * Returns the encoding name determined while constructing this stream.
      *
      * @return the sniffed encoding name, never {@code null}
      */
     public String getXmlEncoding() {
         return _encoding;
     }

     /**
      * Reads from this stream until {@code len} bytes were stored or the end
      * of the stream was reached.
      *
      * @return the number of bytes actually stored in {@code buf}
      */
     private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException {
         int total = 0;
         while (total < len) {
             int count = read(buf, startAt + total, len - total);
             if (count < 0) {
                 break;
             }
             total += count;
         }
         return total;
     }

     /**
      * Classifies the first four bytes of the stream and rewinds.
      *
      * @return an encoding name, or {@code null} when fewer than four bytes
      *         are available or the signature calls for sniffing the XML
      *         declaration instead
      */
     private String sniffFourBytes() throws IOException {
         mark(4);
         try {
             byte[] buf = new byte[4];
             if (readAsMuchAsPossible(buf, 0, 4) < 4) {
                 return null;
             }
             // Assemble the bytes big-endian; masking each byte first avoids
             // sign extension leaking into the higher-order positions.
             int signature = ((buf[0] & 0xFF) << 24)
                     | ((buf[1] & 0xFF) << 16)
                     | ((buf[2] & 0xFF) << 8)
                     | (buf[3] & 0xFF);

             // The exact four-byte matches must be tested before the two- and
             // three-byte BOM prefixes: FF FE 00 00 also starts with FF FE.
             if (signature == 0x0000FEFF) {
                 return "UCS-4";
             } else if (signature == 0xFFFE0000) {
                 return "UCS-4";
             } else if (signature == 0x0000003C) {
                 return "UCS-4BE";
             } else if (signature == 0x3C000000) {
                 return "UCS-4LE";
             } else if (signature == 0x003C003F) {
                 return "UTF-16BE";
             } else if (signature == 0x3C003F00) {
                 return "UTF-16LE";
             } else if (signature == 0x3C3F786D) {
                 return null; // looks like US-ASCII with <?xml: sniff
             } else if (signature == 0x4C6FA794) {
                 return "IBM037"; // Sniff for EBCDIC codepage
             } else if ((signature & 0xFFFF0000) == 0xFEFF0000) {
                 return "UTF-16";
             } else if ((signature & 0xFFFF0000) == 0xFFFE0000) {
                 return "UTF-16";
             } else if ((signature & 0xFFFFFF00) == 0xEFBBBF00) {
                 return "UTF-8";
             } else {
                 return null;
             }
         } finally {
             reset();
         }
     }

     /**
      * Decodes up to {@link #MAX_SNIFFED_BYTES} bytes with {@code encoding},
      * looks for an XML declaration in the result, and rewinds the stream.
      *
      * @param encoding charset name used to decode the sniffed bytes
      * @return the declared encoding, or {@code null} if none was found
      */
     private String sniffForXmlDecl(String encoding) throws IOException {
         mark(MAX_SNIFFED_BYTES);
         try {
             byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
             int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

             // BUGBUG in JDK: Charset.forName is not threadsafe.
             Charset charset = Charset.forName(encoding);
             Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
             // The buffer is sized to the byte count, so at most bytelimit
             // chars are collected.
             char[] buf = new char[bytelimit];
             int limit = 0;
             while (limit < bytelimit) {
                 int count = reader.read(buf, limit, bytelimit - limit);
                 if (count < 0) {
                     break;
                 }
                 limit += count;
             }

             return extractXmlDeclEncoding(buf, 0, limit);
         } finally {
             reset();
         }
     }

     /**
      * Finds the value of the {@code encoding} pseudo-attribute of the first
      * {@code <?xml} occurrence in {@code buf[offset .. offset+size)}.
      *
      * @return the attribute value, or {@code null} if there is no
      *         declaration, no encoding attribute, or the declaration is
      *         malformed or truncated
      */
     /* package */
     static String extractXmlDeclEncoding(char[] buf, int offset, int size) {
         int limit = offset + size;
         int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
         if (xmlpi < 0) {
             return null;
         }
         ScannedAttribute attr = new ScannedAttribute();
         int i = xmlpi + 5;
         while (i < limit) {
             i = scanAttribute(buf, i, limit, attr);
             if (i < 0) {
                 return null;
             }
             if ("encoding".equals(attr.name)) {
                 return attr.value;
             }
         }
         return null;
     }

     /**
      * Returns the index of the first occurrence of {@code s} lying entirely
      * within {@code buf[startAt .. limit)}, or -1 if there is none.
      */
     private static int firstIndexOf(String s, char[] buf, int startAt, int limit) {
         assert (s.length() > 0);
         char[] lookFor = s.toCharArray();

         // Last index at which a full match still fits (inclusive).
         int lastStart = limit - lookFor.length;
         searching:
         for (; startAt <= lastStart; startAt++) {
             for (int i = 0; i < lookFor.length; i++) {
                 if (buf[startAt + i] != lookFor[i]) {
                     continue searching;
                 }
             }
             return startAt;
         }

         return -1;
     }

     /**
      * Returns the index of the first char in {@code buf[startAt .. limit)}
      * that is NOT contained in {@code lookFor}, or -1 if every char is.
      */
     private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
         searching:
         for (; startAt < limit; startAt++) {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++) {
                 if (thischar == lookFor[i]) {
                     continue searching;
                 }
             }
             return startAt;
         }
         return -1;
     }

     /**
      * Returns the index of the first char in {@code buf[startAt .. limit)}
      * that is contained in {@code lookFor}, or -1 if there is none.
      */
     private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
         for (; startAt < limit; startAt++) {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++) {
                 if (thischar == lookFor[i]) {
                     return startAt;
                 }
             }
         }
         return -1;
     }

     /**
      * Returns the index of the first occurrence of {@code lookFor} in
      * {@code buf[startAt .. limit)}, or -1 if there is none.
      */
     private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit) {
         for (; startAt < limit; startAt++) {
             if (buf[startAt] == lookFor) {
                 return startAt;
             }
         }
         return -1;
     }

     /** Out-parameter holding the most recently scanned name/value pair. */
     private static class ScannedAttribute {
         public String name;
         public String value;
     }

     /**
      * Scans one {@code name = "value"} pair starting at {@code startAt}.
      *
      * @param attr receives the name and value on success; untouched on failure
      * @return the index just past the closing quote, or -1 if no complete
      *         pair could be read before {@code limit}
      */
     private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr) {
         int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
         if (nameStart < 0) {
             return -1;
         }
         int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
         if (nameEnd < 0) {
             return -1;
         }
         int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
         if (equals < 0) {
             return -1;
         }
         if (buf[equals] != '=') {
             return -1;
         }
         int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
         if (valQuote < 0) {
             // Input ended right after '=' (possibly followed by whitespace);
             // without this guard buf[-1] would be dereferenced below.
             return -1;
         }
         if (buf[valQuote] != '\'' && buf[valQuote] != '\"') {
             return -1;
         }
         int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
         if (valEndquote < 0) {
             return -1;
         }
         attr.name = new String(buf, nameStart, nameEnd - nameStart);
         attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
         return valEndquote + 1;
     }
 }
	/* Copyright 2004 The Apache Software Foundation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.xmlbeans.impl.common;

	import java.io.*;
	import java.nio.charset.Charset;

	/**
	 * A {@link BufferedInputStream} that inspects the leading bytes of an XML
	 * document to determine its character encoding, then rewinds so that the
	 * consumer still sees the stream from its first byte.
	 *
	 * <p>Detection order: a four-byte signature check (byte order marks and
	 * well-known encodings of {@code <?xm}), then the {@code encoding}
	 * pseudo-attribute of an XML declaration found within the first
	 * {@link #MAX_SNIFFED_BYTES} bytes, and finally a fallback to UTF-8.
	 */
	public class SniffedXmlInputStream extends BufferedInputStream {
	    /** We don't sniff more than 192 bytes. */
	    public static final int MAX_SNIFFED_BYTES = 192;

	    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
	    // with the common charsets.
	    private static final Charset dummy1 = Charset.forName("UTF-8");
	    private static final Charset dummy2 = Charset.forName("UTF-16");
	    private static final Charset dummy3 = Charset.forName("UTF-16BE");
	    private static final Charset dummy4 = Charset.forName("UTF-16LE");
	    private static final Charset dummy5 = Charset.forName("ISO-8859-1");
	    private static final Charset dummy6 = Charset.forName("US-ASCII");
	    private static final Charset dummy7 = Charset.forName("Cp1252");

	    /** Characters skipped between the tokens of a pseudo-attribute. */
	    private static final char[] WHITESPACE = {' ', '\r', '\t', '\n'};

	    /** Characters that terminate a pseudo-attribute name. */
	    private static final char[] NOTNAME = {'=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"'};

	    /** Encoding name chosen by the constructor; always non-null once constructed. */
	    private final String _encoding;

	    /**
	     * Wraps {@code stream} and sniffs its encoding. The sniffed bytes are
	     * pushed back via mark/reset, so subsequent reads start at the first byte.
	     *
	     * @param stream the raw XML byte stream
	     * @throws IOException if reading or resetting the underlying stream fails
	     */
	    public SniffedXmlInputStream(InputStream stream) throws IOException {
	        super(stream);

	        // read byte order marks and detect EBCDIC etc
	        String detected = sniffFourBytes();

	        if ("IBM037".equals(detected)) {
	            // First four bytes suggest EBCDIC with <?xm at start: the
	            // declaration may name a more specific code page.
	            String declared = sniffForXmlDecl(detected);
	            if (declared != null) {
	                detected = declared;
	            }
	        }

	        if (detected == null) {
	            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
	            // assuming we can read it as UTF-8.
	            detected = sniffForXmlDecl("UTF-8");
	        }

	        if (detected == null) {
	            // The XML spec says these two things:

	            // (1) "In the absence of external character encoding information
	            // (such as MIME headers), parsed entities which are stored in an
	            // encoding other than UTF-8 or UTF-16 must begin with a text
	            // declaration (see 4.3.1 The Text Declaration) containing an
	            // encoding declaration:"

	            // (2) "In the absence of information provided by an external
	            // transport protocol (e.g. HTTP or MIME), it is an error
	            // for an entity including an encoding declaration to be
	            // presented to the XML processor in an encoding other than
	            // that named in the declaration, or for an entity which begins
	            // with neither a Byte Order Mark nor an encoding declaration
	            // to use an encoding other than UTF-8."

	            // Since we're using a sniffed stream, we do not have external
	            // character encoding information.

	            // Since we're here, we also don't have a recognized byte order
	            // mark or an explicit encoding declaration that can be read in
	            // either ASCII or EBCDIC style.

	            // Therefore, we must use UTF-8.
	            detected = "UTF-8";
	        }

	        _encoding = detected;
	    }

	    /**
	     * Returns the encoding name determined while constructing this stream.
	     *
	     * @return the sniffed encoding name, never {@code null}
	     */
	    public String getXmlEncoding() {
	        return _encoding;
	    }

	    /**
	     * Reads from this stream until {@code len} bytes were stored or the end
	     * of the stream was reached.
	     *
	     * @return the number of bytes actually stored in {@code buf}
	     */
	    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException {
	        int total = 0;
	        while (total < len) {
	            int count = read(buf, startAt + total, len - total);
	            if (count < 0) {
	                break;
	            }
	            total += count;
	        }
	        return total;
	    }

	    /**
	     * Classifies the first four bytes of the stream and rewinds.
	     *
	     * @return an encoding name, or {@code null} when fewer than four bytes
	     *         are available or the signature calls for sniffing the XML
	     *         declaration instead
	     */
	    private String sniffFourBytes() throws IOException {
	        mark(4);
	        try {
	            byte[] buf = new byte[4];
	            if (readAsMuchAsPossible(buf, 0, 4) < 4) {
	                return null;
	            }
	            // Assemble the bytes big-endian; masking each byte first avoids
	            // sign extension leaking into the higher-order positions.
	            int signature = ((buf[0] & 0xFF) << 24)
	                    | ((buf[1] & 0xFF) << 16)
	                    | ((buf[2] & 0xFF) << 8)
	                    | (buf[3] & 0xFF);

	            // The exact four-byte matches must be tested before the two- and
	            // three-byte BOM prefixes: FF FE 00 00 also starts with FF FE.
	            if (signature == 0x0000FEFF) {
	                return "UCS-4";
	            } else if (signature == 0xFFFE0000) {
	                return "UCS-4";
	            } else if (signature == 0x0000003C) {
	                return "UCS-4BE";
	            } else if (signature == 0x3C000000) {
	                return "UCS-4LE";
	            } else if (signature == 0x003C003F) {
	                return "UTF-16BE";
	            } else if (signature == 0x3C003F00) {
	                return "UTF-16LE";
	            } else if (signature == 0x3C3F786D) {
	                return null; // looks like US-ASCII with <?xml: sniff
	            } else if (signature == 0x4C6FA794) {
	                return "IBM037"; // Sniff for EBCDIC codepage
	            } else if ((signature & 0xFFFF0000) == 0xFEFF0000) {
	                return "UTF-16";
	            } else if ((signature & 0xFFFF0000) == 0xFFFE0000) {
	                return "UTF-16";
	            } else if ((signature & 0xFFFFFF00) == 0xEFBBBF00) {
	                return "UTF-8";
	            } else {
	                return null;
	            }
	        } finally {
	            reset();
	        }
	    }

	    /**
	     * Decodes up to {@link #MAX_SNIFFED_BYTES} bytes with {@code encoding},
	     * looks for an XML declaration in the result, and rewinds the stream.
	     *
	     * @param encoding charset name used to decode the sniffed bytes
	     * @return the declared encoding, or {@code null} if none was found
	     */
	    private String sniffForXmlDecl(String encoding) throws IOException {
	        mark(MAX_SNIFFED_BYTES);
	        try {
	            byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
	            int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

	            // BUGBUG in JDK: Charset.forName is not threadsafe.
	            Charset charset = Charset.forName(encoding);
	            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
	            // The buffer is sized to the byte count, so at most bytelimit
	            // chars are collected.
	            char[] buf = new char[bytelimit];
	            int limit = 0;
	            while (limit < bytelimit) {
	                int count = reader.read(buf, limit, bytelimit - limit);
	                if (count < 0) {
	                    break;
	                }
	                limit += count;
	            }

	            return extractXmlDeclEncoding(buf, 0, limit);
	        } finally {
	            reset();
	        }
	    }

	    /**
	     * Finds the value of the {@code encoding} pseudo-attribute of the first
	     * {@code <?xml} occurrence in {@code buf[offset .. offset+size)}.
	     *
	     * @return the attribute value, or {@code null} if there is no
	     *         declaration, no encoding attribute, or the declaration is
	     *         malformed or truncated
	     */
	    /* package */
	    static String extractXmlDeclEncoding(char[] buf, int offset, int size) {
	        int limit = offset + size;
	        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
	        if (xmlpi < 0) {
	            return null;
	        }
	        ScannedAttribute attr = new ScannedAttribute();
	        int i = xmlpi + 5;
	        while (i < limit) {
	            i = scanAttribute(buf, i, limit, attr);
	            if (i < 0) {
	                return null;
	            }
	            if ("encoding".equals(attr.name)) {
	                return attr.value;
	            }
	        }
	        return null;
	    }

	    /**
	     * Returns the index of the first occurrence of {@code s} lying entirely
	     * within {@code buf[startAt .. limit)}, or -1 if there is none.
	     */
	    private static int firstIndexOf(String s, char[] buf, int startAt, int limit) {
	        assert (s.length() > 0);
	        char[] lookFor = s.toCharArray();

	        // Last index at which a full match still fits (inclusive).
	        int lastStart = limit - lookFor.length;
	        searching:
	        for (; startAt <= lastStart; startAt++) {
	            for (int i = 0; i < lookFor.length; i++) {
	                if (buf[startAt + i] != lookFor[i]) {
	                    continue searching;
	                }
	            }
	            return startAt;
	        }

	        return -1;
	    }

	    /**
	     * Returns the index of the first char in {@code buf[startAt .. limit)}
	     * that is NOT contained in {@code lookFor}, or -1 if every char is.
	     */
	    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
	        searching:
	        for (; startAt < limit; startAt++) {
	            int thischar = buf[startAt];
	            for (int i = 0; i < lookFor.length; i++) {
	                if (thischar == lookFor[i]) {
	                    continue searching;
	                }
	            }
	            return startAt;
	        }
	        return -1;
	    }

	    /**
	     * Returns the index of the first char in {@code buf[startAt .. limit)}
	     * that is contained in {@code lookFor}, or -1 if there is none.
	     */
	    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
	        for (; startAt < limit; startAt++) {
	            int thischar = buf[startAt];
	            for (int i = 0; i < lookFor.length; i++) {
	                if (thischar == lookFor[i]) {
	                    return startAt;
	                }
	            }
	        }
	        return -1;
	    }

	    /**
	     * Returns the index of the first occurrence of {@code lookFor} in
	     * {@code buf[startAt .. limit)}, or -1 if there is none.
	     */
	    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit) {
	        for (; startAt < limit; startAt++) {
	            if (buf[startAt] == lookFor) {
	                return startAt;
	            }
	        }
	        return -1;
	    }

	    /** Out-parameter holding the most recently scanned name/value pair. */
	    private static class ScannedAttribute {
	        public String name;
	        public String value;
	    }

	    /**
	     * Scans one {@code name = "value"} pair starting at {@code startAt}.
	     *
	     * @param attr receives the name and value on success; untouched on failure
	     * @return the index just past the closing quote, or -1 if no complete
	     *         pair could be read before {@code limit}
	     */
	    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr) {
	        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
	        if (nameStart < 0) {
	            return -1;
	        }
	        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
	        if (nameEnd < 0) {
	            return -1;
	        }
	        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
	        if (equals < 0) {
	            return -1;
	        }
	        if (buf[equals] != '=') {
	            return -1;
	        }
	        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
	        if (valQuote < 0) {
	            // Input ended right after '=' (possibly followed by whitespace);
	            // without this guard buf[-1] would be dereferenced below.
	            return -1;
	        }
	        if (buf[valQuote] != '\'' && buf[valQuote] != '\"') {
	            return -1;
	        }
	        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
	        if (valEndquote < 0) {
	            return -1;
	        }
	        attr.name = new String(buf, nameStart, nameEnd - nameStart);
	        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
	        return valEndquote + 1;
	    }
	}