blob: 6f2a4eb439e4c032a77678db74a10813c5e2b482 [file] [log] [blame]
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.xmlbeans.impl.common;
import java.io.*;
import java.nio.charset.Charset;
public class SniffedXmlInputStream extends BufferedInputStream {
// We don't sniff more than 192 bytes.
public static final int MAX_SNIFFED_BYTES = 192;
public SniffedXmlInputStream(InputStream stream) throws IOException {
super(stream);
// read byte order marks and detect EBCDIC etc
_encoding = sniffFourBytes();
if (_encoding != null && _encoding.equals("IBM037")) {
// First four bytes suggest EBCDIC with <?xm at start
String encoding = sniffForXmlDecl(_encoding);
if (encoding != null) {
_encoding = encoding;
}
}
if (_encoding == null) {
// Haven't yet determined encoding: sniff for <?xml encoding="..."?>
// assuming we can read it as UTF-8.
_encoding = sniffForXmlDecl("UTF-8");
}
if (_encoding == null) {
// The XML spec says these two things:
// (1) "In the absence of external character encoding information
// (such as MIME headers), parsed entities which are stored in an
// encoding other than UTF-8 or UTF-16 must begin with a text
// declaration (see 4.3.1 The Text Declaration) containing an
// encoding declaration:"
// (2) "In the absence of information provided by an external
// transport protocol (e.g. HTTP or MIME), it is an error
// for an entity including an encoding declaration to be
// presented to the XML processor in an encoding other than
// that named in the declaration, or for an entity which begins
// with neither a Byte Order Mark nor an encoding declaration
// to use an encoding other than UTF-8."
// Since we're using a sniffed stream, we do not have external
// character encoding information.
// Since we're here, we also don't have a recognized byte order
// mark or an explicit encoding declaration that can be read in
// either ASCII or EBDIC style.
// Therefore, we must use UTF-8.
_encoding = "UTF-8";
}
}
private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException {
int total = 0;
while (total < len) {
int count = read(buf, startAt + total, len - total);
if (count < 0) {
break;
}
total += count;
}
return total;
}
private String sniffFourBytes() throws IOException {
mark(4);
int skip = 0;
try {
byte[] buf = new byte[4];
if (readAsMuchAsPossible(buf, 0, 4) < 4) {
return null;
}
long result = 0xFF000000 & (buf[0] << 24) | 0x00FF0000 & (buf[1] << 16) | 0x0000FF00 & (buf[2] << 8) | 0x000000FF & buf[3];
if (result == 0x0000FEFF) {
return "UCS-4";
} else if (result == 0xFFFE0000) {
return "UCS-4";
} else if (result == 0x0000003C) {
return "UCS-4BE";
} else if (result == 0x3C000000) {
return "UCS-4LE";
} else if (result == 0x003C003F) {
return "UTF-16BE";
} else if (result == 0x3C003F00) {
return "UTF-16LE";
} else if (result == 0x3C3F786D) {
return null; // looks like US-ASCII with <?xml: sniff
} else if (result == 0x4C6FA794) {
return "IBM037"; // Sniff for ebdic codepage
} else if ((result & 0xFFFF0000) == 0xFEFF0000) {
return "UTF-16";
} else if ((result & 0xFFFF0000) == 0xFFFE0000) {
return "UTF-16";
} else if ((result & 0xFFFFFF00) == 0xEFBBBF00) {
return "UTF-8";
} else {
return null;
}
} finally {
reset();
}
}
// BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
// with the common charsets.
private static Charset dummy1 = Charset.forName("UTF-8");
private static Charset dummy2 = Charset.forName("UTF-16");
private static Charset dummy3 = Charset.forName("UTF-16BE");
private static Charset dummy4 = Charset.forName("UTF-16LE");
private static Charset dummy5 = Charset.forName("ISO-8859-1");
private static Charset dummy6 = Charset.forName("US-ASCII");
private static Charset dummy7 = Charset.forName("Cp1252");
private String sniffForXmlDecl(String encoding) throws IOException {
mark(MAX_SNIFFED_BYTES);
try {
byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);
// BUGBUG in JDK: Charset.forName is not threadsafe.
Charset charset = Charset.forName(encoding);
Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
char[] buf = new char[bytelimit];
int limit = 0;
while (limit < bytelimit) {
int count = reader.read(buf, limit, bytelimit - limit);
if (count < 0) {
break;
}
limit += count;
}
return extractXmlDeclEncoding(buf, 0, limit);
} finally {
reset();
}
}
private String _encoding;
public String getXmlEncoding() {
return _encoding;
}
/* package */
static String extractXmlDeclEncoding(char[] buf, int offset, int size) {
int limit = offset + size;
int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
if (xmlpi >= 0) {
int i = xmlpi + 5;
ScannedAttribute attr = new ScannedAttribute();
while (i < limit) {
i = scanAttribute(buf, i, limit, attr);
if (i < 0) {
return null;
}
if (attr.name.equals("encoding")) {
return attr.value;
}
}
}
return null;
}
private static int firstIndexOf(String s, char[] buf, int startAt, int limit) {
assert (s.length() > 0);
char[] lookFor = s.toCharArray();
char firstchar = lookFor[0];
searching:
for (limit -= lookFor.length; startAt < limit; startAt++) {
if (buf[startAt] == firstchar) {
for (int i = 1; i < lookFor.length; i++) {
if (buf[startAt + i] != lookFor[i]) {
continue searching;
}
}
return startAt;
}
}
return -1;
}
private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
searching:
for (; startAt < limit; startAt++) {
int thischar = buf[startAt];
for (int i = 0; i < lookFor.length; i++) {
if (thischar == lookFor[i]) {
continue searching;
}
}
return startAt;
}
return -1;
}
private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit) {
searching:
for (; startAt < limit; startAt++) {
int thischar = buf[startAt];
for (int i = 0; i < lookFor.length; i++) {
if (thischar == lookFor[i]) {
return startAt;
}
}
}
return -1;
}
private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit) {
searching:
for (; startAt < limit; startAt++) {
if (buf[startAt] == lookFor) {
return startAt;
}
}
return -1;
}
private static char[] WHITESPACE = new char[]{' ', '\r', '\t', '\n'};
private static char[] NOTNAME = new char[]{'=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"'};
private static class ScannedAttribute {
public String name;
public String value;
}
private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr) {
int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
if (nameStart < 0) {
return -1;
}
int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
if (nameEnd < 0) {
return -1;
}
int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
if (equals < 0) {
return -1;
}
if (buf[equals] != '=') {
return -1;
}
int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
if (buf[valQuote] != '\'' && buf[valQuote] != '\"') {
return -1;
}
int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
if (valEndquote < 0) {
return -1;
}
attr.name = new String(buf, nameStart, nameEnd - nameStart);
attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
return valEndquote + 1;
}
}