blob: 9271ded1512ba20609a0e4871dbdd327188e3910 [file] [log] [blame]
/*
Copyright 1999-2003 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package org.apache.batik.xml;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import org.apache.batik.util.EncodingUtilities;
/**
* A collection of utility functions for XML.
*
* @author <a href="mailto:stephane@hillion.org">Stephane Hillion</a>
* @version $Id$
*/
public class XMLUtilities extends XMLCharacters {
/**
* This class does not need to be instantiated.
*/
protected XMLUtilities() {
}
/**
* Tests whether the given character is a valid space.
*/
public static boolean isXMLSpace(char c) {
return (c <= 0x0020) &&
(((((1L << 0x0009) |
(1L << 0x000A) |
(1L << 0x000D) |
(1L << 0x0020)) >> c) & 1L) != 0);
}
/**
* Tests whether the given character is usable as the
* first character of an XML name.
*/
public static boolean isXMLNameFirstCharacter(char c) {
return (NAME_FIRST_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Tests whether the given character is a valid XML name character.
*/
public static boolean isXMLNameCharacter(char c) {
return (NAME_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Tests whether the given 32 bits character is valid in XML documents.
*/
public static boolean isXMLCharacter(int c) {
return (c >= 0x10000 && c <= 0x10ffff) ||
(XML_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Tests whether the given character is a valid XML public ID character.
*/
public static boolean isXMLPublicIdCharacter(char c) {
return (c < 128) &&
(PUBLIC_ID_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Tests whether the given character is a valid XML version character.
*/
public static boolean isXMLVersionCharacter(char c) {
return (c < 128) &&
(VERSION_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Tests whether the given character is a valid aphabetic character.
*/
public static boolean isXMLAlphabeticCharacter(char c) {
return (c < 128) &&
(ALPHABETIC_CHARACTER[c / 32] & (1 << (c % 32))) != 0;
}
/**
* Creates a Reader initialized to scan the characters in the given
* XML document's InputStream.
* @param is The input stream positionned at the beginning of an
* XML document.
* @return a Reader positionned at the beginning of the XML document
* It is created from an encoding figured out from the first
* few bytes of the document. As a consequence the given
* input stream is not positionned anymore at the beginning
* of the document when this method returns.
*/
public static Reader createXMLDocumentReader(InputStream is)
throws IOException {
PushbackInputStream pbis = new PushbackInputStream(is, 128);
byte[] buf = new byte[4];
int len = pbis.read(buf);
if (len > 0) {
pbis.unread(buf, 0, len);
}
if (len == 4) {
switch (buf[0] & 0x00FF) {
case 0:
if (buf[1] == 0x003c && buf[2] == 0x0000 && buf[3] == 0x003f) {
return new InputStreamReader(pbis, "UnicodeBig");
}
break;
case '<':
switch (buf[1] & 0x00FF) {
case 0:
if (buf[2] == 0x003f && buf[3] == 0x0000) {
return new InputStreamReader(pbis, "UnicodeLittle");
}
break;
case '?':
if (buf[2] == 'x' && buf[3] == 'm') {
Reader r = createXMLDeclarationReader(pbis, "UTF8");
String enc = getXMLDeclarationEncoding(r, "UTF8");
return new InputStreamReader(pbis, enc);
}
}
break;
case 0x004C:
if (buf[1] == 0x006f &&
(buf[2] & 0x00FF) == 0x00a7 &&
(buf[3] & 0x00FF) == 0x0094) {
Reader r = createXMLDeclarationReader(pbis, "CP037");
String enc = getXMLDeclarationEncoding(r, "CP037");
return new InputStreamReader(pbis, enc);
}
break;
case 0x00FE:
if ((buf[1] & 0x00FF) == 0x00FF) {
return new InputStreamReader(pbis, "Unicode");
}
break;
case 0x00FF:
if ((buf[1] & 0x00FF) == 0x00FE) {
return new InputStreamReader(pbis, "Unicode");
}
}
}
return new InputStreamReader(pbis, "UTF8");
}
/**
* Creates a reader from the given input stream and encoding.
* This method assumes the input stream working buffer is at least
* 128 byte long. The input stream is restored before this method
* returns. The 4 first bytes are skipped before creating the reader.
*/
protected static Reader createXMLDeclarationReader(PushbackInputStream pbis,
String enc)
throws IOException {
byte[] buf = new byte[128];
int len = pbis.read(buf);
if (len > 0) {
pbis.unread(buf, 0, len);
}
return new InputStreamReader(new ByteArrayInputStream(buf, 4, len), enc);
}
/**
* Reads an XML declaration to get the encoding declaration value.
* @param r a reader positionned just after '<?xm'.
* @param e the encoding to return by default or on error.
*/
protected static String getXMLDeclarationEncoding(Reader r, String e)
throws IOException {
int c;
if ((c = r.read()) != 'l') {
return e;
}
if (!isXMLSpace((char)(c = r.read()))) {
return e;
}
while (isXMLSpace((char)(c = r.read())));
if (c != 'v') {
return e;
}
if ((c = r.read()) != 'e') {
return e;
}
if ((c = r.read()) != 'r') {
return e;
}
if ((c = r.read()) != 's') {
return e;
}
if ((c = r.read()) != 'i') {
return e;
}
if ((c = r.read()) != 'o') {
return e;
}
if ((c = r.read()) != 'n') {
return e;
}
c = r.read();
while (isXMLSpace((char)c)) {
c = r.read();
}
if (c != '=') {
return e;
}
while (isXMLSpace((char)(c = r.read())));
if (c != '"' && c != '\'') {
return e;
}
char sc = (char)c;
for (;;) {
c = r.read();
if (c == sc) {
break;
}
if (!isXMLVersionCharacter((char)c)) {
return e;
}
}
if (!isXMLSpace((char)(c = r.read()))) {
return e;
}
while (isXMLSpace((char)(c = r.read())));
if (c != 'e') {
return e;
}
if ((c = r.read()) != 'n') {
return e;
}
if ((c = r.read()) != 'c') {
return e;
}
if ((c = r.read()) != 'o') {
return e;
}
if ((c = r.read()) != 'd') {
return e;
}
if ((c = r.read()) != 'i') {
return e;
}
if ((c = r.read()) != 'n') {
return e;
}
if ((c = r.read()) != 'g') {
return e;
}
c = r.read();
while (isXMLSpace((char)c)) {
c = r.read();
}
if (c != '=') {
return e;
}
while (isXMLSpace((char)(c = r.read())));
if (c != '"' && c != '\'') {
return e;
}
sc = (char)c;
StringBuffer enc = new StringBuffer();
for (;;) {
c = r.read();
if (c == -1) {
return e;
}
if (c == sc) {
return encodingToJavaEncoding(enc.toString(), e);
}
enc.append((char)c);
}
}
/**
* Converts the given standard encoding representation to the
* corresponding Java encoding string.
* @param e the encoding string to convert.
* @param de the encoding string if no corresponding encoding was found.
*/
public static String encodingToJavaEncoding(String e, String de) {
String result = EncodingUtilities.javaEncoding(e);
return (result == null) ? de : result;
}
}