Committing cleaup5.txt patch in xalanj-2258
diff --git a/src/org/apache/xml/serializer/EncodingInfo.java b/src/org/apache/xml/serializer/EncodingInfo.java
index 72db588..ab21ca4 100644
--- a/src/org/apache/xml/serializer/EncodingInfo.java
+++ b/src/org/apache/xml/serializer/EncodingInfo.java
@@ -51,13 +51,29 @@
* <p>
* This Class is not a public API, and should only be used internally within
* the serializer.
- *
+ * <p>
+ * This class is not a public API.
* @xsl.usage internal
*/
public final class EncodingInfo extends Object
{
/**
+ * Not all characters in an encoding are in one contiguous group,
+ * however there is a lowest contiguous group starting at '\u0001'
+ * and working up to m_highCharInContiguousGroup.
+ * <p>
+ * This is the char for which chars at or below this value are
+ * definitely in the encoding, although for chars
+ * above this point they might be in the encoding.
+ * This exists for performance, especially for ASCII characters
+ * because for ASCII all chars in the range '\u0001' to '\u007F'
+ * are in the encoding.
+ *
+ */
+ private final char m_highCharInContiguousGroup;
+
+ /**
* The ISO encoding name.
*/
final String name;
@@ -79,6 +95,8 @@
* This is not a public API. It returns true if the
* char in question is in the encoding.
* @param ch the char in question.
+ * <p>
+ * This method is not a public API.
* @xsl.usage internal
*/
public boolean isInEncoding(char ch) {
@@ -98,6 +116,8 @@
* character formed by the high/low pair is in the encoding.
* @param high a char that the a high char of a high/low surrogate pair.
* @param low a char that is the low char of a high/low surrogate pair.
+ * <p>
+ * This method is not a public API.
* @xsl.usage internal
*/
public boolean isInEncoding(char high, char low) {
@@ -120,12 +140,16 @@
*
* @param name reference to the ISO name.
* @param javaName reference to the Java encoding name.
+ * @param highChar The char for which characters at or below this value are
+ * definitely in the
+ * encoding, although for characters above this point they might be in the encoding.
*/
- public EncodingInfo(String name, String javaName)
+ public EncodingInfo(String name, String javaName, char highChar)
{
this.name = name;
this.javaName = javaName;
+ this.m_highCharInContiguousGroup = highChar;
}
@@ -503,5 +527,34 @@
}
return isInEncoding;
}
+
+ /**
+ * This method exists for performance reasons.
+ * <p>
+ * Except for '\u0000', if a char is less than or equal to the value
+ * returned by this method then it is in the encoding.
+ * <p>
+ * The characters in an encoding are not contiguous, however
+ * there is a lowest group of chars starting at '\u0001' up to and
+ * including the char returned by this method that are all in the encoding.
+ * So the char returned by this method essentially defines the lowest
+ * contiguous group.
+ * <p>
+ * Chars above the value returned might be in the encoding, but
+ * chars at or below the value returned are definitely in the encoding.
+ * <p>
+ * In any case however, the isInEncoding(char) method can be used
+ * regardless of the value of the char returned by this method.
+ * <p>
+ * If the value returned is '\u0000' it means that every character must be tested
+ * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
+ * for surrogate pairs.
+ * <p>
+ * This method is not a public API.
+ * @xsl.usage internal
+ */
+ public final char getHighChar() {
+ return m_highCharInContiguousGroup;
+ }
}
diff --git a/src/org/apache/xml/serializer/Encodings.java b/src/org/apache/xml/serializer/Encodings.java
index 90d5f6f..47b7ef9 100644
--- a/src/org/apache/xml/serializer/Encodings.java
+++ b/src/org/apache/xml/serializer/Encodings.java
@@ -23,14 +23,11 @@
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
-import java.lang.reflect.Method;
-import java.net.URL;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Properties;
import java.util.StringTokenizer;
+import java.util.Vector;
/**
@@ -56,7 +53,7 @@
* <p>
* This is not a public API.
* @param output The output stream
- * @param encoding The encoding
+ * @param encoding The encoding MIME name, not a Java name for the encoding.
* @return A suitable writer
* @throws UnsupportedEncodingException There is no convertor
* to support this encoding
@@ -72,9 +69,8 @@
{
try
{
- return new OutputStreamWriter(
- output,
- _encodings[i].javaName);
+ OutputStreamWriter osw = new OutputStreamWriter(output,_encodings[i].javaName);
+ return osw;
}
catch (java.lang.IllegalArgumentException iae) // java 1.1.8
{
@@ -100,7 +96,9 @@
/**
* Returns the EncodingInfo object for the specified
- * encoding.
+ * encoding, never null, although the encoding name
+ * inside the returned EncodingInfo object will be null if
+ * we can't find a "real" EncodingInfo for the encoding.
* <p>
* This is not a public API.
*
@@ -119,7 +117,7 @@
ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
if (ei == null) {
// We shouldn't have to do this, but just in case.
- ei = new EncodingInfo(null,null);
+ ei = new EncodingInfo(null,null, '\u0000');
}
return ei;
@@ -262,7 +260,8 @@
* @param encoding non-null reference to encoding string, java style.
*
* @return ISO-style encoding string.
- *
+ * <p>
+ * This method is not a public API.
* @xsl.usage internal
*/
public static String convertMime2JavaEncoding(String encoding)
@@ -311,57 +310,54 @@
}
int totalEntries = props.size();
- int totalMimeNames = 0;
+
+ Vector encodingInfo_list = new Vector();
Enumeration keys = props.keys();
for (int i = 0; i < totalEntries; ++i)
{
String javaName = (String) keys.nextElement();
String val = props.getProperty(javaName);
- totalMimeNames++;
- int pos = val.indexOf(' ');
- for (int j = 0; j < pos; ++j)
- if (val.charAt(j) == ',')
- totalMimeNames++;
- }
- EncodingInfo[] ret = new EncodingInfo[totalMimeNames];
- int j = 0;
- keys = props.keys();
- for (int i = 0; i < totalEntries; ++i)
- {
- String javaName = (String) keys.nextElement();
- String val = props.getProperty(javaName);
- int pos = val.indexOf(' ');
+ int len = lengthOfMimeNames(val);
+
String mimeName;
- if (pos < 0)
+ char highChar;
+ if (len == 0)
{
- // Maybe report/log this problem?
- // "Last printable character not defined for encoding " +
- // mimeName + " (" + val + ")" ...
- mimeName = val;
+ // There is no property value, only the javaName, so try and recover
+ mimeName = javaName;
+ highChar = '\u0000'; // don't know the high code point, will need to test every character
}
else
{
+ try {
+ // Get the substring after the Mime names
+ final String highVal = val.substring(len).trim();
+ highChar = (char) Integer.decode(highVal).intValue();
+ }
+ catch( NumberFormatException e) {
+ highChar = 0;
+ }
+ String mimeNames = val.substring(0, len);
StringTokenizer st =
- new StringTokenizer(val.substring(0, pos), ",");
+ new StringTokenizer(mimeNames, ",");
for (boolean first = true;
st.hasMoreTokens();
first = false)
{
mimeName = st.nextToken();
- ret[j] =
- new EncodingInfo(mimeName, javaName);
- _encodingTableKeyMime.put(
- mimeName.toUpperCase(),
- ret[j]);
+ EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar);
+ encodingInfo_list.add(ei);
+ _encodingTableKeyMime.put(mimeName.toUpperCase(), ei);
if (first)
- _encodingTableKeyJava.put(
- javaName.toUpperCase(),
- ret[j]);
- j++;
+ _encodingTableKeyJava.put(javaName.toUpperCase(), ei);
}
}
}
- return ret;
+ // Convert the Vector of EncodingInfo objects into an array of them,
+ // as that is the kind of thing this method returns.
+ EncodingInfo[] ret_ei = new EncodingInfo[encodingInfo_list.size()];
+ encodingInfo_list.toArray(ret_ei);
+ return ret_ei;
}
catch (java.net.MalformedURLException mue)
{
@@ -372,6 +368,24 @@
throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe);
}
}
+
+ /**
+ * Get the length of the Mime names within the property value.
+ * @param val The value of the property, which should contain a comma
+ * separated list of Mime names, followed optionally by a space and the
+ * high char value
+ * @return the index of the first space in val, or val.length() if there is none
+ */
+ private static int lengthOfMimeNames(String val) {
+ // look for the space preceding the optional high char
+ int len = val.indexOf(' ');
+ // If len is negative it means the optional part is not there, so
+ // the value must be all Mime names, so set the length appropriately
+ if (len < 0)
+ len = val.length();
+
+ return len;
+ }
/**
* Return true if the character is the high member of a surrogate pair.
@@ -421,6 +435,37 @@
int codePoint = ch;
return codePoint;
}
+
+ /**
+ * Characters with values at or below the high code point are
+ * in the encoding. Code point values above this one may or may
+ * not be in the encoding, but lower ones certainly are.
+ * <p>
+ * This is for performance.
+ *
+ * @param encoding The encoding
+ * @return The code point for which characters at or below this code point
+ * are in the encoding. Characters with higher code point may or may not be
+ * in the encoding. A value of zero is returned if the high code point is unknown.
+ * <p>
+ * This method is not a public API.
+ * @xsl.usage internal
+ */
+ static public char getHighChar(String encoding)
+ {
+ final char highCodePoint;
+ EncodingInfo ei;
+
+ String normalizedEncoding = toUpperCaseFast(encoding);
+ ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
+ if (ei == null)
+ ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
+ if (ei != null)
+ highCodePoint = ei.getHighChar();
+ else
+ highCodePoint = 0;
+ return highCodePoint;
+ }
private static final Hashtable _encodingTableKeyJava = new Hashtable();
private static final Hashtable _encodingTableKeyMime = new Hashtable();
diff --git a/src/org/apache/xml/serializer/Encodings.properties b/src/org/apache/xml/serializer/Encodings.properties
index e380f77..011ecea 100644
--- a/src/org/apache/xml/serializer/Encodings.properties
+++ b/src/org/apache/xml/serializer/Encodings.properties
@@ -16,103 +16,243 @@
#
# $Id$
#
+# Each entry in this properties file is:
+# 1) The Java name for the encoding
+# 2) A comma separated list of the MIME names for the encoding,
+# with the first one being the preferred MIME name.
+# 3) An optional high char. Characters at or below this value are
+# definitely in the encoding, but characters above it may or may not be.
+# This value is given only for performance reasons.
+# A value of zero is the same as no value at all.
+#
+# For example this line in this file:
+# ASCII ASCII,US-ASCII 0x007F
+# Means the Java name for the encoding is "ASCII". The MIME names for this
+# encoding which may appear in a stylesheet are "ASCII" or "US-ASCII"
+# and the optional high code point value is given, and it is 0X007F
+# which means that the contiguous block of chars from
+# 0x0001 to 0x007F ( 127 in base 10) are all in the encoding.
+# Higher values above this char might be in the encoding, although in the
+# case of this particular encoding there are no higher chars.
+#
+#
# <JAVA name encoding>, <PREFERRED name MIME>
-# Peter Smolik
-Cp1250 WINDOWS-1250 0x00FF
-# Patch attributed to havardw@underdusken.no (Håvard Wigtil)
-Cp1251 WINDOWS-1251 0x00FF
-Cp1252 WINDOWS-1252 0x00FF
-ISO8859_1 ISO-8859-1 0x00FF
-# Patch attributed to havardw@underdusken.no (Håvard Wigtil)
-ISO8859-1 ISO-8859-1 0x00FF
-ISO8859_2 ISO-8859-2 0x00FF
-# I'm going to apply "ISO8859-X" variant to all these, to be safe.
-ISO8859-2 ISO-8859-2 0x00FF
-ISO8859_3 ISO-8859-3 0x00FF
-ISO8859-3 ISO-8859-3 0x00FF
-ISO8859_4 ISO-8859-4 0x00FF
-ISO8859-4 ISO-8859-4 0x00FF
-ISO8859_5 ISO-8859-5 0x00FF
-ISO8859-5 ISO-8859-5 0x00FF
-ISO8859_6 ISO-8859-6 0x00FF
-ISO8859-6 ISO-8859-6 0x00FF
-ISO8859_7 ISO-8859-7 0x00FF
-ISO8859-7 ISO-8859-7 0x00FF
-ISO8859_8 ISO-8859-8 0x00FF
-ISO8859-8 ISO-8859-8 0x00FF
-ISO8859_9 ISO-8859-9 0x00FF
-ISO8859-9 ISO-8859-9 0x00FF
-ISO8859_10 ISO-8859-10 0x00FF
-ISO8859-10 ISO-8859-10 0x00FF
-ISO8859_11 ISO-8859-11 0x00FF
-ISO8859-11 ISO-8859-11 0x00FF
-ISO8859_12 ISO-8859-12 0x00FF
-ISO8859-12 ISO-8859-12 0x00FF
-ISO8859_13 ISO-8859-13 0x00FF
-ISO8859-13 ISO-8859-13 0x00FF
-ISO8859_14 ISO-8859-14 0x00FF
-ISO8859-14 ISO-8859-14 0x00FF
-ISO8859_15 ISO-8859-15 0x00FF
-ISO8859-15 ISO-8859-15 0x00FF
-# # ?
-8859_1 ISO-8859-1 0x00FF
-8859_2 ISO-8859-2 0x00FF
-8859_3 ISO-8859-3 0x00FF
-8859_4 ISO-8859-4 0x00FF
-8859_5 ISO-8859-5 0x00FF
-8859_6 ISO-8859-6 0x00FF
-8859_7 ISO-8859-7 0x00FF
-8859_8 ISO-8859-8 0x00FF
-8859_9 ISO-8859-9 0x00FF
-8859-1 ISO-8859-1 0x00FF
-8859-2 ISO-8859-2 0x00FF
-8859-3 ISO-8859-3 0x00FF
-8859-4 ISO-8859-4 0x00FF
-8859-5 ISO-8859-5 0x00FF
-8859-6 ISO-8859-6 0x00FF
-8859-7 ISO-8859-7 0x00FF
-8859-8 ISO-8859-8 0x00FF
-8859-9 ISO-8859-9 0x00FF
-JIS ISO-2022-JP 0xFFFF
-ISO2022KR ISO-2022-KR 0xFFFF
-SJIS SHIFT_JIS 0xFFFF
-EUC_JP EUC-JP 0xFFFF
-EUC_KR EUC-KR 0xFFFF
-EUC_CN EUC-CN 0xFFFF
-EUC_TW EUC-TW 0xFFFF
-EUC_CN GB2312 0xFFFF
-EUC-JP EUC-JP 0xFFFF
-EUC-KR EUC-KR 0xFFFF
-EUC-CN EUC-CN 0xFFFF
-EUC-TW EUC-TW 0xFFFF
-EUC-CN GB2312 0xFFFF
-GB2312 GB2312 0xFFFF
-Big5 BIG5 0xFFFF
-EUCJIS EUC-JP 0xFFFF
-KSC5601 EUC-KR 0xFFFF
-KOI8_R KOI8-R 0xFFFF
-Cp037 EBCDIC-CP-US,EBCDIC-CP-CA,EBCDIC-CP-NL 0x00FF
-Cp277 EBCDIC-CP-DK,EBCDIC-CP-NO 0x00FF
-Cp278 EBCDIC-CP-FI,EBCDIC-CP-SE 0x00FF
-Cp280 EBCDIC-CP-IT 0x00FF
-Cp284 EBCDIC-CP-ES 0x00FF
-Cp285 EBCDIC-CP-GB 0x00FF
-Cp297 EBCDIC-CP-FR 0x00FF
-Cp420 EBCDIC-CP-AR1 0x00FF
-Cp424 EBCDIC-CP-HE 0x00FF
-Cp500 EBCDIC-CP-CH 0x00FF
-Cp850 850,csPC850Multilingual 0xFFFF
-Cp860 860,csIBM860 0xFFFF
-Cp870 EBCDIC-CP-ROECE,EBCDIC-CP-YU 0x00FF
-Cp871 EBCDIC-CP-IS 0x00FF
-Cp918 EBCDIC-CP-AR2 0x00FF
-Cp1047 IBM1047,IBM-1047 0x00FF
-MacTEC MacRoman 0x00FF
-ASCII ASCII,US-ASCII 0x007F
-Unicode UNICODE,UTF-16 0xFFFF
-UTF8 UTF-8 0xFFFF
-# patch attributed to Jinsung Lee
-KS_C_5601-1987 KS_C_5601-1987,iso-ir-149,KS_C_5601-1989,KSC_5601,csKSC56011987 0xFFFF
+#
+#
+ASCII ASCII,US-ASCII 0x007F
+#
+# Big5, Traditional Chinese
+Big5 BIG5,csBig5 0x007F
+#Big5 with Hong Kong extensions, Traditional Chinese (incorporating 2001 revision)
+Big5_HKSCS BIG5-HKSCS 0x007F
+# USA, Canada (Bilingual, French), Netherlands, Portugal, Brazil, Australia
+Cp037 EBCDIC-CP-US,EBCDIC-CP-CA,EBCDIC-CP-WT,EBCDIC-CP-NL,IBM037 0x0019
+# IBM Austria, Germany
+Cp273 IBM273,csIBM273 0x0019
+Cp274 csIBM274,EBCDIC-BE
+Cp275 csIBM275,EBCDIC-BR
+# IBM Denmark, Norway
+Cp277 EBCDIC-CP-DK,EBCDIC-CP-NO,IBM277,csIBM277 0x0019
+# IBM Finland, Sweden
+Cp278 EBCDIC-CP-FI,EBCDIC-CP-SE,IBM278,csIBM278 0x0019
+# IBM Italy
+Cp280 EBCDIC-CP-IT,IBM280,csIBM280 0x0019
+Cp281 EBCDIC-JP-E,csIBM281
+# IBM Catalan/Spain, Spanish Latin America
+Cp284 EBCDIC-CP-ES,IBM284,csIBM284 0x0019
+# IBM United Kingdom, Ireland
+Cp285 EBCDIC-CP-GB,IBM285,csIBM285 0x0019
+Cp290 EBCDIC-JP-kana,IBM290,csIBM290 0x0019
+# IBM France
+Cp297 EBCDIC-CP-FR,IBM297,csIBM297 0x0019
+# IBM Arabic
+Cp420 EBCDIC-CP-AR1,IBM420,csIBM420 0x0019
+Cp423 EBCDIC-CP-GR,IBM423,csIBM423
+# IBM Hebrew
+Cp424 EBCDIC-CP-HE,IBM424,csIBM424 0x0019
+Cp437 437,IBM437,csPC8CodePage437 0x007F
+# EBCDIC 500V1
+Cp500 EBCDIC-CP-CH,EBCDIC-CP-BE,IBM500,csIBM500 0x0019
+# PC Baltic
+Cp775 IBM775,csPC775Baltic 0x007F
+# IBM Thailand extended SBCS
+Cp838 IBM-Thai,838,csIBMThai 0x0019
+# MS-DOS Latin-1
+Cp850 850,csPC850Multilingual,IBM850 0x007F
+Cp851 851,IBM851,csIBM851
+# MS-DOS Latin-2
+Cp852 IBM852,852,csPCp852 0x007F
+# IBM Cyrillic
+Cp855 IBM855,855,csIBM855 0x007F
+# IBM Turkish
+Cp857 IBM857,857,csIBM857 0x007F
+# Variant of Cp850 with Euro character
+Cp858 IBM00858 0x007F
+# MS-DOS Portuguese
+Cp860 860,csIBM860,IBM860 0x007F
+# MS-DOS Icelandic
+Cp861 IBM861,861,csIBM861,cp-is 0x007F
+#
+Cp862 IBM862,862,csPC862LatinHebrew 0x007F
+# MS-DOS Canadian French
+Cp863 IBM863,863,csIBM863 0x007F
+# PC Arabic
+Cp864 IBM864,864,csIBM864 0x007F
+# MS-DOS Nordic
+Cp865 IBM865,865,csIBM865 0x007F
+# MS-DOS Russian
+Cp866 IBM866,866,csIBM866 0x007F
+# MS-DOS Pakistan
+Cp868 IBM868,cp-ar,csIBM868 0x007F
+# IBM Modern Greek
+Cp869 IBM869,869,cp-gr,csIBM869 0x007F
+# IBM Multilingual Latin-2
+Cp870 EBCDIC-CP-ROECE,EBCDIC-CP-YU,IBM870,csIBM870 0x0019
+# IBM Iceland
+Cp871 EBCDIC-CP-IS,IBM871,csIBM871 0x0019
+Cp880 EBCDIC-Cyrillic,IBM880,csIBM880
+Cp891 IBM891,csIBM891
+Cp903 IBM903,csIBM903
+Cp904 IBM904,csIBM904
+Cp905 IBM905,csIBM905,EBCDIC-CP-TR
+# IBM Pakistan (Urdu)
+Cp918 EBCDIC-CP-AR2,IBM918,csIBM918 0x0019
+# GBK, Simplified Chinese
+Cp936 GBK,MS936,WINDOWS-936
+# IBM Latin-5, Turkey
+Cp1026 IBM1026,csIBM1026 0x0019
+# Latin-1 character set for EBCDIC hosts
+Cp1047 IBM1047,IBM-1047 0x0019
+# Variant of Cp037 with Euro character
+Cp1140 IBM01140 0x0019
+# Variant of Cp273 with Euro character
+Cp1141 IBM01141 0x0019
+# Variant of Cp277 with Euro character
+Cp1142 IBM01142 0x0019
+# Variant of Cp278 with Euro character
+Cp1143 IBM01143 0x0019
+# Variant of Cp280 with Euro character
+Cp1144 IBM01144 0x0019
+# Variant of Cp284 with Euro character
+Cp1145 IBM01145 0x0019
+# Variant of Cp285 with Euro character
+Cp1146 IBM01146 0x0019
+# Variant of Cp297 with Euro character
+Cp1147 IBM01147 0x0019
+# Variant of Cp500 with Euro character
+Cp1148 IBM01148 0x0019
+# Variant of Cp871 with Euro character
+Cp1149 IBM01149 0x0019
+Cp1250 WINDOWS-1250 0x007F
+Cp1251 WINDOWS-1251 0x007F
+Cp1252 WINDOWS-1252 0x007F
+Cp1253 WINDOWS-1253 0x007F
+Cp1254 WINDOWS-1254 0x007F
+# Windows Hebrew
+Cp1255 WINDOWS-1255 0x007F
+# Windows Arabic
+Cp1256 WINDOWS-1256 0x007F
+Cp1257 WINDOWS-1257 0x007F
+# Windows Vietnamese
+Cp1258 WINDOWS-1258 0x007F
+EUC-CN EUC-CN 0x007F
+EUC_CN EUC-CN 0x007F
+#
+#JISX 0201, 0208 and 0212, EUC encoding Japanese
+EUC-JP EUC-JP 0x007F
+EUC_JP EUC-JP 0x007F
+# KS C 5601, EUC encoding, Korean
+EUC-KR EUC-KR 0x007F
+EUC_KR EUC-KR 0x007F
+# CNS11643 (Plane 1-7,15), EUC encoding, Traditional Chinese
+EUC-TW EUC-TW 0x007F
+EUC_TW EUC-TW,x-EUC-TW 0x007F
+EUCJIS EUC-JP 0x007F
+#
+# GB2312, EUC encoding, Simplified Chinese
+GB2312 GB2312 0x007F
+
+# GB2312 and CNS11643 in ISO 2022 CN form, Simplified and Traditional Chinese (conversion to Unicode only)
+ISO2022CN ISO-2022-CN
+# JIS X 0201, 0208, in ISO 2022 form, Japanese
+ISO2022JP ISO-2022-JP
+# ISO 2022 KR, Korean
+ISO2022KR ISO-2022-KR 0x007F
+#
+#
+ISO8859-1 ISO-8859-1 0x00FF
+ISO8859_1 ISO-8859-1 0x00FF
+8859-1 ISO-8859-1 0x00FF
+8859_1 ISO-8859-1 0x00FF
+#
+ISO8859-2 ISO-8859-2 0x00A0
+ISO8859_2 ISO-8859-2 0x00A0
+8859-2 ISO-8859-2 0x00A0
+8859_2 ISO-8859-2 0x00A0
+#
+# Latin Alphabet No. 3
+ISO8859-3 ISO-8859-3 0x00A0
+ISO8859_3 ISO-8859-3 0x00A0
+8859-3 ISO-8859-3 0x00A0
+8859_3 ISO-8859-3 0x00A0
+#
+ISO8859-4 ISO-8859-4 0x00A0
+ISO8859_4 ISO-8859-4 0x00A0
+8859-4 ISO-8859-4 0x00A0
+8859_4 ISO-8859-4 0x00A0
+#
+ISO8859-5 ISO-8859-5 0x00A0
+ISO8859_5 ISO-8859-5 0x00A0
+8859-5 ISO-8859-5 0x00A0
+8859_5 ISO-8859-5 0x00A0
+#
+# Latin/Arabic Alphabet
+ISO8859-6 ISO-8859-6 0x00A0
+ISO8859_6 ISO-8859-6 0x00A0
+8859-6 ISO-8859-6 0x00A0
+8859_6 ISO-8859-6 0x00A0
+#
+ISO8859-7 ISO-8859-7 0x00A0
+ISO8859_7 ISO-8859-7 0x00A0
+8859-7 ISO-8859-7 0x00A0
+8859_7 ISO-8859-7 0x00A0
+#
+ISO8859-8 ISO-8859-8 0x00A0
+ISO8859_8 ISO-8859-8 0x00A0
+8859-8 ISO-8859-8 0x00A0
+8859_8 ISO-8859-8 0x00A0
+#
+ISO8859-9 ISO-8859-9 0x00CF
+ISO8859_9 ISO-8859-9 0x00CF
+8859-9 ISO-8859-9 0x00CF
+8859_9 ISO-8859-9 0x00CF
+#
+ISO8859-10 ISO-8859-10 0x007E
+ISO8859_10 ISO-8859-10 0x007E
+ISO8859-11 ISO-8859-11 0x007E
+ISO8859_11 ISO-8859-11 0x007E
+ISO8859-12 ISO-8859-12 0x007F
+ISO8859_12 ISO-8859-12 0x007F
+ISO8859-13 ISO-8859-13 0x00A0
+ISO8859_13 ISO-8859-13 0x00A0
+ISO8859-14 ISO-8859-14 0x007E
+ISO8859_14 ISO-8859-14 0x007E
+ISO8859-15 ISO-8859-15 0x00A3
+ISO8859_15 ISO-8859-15 0x00A3
+JIS ISO-2022-JP 0x007F
+KOI8_R KOI8-R 0x007F
+KSC5601 EUC-KR 0x007F
+KS_C_5601-1987 KS_C_5601-1987,iso-ir-149,KS_C_5601-1989,KSC_5601,csKSC56011987 0x007F
+MacTEC MacRoman
+# Windows Japanese
+MS932 windows-31j
+# Shift-JIS, Japanese
+SJIS SHIFT_JIS 0x007F
+# TIS620, Thai
+TIS620 TIS-620
+UTF8 UTF-8 0xD7FF
+Unicode UNICODE,UTF-16 0xFFFF
+
# note that more character set names and their aliases
# can be found at http://www.iana.org/assignments/character-sets
diff --git a/src/org/apache/xml/serializer/SerializerBase.java b/src/org/apache/xml/serializer/SerializerBase.java
index d64af88..dce518c 100644
--- a/src/org/apache/xml/serializer/SerializerBase.java
+++ b/src/org/apache/xml/serializer/SerializerBase.java
@@ -19,7 +19,6 @@
package org.apache.xml.serializer;
import java.io.IOException;
-import java.util.Vector;
import javax.xml.transform.SourceLocator;
import javax.xml.transform.Transformer;
@@ -156,7 +155,7 @@
* The character encoding. Must match the encoding used for the
* printWriter.
*/
- private String m_encoding = null;
+ String m_encoding = null;
/**
* Tells if we should write the XML declaration.
@@ -564,9 +563,9 @@
* Sets the character encoding coming from the xsl:output encoding stylesheet attribute.
* @param m_encoding the character encoding
*/
- public void setEncoding(String m_encoding)
+ public void setEncoding(String encoding)
{
- this.m_encoding = m_encoding;
+ this.m_encoding = encoding;
}
/**
@@ -1469,40 +1468,20 @@
if (null != m_StringOfCDATASections)
{
- String localName = m_elemContext.m_elementLocalName;
- if (localName == null)
+ if (m_elemContext.m_elementLocalName == null)
{
- localName = getLocalName(m_elemContext.m_elementName);
+ String localName = getLocalName(m_elemContext.m_elementName);
m_elemContext.m_elementLocalName = localName;
}
- String uri = m_elemContext.m_elementURI;
- if ( uri == null)
- {
- String prefix = getPrefixPart(m_elemContext.m_elementName);
- if (prefix != null) {
- uri = m_prefixMap.lookupNamespace(prefix);
- if (uri != null)
- m_elemContext.m_elementURI = uri;
- else
- uri = "";
- }
- else {
- // no prefix so lookup the URI of the default namespace
- uri = m_prefixMap.lookupNamespace("");
- if (uri == null) // If no URI then the empty string also means no URI
- uri = "";
- }
- }
- else {
- if (m_elemContext.m_elementURI.length() == 0)
- m_elemContext.m_elementURI = null;
- }
+ if (m_elemContext.m_elementURI == null)
+ m_elemContext.m_elementURI = getElementURI();
+
java.util.Hashtable h = (java.util.Hashtable) m_CdataElems.get(m_elemContext.m_elementLocalName);
if (h != null)
{
- Object obj = h.get(uri);
+ Object obj = h.get(m_elemContext.m_elementURI);
if (obj != null)
b = true;
}
@@ -1510,5 +1489,39 @@
}
return b;
}
-}
+
+ /**
+ * Before this call m_elemContext.m_elementURI is null,
+ * which means it is not yet known. After this call it
+ * is non-null, but possibly "" meaning that it is in the
+ * default namespace.
+ *
+ * @return The URI of the element, never null, but possibly "".
+ */
+ private String getElementURI() {
+ String uri = null;
+ // At this point in processing we have received all the
+ // namespace mappings
+ // As we still don't know the element's namespace,
+ // we now figure it out.
+ String prefix = getPrefixPart(m_elemContext.m_elementName);
+
+ if (prefix == null) {
+ // no prefix so lookup the URI of the default namespace
+ uri = m_prefixMap.lookupNamespace("");
+ } else {
+ uri = m_prefixMap.lookupNamespace(prefix);
+ }
+ if (uri == null) {
+ // We didn't find the namespace for the
+ // prefix ... ouch, that shouldn't happen.
+ // This is a hack, we really don't know
+ // the namespace
+ uri = EMPTYSTRING;
+ }
+
+ return uri;
+ }
+}
+
diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java
index ccf193c..133eaa6 100644
--- a/src/org/apache/xml/serializer/ToStream.java
+++ b/src/org/apache/xml/serializer/ToStream.java
@@ -22,6 +22,7 @@
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
+import java.util.EmptyStackException;
import java.util.Enumeration;
import java.util.Properties;
import java.util.StringTokenizer;
@@ -40,8 +41,6 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-//import com.sun.media.sound.IESecurity;
-
/**
* This abstract class is a base class for other stream
* serializers (xml, html, text ...) that write output to a stream.
@@ -69,7 +68,7 @@
* single chars or surrogate pairs of high/low chars form
* characters in the output encoding.
*/
- EncodingInfo m_encodingInfo = new EncodingInfo(null,null);
+ EncodingInfo m_encodingInfo = new EncodingInfo(null,null, '\u0000');
/**
* Stack to keep track of whether or not we need to
@@ -422,9 +421,9 @@
// characters are written to the output writer.
if (m_tracer != null
&& !(writer instanceof SerializerTraceWriter) )
- m_writer = new SerializerTraceWriter(writer, m_tracer);
+ setWriterInternal(new SerializerTraceWriter(writer, m_tracer), false);
else
- m_writer = writer;
+ setWriterInternal(writer, false);
if (m_format == null)
m_format = new java.util.Properties();
@@ -499,16 +498,16 @@
}
// initCharsMap();
- String encoding = getEncoding();
- if (null == encoding)
- {
- encoding =
- Encodings.getMimeEncoding(
- format.getProperty(OutputKeys.ENCODING));
- setEncoding(encoding);
+ String previous_encoding = getEncoding();
+ String possible_encoding =
+ Encodings.getMimeEncoding(format.getProperty(OutputKeys.ENCODING));
+ if (previous_encoding == null || defaultProperties == false) {
+ // Only set the encoding if there was no previous encoding, or if we are
+ // setting a value that is not a default value, because we don't
+ // want to stomp on a previously set non-default one with the default one.
+ setEncoding(possible_encoding);
}
- m_isUTF8 = encoding.equals(Encodings.DEFAULT_MIME_ENCODING);
// Access this only from the Hashtable level... we don't want to
// get default properties.
@@ -537,7 +536,7 @@
w2 = ((WriterChain)w2).getWriter();
}
if (noTracerYet)
- m_writer = new SerializerTraceWriter(m_writer, m_tracer);
+ setWriterInternal(new SerializerTraceWriter(m_writer, m_tracer), false);
}
}
@@ -571,21 +570,24 @@
throws UnsupportedEncodingException
{
- String encoding = getEncoding();
- if (encoding == null)
- {
- // if not already set then get it from the properties
- encoding =
- Encodings.getMimeEncoding(
- format.getProperty(OutputKeys.ENCODING));
- setEncoding(encoding);
+ // Get the encoding in the format Properties, or UTF-8 if none in the format
+ String previous_encoding = getEncoding();
+ String possible_encoding = Encodings.getMimeEncoding(format.getProperty(OutputKeys.ENCODING));
+ if (previous_encoding == null || defaultProperties == false ) {
+ // Let's not stomp on an encoding that was already set, with one that is only coming from
+ // a default set of properties. So only do this setting of the encoding if either there
+ // was no previously set encoding, or if this is not a default value for the encoding
+ setEncoding(possible_encoding);
}
-
- if (encoding.equalsIgnoreCase("UTF-8"))
+
+ // When all is said and done encoding may be possible_encoding, or
+ // if there was a problem with that one the encoding will be unchanged, so
+ // just get what it is.
+ String encoding = getEncoding();
+
+
+ if (Encodings.DEFAULT_MIME_ENCODING.equalsIgnoreCase(encoding))
{
- m_isUTF8 = true;
-
-
init(
new WriterToUTF8Buffered(output),
format,
@@ -595,22 +597,30 @@
}
else if (
- encoding.equals("WINDOWS-1250")
- || encoding.equals("US-ASCII")
- || encoding.equals("ASCII"))
+ "WINDOWS-1250".equals(encoding)
+ || "US-ASCII".equals(encoding)
+ || "ASCII".equals(encoding))
{
init(new WriterToASCI(output), format, defaultProperties, true);
}
else
{
- Writer osw;
+ Writer osw = null;
- try
- {
- osw = Encodings.getWriter(output, encoding);
+ if (encoding == null)
+ encoding = possible_encoding;
+ else {
+ try
+ {
+ osw = Encodings.getWriter(output, encoding);
+ }
+ catch (UnsupportedEncodingException uee)
+ {
+ osw = null;
+ }
}
- catch (UnsupportedEncodingException uee)
- {
+
+ if (osw == null) {
System.out.println(
"Warning: encoding \""
+ encoding
@@ -651,9 +661,16 @@
// characters are written to the output writer.
if (m_tracer != null
&& !(writer instanceof SerializerTraceWriter) )
- m_writer = new SerializerTraceWriter(writer, m_tracer);
+ setWriterInternal(new SerializerTraceWriter(writer, m_tracer), true);
else
- m_writer = writer;
+ setWriterInternal(writer, true);
+ }
+
+ private boolean m_writer_set_by_user;
+ private void setWriterInternal(Writer writer, boolean setByUser) {
+ if (setByUser)
+ m_writer_set_by_user = true;
+ m_writer = writer;
}
/**
@@ -2977,7 +2994,7 @@
super.setTransformer(transformer);
if (m_tracer != null
&& !(m_writer instanceof SerializerTraceWriter) )
- m_writer = new SerializerTraceWriter(m_writer, m_tracer);
+ setWriterInternal(new SerializerTraceWriter(m_writer, m_tracer), false);
}
@@ -3031,7 +3048,8 @@
this.m_lineSepUse = true;
// DON'T SET THE WRITER TO NULL, IT MAY BE REUSED !!
// this.m_writer = null;
- this.m_expandDTDEntities = true;
+ this.m_expandDTDEntities = true;
+ this.m_writer_set_by_user = false;
}
@@ -3041,34 +3059,58 @@
*/
public void setEncoding(String encoding)
{
- String old = getEncoding();
- super.setEncoding(encoding);
+ final String old = getEncoding();
if (old == null || !old.equals(encoding)) {
- // If we have changed the setting of the
- m_encodingInfo = Encodings.getEncodingInfo(encoding);
+ // We are trying to change the setting of the encoding to a different value
+ // from what it was
- if (encoding != null && m_encodingInfo.name == null) {
+ EncodingInfo encodingInfo = Encodings.getEncodingInfo(encoding);
+ if (encoding != null && encodingInfo.name == null) {
// We tried to get an EncodingInfo for Object for the given
// encoding, but it came back with an internall null name
// so the encoding is not supported by the JDK, issue a message.
- String msg = Utils.messages.createMessage(
+ final String msg = Utils.messages.createMessage(
MsgKey.ER_ENCODING_NOT_SUPPORTED,new Object[]{ encoding });
+
+ final String msg2 =
+ "Warning: encoding \"" + encoding + "\" not supported, using "
+ + Encodings.DEFAULT_MIME_ENCODING;
try
{
// Prepare to issue the warning message
- Transformer tran = super.getTransformer();
+ final Transformer tran = super.getTransformer();
if (tran != null) {
- ErrorListener errHandler = tran.getErrorListener();
+ final ErrorListener errHandler = tran.getErrorListener();
// Issue the warning message
- if (null != errHandler && m_sourceLocator != null)
+ if (null != errHandler && m_sourceLocator != null) {
errHandler.warning(new TransformerException(msg, m_sourceLocator));
- else
+ errHandler.warning(new TransformerException(msg2, m_sourceLocator));
+ }
+ else {
System.out.println(msg);
+ System.out.println(msg2);
+ }
}
- else
+ else {
System.out.println(msg);
+ System.out.println(msg2);
+ }
}
catch (Exception e){}
+
+ // We said we are using UTF-8, so use it
+ encoding = Encodings.DEFAULT_MIME_ENCODING;
+ encodingInfo = Encodings.getEncodingInfo(encoding);
+ //if (m_format != null)
+ // m_format.setProperty(OutputKeys.ENCODING,Encodings.DEFAULT_MIME_ENCODING);
+ } else {
+
+ // Either the encoding was good, or it was forced into UTF-8.
+ // In any case we remember it for later.
+ m_encodingInfo = encodingInfo;
+ this.m_encoding = encoding;
+ if (encoding != null)
+ m_isUTF8 = encoding.equals(Encodings.DEFAULT_MIME_ENCODING);
}
}
return;