Committing cleaup5.txt patch in xalanj-2258

commit: 382dacd8684b81dfd45915be7544254651b17788 [log] [tgz]
author: Brian James Minchau <minchau@apache.org> Sat Jan 28 17:10:42 2006 +0000
committer: Brian James Minchau <minchau@apache.org> Sat Jan 28 17:10:42 2006 +0000
tree: 0d283d2b876007372842f49af5de66a67c26bee1
parent: 9fdfb02ecddb27649edc3ceba1f7da2d6ebbf50b [diff]
diff --git a/src/org/apache/xml/serializer/EncodingInfo.java b/src/org/apache/xml/serializer/EncodingInfo.java
index 72db588..ab21ca4 100644
--- a/src/org/apache/xml/serializer/EncodingInfo.java
+++ b/src/org/apache/xml/serializer/EncodingInfo.java

@@ -51,13 +51,29 @@
  * <p>
  * This Class is not a public API, and should only be used internally within
  * the serializer.
- * 
+ * <p>
+ * This class is not a public API.
  * @xsl.usage internal
  */
 public final class EncodingInfo extends Object
 {
 
     /**
+     * Not all characters in an encoding are in one contiguous group,
+     * however there is a lowest contiguous group starting at '\u0001'
+     * and working up to m_highCharInContiguousGroup.
+     * <p>
+     * This is the char for which chars at or below this value are 
+     * definitely in the encoding, although for chars
+     * above this point they might be in the encoding.
+     * This exists for performance, especially for ASCII characters
+     * because for ASCII all chars in the range '\u0001' to '\u007F' 
+     * are in the encoding.
+     * 
+     */
+    private final char m_highCharInContiguousGroup;
+
+    /**
      * The ISO encoding name.
      */
     final String name;
@@ -79,6 +95,8 @@
      * This is not a public API. It returns true if the
      * char in question is in the encoding.
      * @param ch the char in question.
+     * <p>
+     * This method is not a public API.
      * @xsl.usage internal
      */
     public boolean isInEncoding(char ch) {
@@ -98,6 +116,8 @@
      * character formed by the high/low pair is in the encoding.
      * @param high a char that the a high char of a high/low surrogate pair.
      * @param low a char that is the low char of a high/low surrogate pair.
+     * <p>
+     * This method is not a public API.
      * @xsl.usage internal
      */
     public boolean isInEncoding(char high, char low) {
@@ -120,12 +140,16 @@
      *
      * @param name reference to the ISO name.
      * @param javaName reference to the Java encoding name.
+     * @param highChar The char for which characters at or below this value are 
+     * definitely in the
+     * encoding, although for characters above this point they might be in the encoding.
      */
-    public EncodingInfo(String name, String javaName)
+    public EncodingInfo(String name, String javaName, char highChar)
     {
 
         this.name = name;
         this.javaName = javaName;
+        this.m_highCharInContiguousGroup = highChar;
     }
     
     
@@ -503,5 +527,34 @@
         }
         return isInEncoding;
     }
+    
+    /**
+     * This method exists for performance reasons.
+     * <p>
+     * Except for '\u0000', if a char is less than or equal to the value
+     * returned by this method then it is in the encoding.
+     * <p>
+     * The characters in an encoding are not contiguous, however
+     * there is a lowest group of chars starting at '\u0001' up to and
+     * including the char returned by this method that are all in the encoding.
+     * So the char returned by this method essentially defines the lowest
+     * contiguous group.
+     * <p>
+     * chars above the value returned might be in the encoding, but 
+     * chars at or below the value returned are definitely in the encoding.
+     * <p>
+     * In any case however, the isInEncoding(char) method can be used
+     * regardless of the value of the char returned by this method.
+     * <p>
+     * If the value returned is '\u0000' it means that every character must be tested
+     * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)} 
+     * for surrogate pairs.
+     * <p>
+     * This method is not a public API.
+     * @xsl.usage internal
+     */
+    public final char getHighChar() {
+        return m_highCharInContiguousGroup;
+    }
 
 }

diff --git a/src/org/apache/xml/serializer/Encodings.java b/src/org/apache/xml/serializer/Encodings.java
index 90d5f6f..47b7ef9 100644
--- a/src/org/apache/xml/serializer/Encodings.java
+++ b/src/org/apache/xml/serializer/Encodings.java

@@ -23,14 +23,11 @@
 import java.io.OutputStreamWriter;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
-import java.lang.reflect.Method;
-import java.net.URL;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.Properties;
 import java.util.StringTokenizer;
+import java.util.Vector;
 
 
 /**
@@ -56,7 +53,7 @@
      * <p>
      * This is not a public API.
      * @param output The output stream
-     * @param encoding The encoding
+     * @param encoding The encoding MIME name, not a Java name for the encoding.
      * @return A suitable writer
      * @throws UnsupportedEncodingException There is no convertor
      *  to support this encoding
@@ -72,9 +69,8 @@
             {
                 try
                 {
-                    return new OutputStreamWriter(
-                        output,
-                        _encodings[i].javaName);
+                	OutputStreamWriter osw = new OutputStreamWriter(output,_encodings[i].javaName);
+                    return osw; 
                 }
                 catch (java.lang.IllegalArgumentException iae) // java 1.1.8
                 {
@@ -100,7 +96,9 @@
 
     /**
      * Returns the EncodingInfo object for the specified
-     * encoding.
+     * encoding, never null, although the encoding name 
+     * inside the returned EncodingInfo object will be null if
+     * we can't find a "real" EncodingInfo for the encoding.
      * <p>
      * This is not a public API.
      *
@@ -119,7 +117,7 @@
             ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
         if (ei == null) {
             // We shouldn't have to do this, but just in case.
-            ei = new EncodingInfo(null,null);
+            ei = new EncodingInfo(null,null, '\u0000');
         }
 
         return ei;
@@ -262,7 +260,8 @@
      * @param encoding non-null reference to encoding string, java style.
      *
      * @return ISO-style encoding string.
-     *
+     * <p>
+     * This method is not a public API.
      * @xsl.usage internal
      */
     public static String convertMime2JavaEncoding(String encoding)
@@ -311,57 +310,54 @@
             }
 
             int totalEntries = props.size();
-            int totalMimeNames = 0;
+
+            Vector encodingInfo_list = new Vector();
             Enumeration keys = props.keys();
             for (int i = 0; i < totalEntries; ++i)
             {
                 String javaName = (String) keys.nextElement();
                 String val = props.getProperty(javaName);
-                totalMimeNames++;
-                int pos = val.indexOf(' ');
-                for (int j = 0; j < pos; ++j)
-                    if (val.charAt(j) == ',')
-                        totalMimeNames++;
-            }
-            EncodingInfo[] ret = new EncodingInfo[totalMimeNames];
-            int j = 0;
-            keys = props.keys();
-            for (int i = 0; i < totalEntries; ++i)
-            {
-                String javaName = (String) keys.nextElement();
-                String val = props.getProperty(javaName);
-                int pos = val.indexOf(' ');
+                int len = lengthOfMimeNames(val);
+
                 String mimeName;
-                if (pos < 0)
+                char highChar;
+                if (len == 0)
                 {
-                    // Maybe report/log this problem?
-                    //  "Last printable character not defined for encoding " +
-                    //  mimeName + " (" + val + ")" ...
-                    mimeName = val;
+                    // There is no property value, only the javaName, so try and recover
+                    mimeName = javaName;
+                    highChar = '\u0000'; // don't know the high code point, will need to test every character
                 }
                 else
                 {
+                    try {
+                        // Get the substring after the Mime names
+                        final String highVal = val.substring(len).trim();
+                        highChar = (char) Integer.decode(highVal).intValue();
+                    }
+                    catch( NumberFormatException e) {
+                        highChar = 0;
+                    }
+                    String mimeNames = val.substring(0, len);
                     StringTokenizer st =
-                        new StringTokenizer(val.substring(0, pos), ",");
+                        new StringTokenizer(mimeNames, ",");
                     for (boolean first = true;
                         st.hasMoreTokens();
                         first = false)
                     {
                         mimeName = st.nextToken();
-                        ret[j] =
-                            new EncodingInfo(mimeName, javaName);
-                        _encodingTableKeyMime.put(
-                            mimeName.toUpperCase(),
-                            ret[j]);
+                        EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar);
+                        encodingInfo_list.add(ei);
+                        _encodingTableKeyMime.put(mimeName.toUpperCase(), ei);
                         if (first)
-                            _encodingTableKeyJava.put(
-                                javaName.toUpperCase(),
-                                ret[j]);
-                        j++;
+                            _encodingTableKeyJava.put(javaName.toUpperCase(), ei);
                     }
                 }
             }
-            return ret;
+            // Convert the Vector of EncodingInfo objects into an array of them,
+            // as that is the kind of thing this method returns.
+            EncodingInfo[] ret_ei = new EncodingInfo[encodingInfo_list.size()];
+            encodingInfo_list.toArray(ret_ei);
+            return ret_ei;
         }
         catch (java.net.MalformedURLException mue)
         {
@@ -372,6 +368,24 @@
             throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe);
         }
     }
+    
+    /**
+     * Get the length of the Mime names within the property value
+     * @param val The value of the property, which should contain a comma
+     * separated list of Mime names, followed optionally by a space and the
+     * high char value
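+     * @return The length of the leading Mime names portion of val (the index of the first space, or the whole length if there is none).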
+     */
+    private static int lengthOfMimeNames(String val) {
+        // look for the space preceding the optional high char
+        int len = val.indexOf(' ');
+        // If len is negative it means the optional part is not there, so
+        // the value must be all Mime names, so set the length appropriately
+        if (len < 0)  
+            len = val.length();
+        
+        return len;
+    }
 
     /**
      * Return true if the character is the high member of a surrogate pair.
@@ -421,6 +435,37 @@
         int codePoint = ch;
         return codePoint;
     }
+    
+    /**
+     * Characters with values at or below the high code point are
+     * in the encoding. Code point values above this one may or may
+     * not be in the encoding, but lower ones certainly are.
+     * <p>
+     * This is for performance.
+     *
+     * @param encoding The encoding
+     * @return The code point for which characters at or below this code point
+     * are in the encoding. Characters with higher code point may or may not be
+     * in the encoding. A value of zero is returned if the high code point is unknown.
+     * <p>
+     * This method is not a public API.
+     * @xsl.usage internal
+     */
+    static public char getHighChar(String encoding)
+    {
+        final char highCodePoint;
+        EncodingInfo ei;
+
+        String normalizedEncoding = toUpperCaseFast(encoding);
+        ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
+        if (ei == null)
+            ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
+        if (ei != null)
+            highCodePoint =  ei.getHighChar();
+        else
+            highCodePoint = 0;
+        return highCodePoint;
+    }
 
     private static final Hashtable _encodingTableKeyJava = new Hashtable();
     private static final Hashtable _encodingTableKeyMime = new Hashtable();

diff --git a/src/org/apache/xml/serializer/Encodings.properties b/src/org/apache/xml/serializer/Encodings.properties
index e380f77..011ecea 100644
--- a/src/org/apache/xml/serializer/Encodings.properties
+++ b/src/org/apache/xml/serializer/Encodings.properties

@@ -16,103 +16,243 @@
 #
 # $Id$
 #
+# Each entry in this properties file is:
+# 1) The Java name for the encoding
+# 2) A comma separated list of the MIME names for the encoding,
+#    with the first one being the preferred MIME name.
+# 3) An optional high char. Characters at or below this value are
+#    definitely in the encoding, but characters above it may or may not be.
+#    This value is given only for performance reasons.
+#    A value of zero is the same as no value at all.
+#
+# For example this line in this file:
+#              ASCII ASCII,US-ASCII 0x007F
+# Means the Java name for the encoding is "ASCII". The MIME names for this
+# encoding which may appear in a stylesheet are "ASCII" or "US-ASCII"
+# and the optional high code point value is given, and it is 0X007F
+# which means that the contiguous block of chars from
+# 0x0001 to 0x007F ( 127 in base 10) are all in the encoding.
+# Higher values above this char might be in the encoding, although in the 
+# case of this particular encoding there are no higher chars.
+#
+#
 # <JAVA name encoding>, <PREFERRED name MIME>
-# Peter Smolik
-Cp1250 WINDOWS-1250 0x00FF
-# Patch attributed to havardw@underdusken.no (Håvard Wigtil)
-Cp1251 WINDOWS-1251 0x00FF
-Cp1252 WINDOWS-1252 0x00FF
-ISO8859_1 ISO-8859-1 0x00FF
-# Patch attributed to havardw@underdusken.no (Håvard Wigtil)
-ISO8859-1 ISO-8859-1 0x00FF
-ISO8859_2 ISO-8859-2 0x00FF
-# I'm going to apply "ISO8859-X" variant to all these, to be safe.
-ISO8859-2 ISO-8859-2 0x00FF
-ISO8859_3 ISO-8859-3 0x00FF
-ISO8859-3 ISO-8859-3 0x00FF
-ISO8859_4 ISO-8859-4 0x00FF
-ISO8859-4 ISO-8859-4 0x00FF
-ISO8859_5 ISO-8859-5 0x00FF
-ISO8859-5 ISO-8859-5 0x00FF
-ISO8859_6 ISO-8859-6 0x00FF
-ISO8859-6 ISO-8859-6 0x00FF
-ISO8859_7 ISO-8859-7 0x00FF
-ISO8859-7 ISO-8859-7 0x00FF
-ISO8859_8 ISO-8859-8 0x00FF
-ISO8859-8 ISO-8859-8 0x00FF
-ISO8859_9 ISO-8859-9 0x00FF
-ISO8859-9 ISO-8859-9 0x00FF
-ISO8859_10 ISO-8859-10 0x00FF
-ISO8859-10 ISO-8859-10 0x00FF
-ISO8859_11 ISO-8859-11 0x00FF
-ISO8859-11 ISO-8859-11 0x00FF
-ISO8859_12 ISO-8859-12 0x00FF
-ISO8859-12 ISO-8859-12 0x00FF
-ISO8859_13 ISO-8859-13 0x00FF
-ISO8859-13 ISO-8859-13 0x00FF
-ISO8859_14 ISO-8859-14 0x00FF
-ISO8859-14 ISO-8859-14 0x00FF
-ISO8859_15 ISO-8859-15 0x00FF
-ISO8859-15 ISO-8859-15 0x00FF
-# # ?
-8859_1 ISO-8859-1 0x00FF
-8859_2 ISO-8859-2 0x00FF
-8859_3 ISO-8859-3 0x00FF
-8859_4 ISO-8859-4 0x00FF
-8859_5 ISO-8859-5 0x00FF
-8859_6 ISO-8859-6 0x00FF
-8859_7 ISO-8859-7 0x00FF
-8859_8 ISO-8859-8 0x00FF
-8859_9 ISO-8859-9 0x00FF
-8859-1 ISO-8859-1 0x00FF
-8859-2 ISO-8859-2 0x00FF
-8859-3 ISO-8859-3 0x00FF
-8859-4 ISO-8859-4 0x00FF
-8859-5 ISO-8859-5 0x00FF
-8859-6 ISO-8859-6 0x00FF
-8859-7 ISO-8859-7 0x00FF
-8859-8 ISO-8859-8 0x00FF
-8859-9 ISO-8859-9 0x00FF
-JIS ISO-2022-JP 0xFFFF
-ISO2022KR ISO-2022-KR 0xFFFF
-SJIS SHIFT_JIS 0xFFFF
-EUC_JP EUC-JP 0xFFFF
-EUC_KR EUC-KR 0xFFFF
-EUC_CN EUC-CN 0xFFFF
-EUC_TW EUC-TW 0xFFFF
-EUC_CN GB2312 0xFFFF
-EUC-JP EUC-JP 0xFFFF
-EUC-KR EUC-KR 0xFFFF
-EUC-CN EUC-CN 0xFFFF
-EUC-TW EUC-TW 0xFFFF
-EUC-CN GB2312 0xFFFF
-GB2312 GB2312 0xFFFF
-Big5 BIG5 0xFFFF
-EUCJIS EUC-JP 0xFFFF
-KSC5601 EUC-KR 0xFFFF
-KOI8_R KOI8-R 0xFFFF
-Cp037 EBCDIC-CP-US,EBCDIC-CP-CA,EBCDIC-CP-NL 0x00FF
-Cp277 EBCDIC-CP-DK,EBCDIC-CP-NO 0x00FF
-Cp278 EBCDIC-CP-FI,EBCDIC-CP-SE 0x00FF
-Cp280 EBCDIC-CP-IT 0x00FF
-Cp284 EBCDIC-CP-ES 0x00FF
-Cp285 EBCDIC-CP-GB 0x00FF
-Cp297 EBCDIC-CP-FR 0x00FF
-Cp420 EBCDIC-CP-AR1 0x00FF
-Cp424 EBCDIC-CP-HE 0x00FF
-Cp500 EBCDIC-CP-CH 0x00FF
-Cp850 850,csPC850Multilingual 0xFFFF
-Cp860 860,csIBM860 0xFFFF
-Cp870 EBCDIC-CP-ROECE,EBCDIC-CP-YU 0x00FF
-Cp871 EBCDIC-CP-IS 0x00FF
-Cp918 EBCDIC-CP-AR2 0x00FF
-Cp1047 IBM1047,IBM-1047 0x00FF
-MacTEC MacRoman 0x00FF
-ASCII ASCII,US-ASCII 0x007F
-Unicode UNICODE,UTF-16 0xFFFF
-UTF8 UTF-8 0xFFFF
-# patch attributed to Jinsung Lee
-KS_C_5601-1987 KS_C_5601-1987,iso-ir-149,KS_C_5601-1989,KSC_5601,csKSC56011987 0xFFFF
+#
+#
+ASCII      ASCII,US-ASCII                         0x007F
+#
+# Big5, Traditional Chinese 
+Big5       BIG5,csBig5                            0x007F
+#Big5 with Hong Kong extensions, Traditional Chinese (incorporating 2001 revision) 
+Big5_HKSCS BIG5-HKSCS                             0x007F
+# USA, Canada (Bilingual, French), Netherlands, Portugal, Brazil, Australia
+Cp037      EBCDIC-CP-US,EBCDIC-CP-CA,EBCDIC-CP-WT,EBCDIC-CP-NL,IBM037 0x0019
+# IBM Austria, Germany 
+Cp273      IBM273,csIBM273                        0x0019
+Cp274      csIBM274,EBCDIC-BE
+Cp275      csIBM275,EBCDIC-BR
+# IBM Denmark, Norway 
+Cp277      EBCDIC-CP-DK,EBCDIC-CP-NO,IBM277,csIBM277    0x0019
+# IBM Finland, Sweden 
+Cp278      EBCDIC-CP-FI,EBCDIC-CP-SE,IBM278,csIBM278    0x0019
+# IBM Italy
+Cp280      EBCDIC-CP-IT,IBM280,csIBM280           0x0019
+Cp281      EBCDIC-JP-E,csIBM281
+# IBM Catalan/Spain, Spanish Latin America 
+Cp284      EBCDIC-CP-ES,IBM284,csIBM284           0x0019
+# IBM United Kingdom, Ireland
+Cp285      EBCDIC-CP-GB,IBM285,csIBM285           0x0019
+Cp290      EBCDIC-JP-kana,IBM290,csIBM290         0x0019
+# IBM France
+Cp297      EBCDIC-CP-FR,IBM297,csIBM297           0x0019
+# IBM Arabic
+Cp420      EBCDIC-CP-AR1,IBM420,csIBM420          0x0019
+Cp423      EBCDIC-CP-GR,IBM423,csIBM423
+# IBM Hebrew
+Cp424      EBCDIC-CP-HE,IBM424,csIBM424           0x0019
+Cp437      437,IBM437,csPC8CodePage437            0x007F
+# EBCDIC 500V1
+Cp500      EBCDIC-CP-CH,EBCDIC-CP-BE,IBM500,csIBM500    0x0019
+# PC Baltic
+Cp775      IBM775,csPC775Baltic                   0x007F
+# IBM Thailand extended SBCS 
+Cp838      IBM-Thai,838,csIBMThai                 0x0019
+# MS-DOS Latin-1
+Cp850      850,csPC850Multilingual,IBM850         0x007F
+Cp851      851,IBM851,csIBM851
+# MS-DOS Latin-2
+Cp852      IBM852,852,csPCp852                    0x007F
+# IBM Cyrillic
+Cp855      IBM855,855,csIBM855                    0x007F
+# IBM Turkish
+Cp857      IBM857,857,csIBM857                    0x007F
+# Variant of Cp850 with Euro character 
+Cp858      IBM00858                               0x007F
+# MS-DOS Portuguese
+Cp860      860,csIBM860,IBM860                    0x007F
+# MS-DOS Icelandic
+Cp861      IBM861,861,csIBM861,cp-is              0x007F
+#
+Cp862      IBM862,862,csPCi62LatinHebrew          0x007F
+# MS-DOS Canadian French
+Cp863      IBM863,863,csIBM863                    0x007F
+# PC Arabic 
+Cp864      IBM864,864,csIBM864                    0x007F
+# MS-DOS Nordic 
+Cp865      IBM865,865,csIBM865                    0x007F
+# MS-DOS Russian 
+Cp866      IBM866,866,csIBM866                    0x007F
+# MS-DOS Pakistan 
+Cp868      IBM868,cp-ar,csIBM868                  0x007F
+# IBM Modern Greek 
+Cp869      IBM869,869,cp-gr,csIBM869              0x007F
+# IBM Multilingual Latin-2 
+Cp870      EBCDIC-CP-ROECE,EBCDIC-CP-YU,IBM870,csIBM870 0x0019
+# IBM Iceland 
+Cp871      EBCDIC-CP-IS,IBM871,csIBM871           0x0019
+Cp880      EBCDIC-Cyrillic,IBM880,csIBM880
+Cp891      IBM891,csIBM891
+Cp903      IBM903,csIBM903
+Cp904      IBM904,csIBM904
+Cp905      IBM905,csIBM905,EBCDIC-CP-TR
+# IBM Pakistan (Urdu)
+Cp918      EBCDIC-CP-AR2,IBM918,csIBM918          0x0019
+# GBK, Simplified Chinese 
+Cp936      GBK,MS936,WINDOWS-936
+# IBM Latin-5, Turkey 
+Cp1026     IBM1026,csIBM1026                      0x0019
+# Latin-1 character set for EBCDIC hosts 
+Cp1047     IBM1047,IBM-1047                       0x0019
+# Variant of Cp037 with Euro character 
+Cp1140     IBM01140                               0x0019
+# Variant of Cp273 with Euro character 
+Cp1141     IBM01141                               0x0019
+# Variant of Cp277 with Euro character 
+Cp1142     IBM01142                               0x0019
+# Variant of Cp278 with Euro character 
+Cp1143     IBM01143                               0x0019
+# Variant of Cp280 with Euro character 
+Cp1144     IBM01144                               0x0019
+# Variant of Cp284 with Euro character 
+Cp1145     IBM01145                               0x0019
+# Variant of Cp285 with Euro character 
+Cp1146     IBM01146                               0x0019
+# Variant of Cp297 with Euro character 
+Cp1147     IBM01147                               0x0019
+# Variant of Cp500 with Euro character 
+Cp1148     IBM01148                               0x0019
+# Variant of Cp871 with Euro character 
+Cp1149     IBM01149                               0x0019
+Cp1250     WINDOWS-1250                           0x007F
+Cp1251     WINDOWS-1251                           0x007F
+Cp1252     WINDOWS-1252                           0x007F
+Cp1253     WINDOWS-1253                           0x007F
+Cp1254     WINDOWS-1254                           0x007F
+# Windows Hebrew 
+Cp1255     WINDOWS-1255                           0x007F
+# Windows Arabic
+Cp1256     WINDOWS-1256                           0x007F
+Cp1257     WINDOWS-1257                           0x007F
+# Windows Vietnamese
+Cp1258     WINDOWS-1258                           0x007F
+EUC-CN     EUC-CN                                 0x007F
+EUC_CN     EUC-CN                                 0x007F
+#
+#JISX 0201, 0208 and 0212, EUC encoding Japanese
+EUC-JP     EUC-JP                                 0x007F
+EUC_JP     EUC-JP                                 0x007F
+# KS C 5601, EUC encoding, Korean 
+EUC-KR     EUC-KR                                 0x007F
+EUC_KR     EUC-KR                                 0x007F
+# CNS11643 (Plane 1-7,15), EUC encoding, Traditional Chinese
+EUC-TW     EUC-TW                                 0x007F
+EUC_TW     EUC-TW,x-EUC-TW                        0x007F
+EUCJIS     EUC-JP                                 0x007F
+#
+# GB2312, EUC encoding, Simplified Chinese 
+GB2312     GB2312                                 0x007F
+
+# GB2312 and CNS11643 in ISO 2022 CN form, Simplified and Traditional Chinese (conversion to Unicode only) 
+ISO2022CN  ISO-2022-CN
+# JIS X 0201, 0208, in ISO 2022 form, Japanese 
+ISO2022JP  ISO-2022-JP
+# ISO 2022 KR, Korean 
+ISO2022KR  ISO-2022-KR                            0x007F
+#
+#
+ISO8859-1  ISO-8859-1                             0x00FF
+ISO8859_1  ISO-8859-1                             0x00FF
+8859-1     ISO-8859-1                             0x00FF
+8859_1     ISO-8859-1                             0x00FF
+#
+ISO8859-2  ISO-8859-2                             0x00A0
+ISO8859_2  ISO-8859-2                             0x00A0
+8859-2     ISO-8859-2                             0x00A0
+8859_2     ISO-8859-2                             0x00A0
+#
+# Latin Alphabet No. 3 
+ISO8859-3  ISO-8859-3                             0x00A0
+ISO8859_3  ISO-8859-3                             0x00A0
+8859-3     ISO-8859-3                             0x00A0
+8859_3     ISO-8859-3                             0x00A0
+#
+ISO8859-4  ISO-8859-4                             0x00A0
+ISO8859_4  ISO-8859-4                             0x00A0
+8859-4     ISO-8859-4                             0x00A0
+8859_4     ISO-8859-4                             0x00A0
+#
+ISO8859-5  ISO-8859-5                             0x00A0
+ISO8859_5  ISO-8859-5                             0x00A0
+8859-5     ISO-8859-5                             0x00A0
+8859_5     ISO-8859-5                             0x00A0
+#
+# Latin/Arabic Alphabet 
+ISO8859-6  ISO-8859-6                             0x00A0
+ISO8859_6  ISO-8859-6                             0x00A0
+8859-6     ISO-8859-6                             0x00A0
+8859_6     ISO-8859-6                             0x00A0
+#
+ISO8859-7  ISO-8859-7                             0x00A0
+ISO8859_7  ISO-8859-7                             0x00A0
+8859-7     ISO-8859-7                             0x00A0
+8859_7     ISO-8859-7                             0x00A0
+#
+ISO8859-8  ISO-8859-8                             0x00A0
+ISO8859_8  ISO-8859-8                             0x00A0
+8859-8     ISO-8859-8                             0x00A0
+8859_8     ISO-8859-8                             0x00A0
+#
+ISO8859-9  ISO-8859-9                             0x00CF
+ISO8859_9  ISO-8859-9                             0x00CF
+8859-9     ISO-8859-9                             0x00CF
+8859_9     ISO-8859-9                             0x00CF
+#
+ISO8859-10 ISO-8859-10                            0x007E
+ISO8859_10 ISO-8859-10                            0x007E
+ISO8859-11 ISO-8859-11                            0x007E
+ISO8859_11 ISO-8859-11                            0x007E
+ISO8859-12 ISO-8859-12                            0x007F
+ISO8859_12 ISO-8859-12                            0x007F
+ISO8859-13 ISO-8859-13                            0x00A0
+ISO8859_13 ISO-8859-13                            0x00A0
+ISO8859-14 ISO-8859-14                            0x007E
+ISO8859_14 ISO-8859-14                            0x007E
+ISO8859-15 ISO-8859-15                            0x00A3
+ISO8859_15 ISO-8859-15                            0x00A3
+JIS        ISO-2022-JP                            0x007F
+KOI8_R     KOI8-R                                 0x007F
+KSC5601    EUC-KR                                 0x007F
+KS_C_5601-1987 KS_C_5601-1987,iso-ir-149,KS_C_5601-1989,KSC_5601,csKSC56011987  0x007F
+MacTEC     MacRoman
+# Windows Japanese
+MS932      windows-31j
+# Shift-JIS, Japanese 
+SJIS       SHIFT_JIS                              0x007F
+# TIS620, Thai
+TIS620     TIS-620
+UTF8       UTF-8                                  0xD7FF
+Unicode    UNICODE,UTF-16                         0xFFFF
+
 # note that more character set names and their aliases
 # can be found at http://www.iana.org/assignments/character-sets
 

diff --git a/src/org/apache/xml/serializer/SerializerBase.java b/src/org/apache/xml/serializer/SerializerBase.java
index d64af88..dce518c 100644
--- a/src/org/apache/xml/serializer/SerializerBase.java
+++ b/src/org/apache/xml/serializer/SerializerBase.java

@@ -19,7 +19,6 @@
 package org.apache.xml.serializer;
 
 import java.io.IOException;
-import java.util.Vector;
 
 import javax.xml.transform.SourceLocator;
 import javax.xml.transform.Transformer;
@@ -156,7 +155,7 @@
      * The character encoding.  Must match the encoding used for the
      * printWriter.
      */
-    private String m_encoding = null;
+    String m_encoding = null;
 
     /**
      * Tells if we should write the XML declaration.
@@ -564,9 +563,9 @@
      * Sets the character encoding coming from the xsl:output encoding stylesheet attribute.
      * @param m_encoding the character encoding
      */
-    public void setEncoding(String m_encoding)
+    public void setEncoding(String encoding)
     {
-        this.m_encoding = m_encoding;
+        this.m_encoding = encoding;
     }
 
     /**
@@ -1469,40 +1468,20 @@
 
         if (null != m_StringOfCDATASections)
         {
-            String localName = m_elemContext.m_elementLocalName;
-            if (localName == null) 
+            if (m_elemContext.m_elementLocalName == null) 
             {
-                localName =  getLocalName(m_elemContext.m_elementName); 
+                String localName =  getLocalName(m_elemContext.m_elementName); 
                 m_elemContext.m_elementLocalName = localName;                   
             }
             
-            String uri = m_elemContext.m_elementURI; 
-            if ( uri == null)
-            {
-                String prefix = getPrefixPart(m_elemContext.m_elementName);
-                if (prefix != null) {
-                    uri = m_prefixMap.lookupNamespace(prefix);
-                    if (uri != null) 
-                        m_elemContext.m_elementURI = uri;
-                    else
-                        uri = "";                        
-                }
-                else {
-                    // no prefix so lookup the URI of the default namespace
-                    uri = m_prefixMap.lookupNamespace("");
-                    if (uri == null)  // If no URI then the empty string also means no URI
-                        uri = "";
-                }
-            }
-            else {
-                if (m_elemContext.m_elementURI.length() == 0)
-                m_elemContext.m_elementURI = null;
-            }             
+            if (m_elemContext.m_elementURI == null)
+                m_elemContext.m_elementURI = getElementURI();
+                
 
             java.util.Hashtable h = (java.util.Hashtable) m_CdataElems.get(m_elemContext.m_elementLocalName);
             if (h != null) 
             {
-                Object obj = h.get(uri);
+                Object obj = h.get(m_elemContext.m_elementURI);
                 if (obj != null)
                     b = true; 
             }
@@ -1510,5 +1489,39 @@
         }
         return b;
     }
-}
+    
+    /**
+     * Before this call m_elemContext.m_elementURI is null,
+     * which means it is not yet known. After this call it
+     * is non-null, but possibly "" meaning that it is in the
+     * default namespace.
+     * 
+     * @return The URI of the element, never null, but possibly "".
+     */
+    private String getElementURI() {
+        String uri = null;
+        // At this point in processing we have received all the
+        // namespace mappings
+        // As we still don't know the element's namespace,
+        // we now figure it out.
 
+        String prefix = getPrefixPart(m_elemContext.m_elementName);
+
+        if (prefix == null) {
+            // no prefix so lookup the URI of the default namespace
+            uri = m_prefixMap.lookupNamespace("");
+        } else {
+            uri = m_prefixMap.lookupNamespace(prefix);
+        }
+        if (uri == null) {
+            // We didn't find the namespace for the
+            // prefix ... ouch, that shouldn't happen.
+            // This is a hack, we really don't know
+            // the namespace
+            uri = EMPTYSTRING;
+        }
+
+        return uri;
+    }
+}
+    

diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java
index ccf193c..133eaa6 100644
--- a/src/org/apache/xml/serializer/ToStream.java
+++ b/src/org/apache/xml/serializer/ToStream.java

@@ -22,6 +22,7 @@
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
+import java.util.EmptyStackException;
 import java.util.Enumeration;
 import java.util.Properties;
 import java.util.StringTokenizer;
@@ -40,8 +41,6 @@
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-//import com.sun.media.sound.IESecurity;
-
 /**
  * This abstract class is a base class for other stream 
  * serializers (xml, html, text ...) that write output to a stream.
@@ -69,7 +68,7 @@
      * single chars or surrogate pairs of high/low chars form
      * characters in the output encoding. 
      */
-    EncodingInfo m_encodingInfo = new EncodingInfo(null,null);
+    EncodingInfo m_encodingInfo = new EncodingInfo(null,null, '\u0000');
     
     /**
      * Stack to keep track of whether or not we need to
@@ -422,9 +421,9 @@
         // characters are written to the output writer.
         if (m_tracer != null
          && !(writer instanceof SerializerTraceWriter)  )
-            m_writer = new SerializerTraceWriter(writer, m_tracer);
+            setWriterInternal(new SerializerTraceWriter(writer, m_tracer), false);
         else
-            m_writer = writer;        
+            setWriterInternal(writer, false);        
         
         if (m_format == null)
             m_format = new java.util.Properties();
@@ -499,16 +498,16 @@
         }
 
         // initCharsMap();
-        String encoding = getEncoding();
-        if (null == encoding)
-        {
-            encoding =
-                Encodings.getMimeEncoding(
-                    format.getProperty(OutputKeys.ENCODING));
-            setEncoding(encoding);
+        String previous_encoding = getEncoding();
+        String possible_encoding =  
+            Encodings.getMimeEncoding(format.getProperty(OutputKeys.ENCODING));
+        if (previous_encoding == null || defaultProperties == false) {
+        	// Only set the encoding if there was no previous encoding, or if we are
+        	// setting a value that is not a default value, because we don't
+        	// want to stomp on a previously set non-default one with the default one.
+        	setEncoding(possible_encoding);
         }
 
-        m_isUTF8 = encoding.equals(Encodings.DEFAULT_MIME_ENCODING);
 
         // Access this only from the Hashtable level... we don't want to 
         // get default properties.
@@ -537,7 +536,7 @@
                 w2 = ((WriterChain)w2).getWriter();
             }
             if (noTracerYet)
-                m_writer = new SerializerTraceWriter(m_writer, m_tracer);
+                setWriterInternal(new SerializerTraceWriter(m_writer, m_tracer), false);
         }
     }
 
@@ -571,21 +570,24 @@
         throws UnsupportedEncodingException
     {
 
-        String encoding = getEncoding();
-        if (encoding == null)
-        {
-            // if not already set then get it from the properties
-            encoding =
-                Encodings.getMimeEncoding(
-                    format.getProperty(OutputKeys.ENCODING));
-            setEncoding(encoding);
+        // Get the encoding in the format Properties, or UTF-8 if none in the format
+    	String previous_encoding = getEncoding();
+        String possible_encoding = Encodings.getMimeEncoding(format.getProperty(OutputKeys.ENCODING));
+        if (previous_encoding == null || defaultProperties == false ) {
+        	// Let's not stomp on an encoding that was already set with one that only comes from
+        	// a default set of properties.  So only set the encoding if either there
+        	// was no previously set encoding, or if this is not a default value for the encoding.
+        	setEncoding(possible_encoding);
         }
-
-        if (encoding.equalsIgnoreCase("UTF-8"))
+        
+        // When all is said and done, the encoding may be possible_encoding or,
+        // if there was a problem with that one, the encoding will be unchanged, so
+        // just get what it is.
+        String encoding = getEncoding();
+        	
+        
+        if (Encodings.DEFAULT_MIME_ENCODING.equalsIgnoreCase(encoding))
         {
-            m_isUTF8 = true;
-         
-
                 init(
                     new WriterToUTF8Buffered(output),
                     format,
@@ -595,22 +597,30 @@
 
         }
         else if (
-            encoding.equals("WINDOWS-1250")
-                || encoding.equals("US-ASCII")
-                || encoding.equals("ASCII"))
+        		"WINDOWS-1250".equals(encoding)
+                || "US-ASCII".equals(encoding)
+                || "ASCII".equals(encoding))
         {
             init(new WriterToASCI(output), format, defaultProperties, true);
         }
         else
         {
-            Writer osw;
+            Writer osw = null;
 
-            try
-            {
-                osw = Encodings.getWriter(output, encoding);
+            if (encoding == null)
+            	encoding = possible_encoding;
+            else {
+            	try
+            	{
+            		osw = Encodings.getWriter(output, encoding);
+            	}
+            	catch (UnsupportedEncodingException uee)
+            	{
+            		osw = null;
+            	}
             }
-            catch (UnsupportedEncodingException uee)
-            {
+            
+            if (osw == null) {
                 System.out.println(
                     "Warning: encoding \""
                         + encoding
@@ -651,9 +661,16 @@
         // characters are written to the output writer.
         if (m_tracer != null
          && !(writer instanceof SerializerTraceWriter)  )
-            m_writer = new SerializerTraceWriter(writer, m_tracer);
+            setWriterInternal(new SerializerTraceWriter(writer, m_tracer), true);
         else
-            m_writer = writer;
+            setWriterInternal(writer, true);
+    }
+    
+    private boolean m_writer_set_by_user;
+    private void setWriterInternal(Writer writer, boolean setByUser) {
+        if (setByUser)
+            m_writer_set_by_user = true;
+        m_writer = writer;
     }
     
     /**
@@ -2977,7 +2994,7 @@
         super.setTransformer(transformer);
         if (m_tracer != null
          && !(m_writer instanceof SerializerTraceWriter)  )
-            m_writer = new SerializerTraceWriter(m_writer, m_tracer);        
+            setWriterInternal(new SerializerTraceWriter(m_writer, m_tracer), false);        
         
         
     }
@@ -3031,7 +3048,8 @@
          this.m_lineSepUse = true;
          // DON'T SET THE WRITER TO NULL, IT MAY BE REUSED !!
          // this.m_writer = null;  
-         this.m_expandDTDEntities = true;      
+         this.m_expandDTDEntities = true;     
+         this.m_writer_set_by_user = false;
  
     }        
     
@@ -3041,34 +3059,58 @@
       */
      public void setEncoding(String encoding)
      {
-         String old = getEncoding();
-         super.setEncoding(encoding); 
+         final String old = getEncoding();
          if (old == null || !old.equals(encoding)) {        
-            // If we have changed the setting of the 
-            m_encodingInfo = Encodings.getEncodingInfo(encoding);
+            // We are trying to change the setting of the encoding to a different value
+            // from what it was
             
-            if (encoding != null && m_encodingInfo.name == null) {
+            EncodingInfo encodingInfo = Encodings.getEncodingInfo(encoding);
+            if (encoding != null && encodingInfo.name == null) {
             	// We tried to get an EncodingInfo for Object for the given
             	// encoding, but it came back with an internall null name
             	// so the encoding is not supported by the JDK, issue a message.
-            	String msg = Utils.messages.createMessage(
+            	final String msg = Utils.messages.createMessage(
             			MsgKey.ER_ENCODING_NOT_SUPPORTED,new Object[]{ encoding });
+            	
+            	final String msg2 = 
+            		"Warning: encoding \"" + encoding + "\" not supported, using "
+                        + Encodings.DEFAULT_MIME_ENCODING;
             	try 
             	{
             		// Prepare to issue the warning message
-            		Transformer tran = super.getTransformer();
+            		final Transformer tran = super.getTransformer();
             		if (tran != null) {
-            			ErrorListener errHandler = tran.getErrorListener();
+            			final ErrorListener errHandler = tran.getErrorListener();
             			// Issue the warning message
-            			if (null != errHandler && m_sourceLocator != null)
+            			if (null != errHandler && m_sourceLocator != null) {
             				errHandler.warning(new TransformerException(msg, m_sourceLocator));
-            			else
+            				errHandler.warning(new TransformerException(msg2, m_sourceLocator));
+            			}
+            			else {
             				System.out.println(msg);
+            				System.out.println(msg2);
+            			}
             	    }
-            		else
+            		else {
             			System.out.println(msg);
+            			System.out.println(msg2);
+            		}
             	}
             	catch (Exception e){}
+            	
+            	// We said we are using UTF-8, so use it
+            	encoding = Encodings.DEFAULT_MIME_ENCODING;
+            	encodingInfo = Encodings.getEncodingInfo(encoding);
+            	//if (m_format != null) 
+            	//	m_format.setProperty(OutputKeys.ENCODING,Encodings.DEFAULT_MIME_ENCODING);
+            } else {
+
+            // The encoding is null or was recognized, so we remember it for later.
+            // NOTE(review): the UTF-8 fallback assigned above never reaches this else-branch - confirm intent.
+            m_encodingInfo = encodingInfo;
+            this.m_encoding = encoding;                
+            if (encoding != null)
+            	m_isUTF8 = encoding.equals(Encodings.DEFAULT_MIME_ENCODING);
             }
          }
          return;
commit	382dacd8684b81dfd45915be7544254651b17788	[log] [tgz]
author	Brian James Minchau <minchau@apache.org>	Sat Jan 28 17:10:42 2006 +0000
committer	Brian James Minchau <minchau@apache.org>	Sat Jan 28 17:10:42 2006 +0000
tree	0d283d2b876007372842f49af5de66a67c26bee1
parent	9fdfb02ecddb27649edc3ceba1f7da2d6ebbf50b [diff]