Committing patch in XALANJ-2271 which fixes a bug in outputting XML 1.1 attributes. It is also a general clean-up of code related to whether particular characters have entities, or should be written as character entities, etc. The code is tricky because it all depends on: > method type (xml, html, text) > character in a text node? > character in an XML attribute value? > character in an HTML URL attribute value? The old code had a concept that the character was "special" but put plenty of band-aids on that CharInfo API call. The new code has far fewer band-aids. CharInfo basically knows if the character is mapped to a String (e.g. '<' mapping to "&lt;") and leaves it more explicitly to the serializer's (e.g. ToXMLStream) output method (e.g. characters() or writeAttr() ...) to decide whether it is a character in a text node, or an attribute value, or

commit: 666e9b9df6f9f4bef281d4547d088bfa67009bad [log] [tgz]
author: Brian James Minchau <minchau@apache.org> Tue Mar 07 16:44:53 2006 +0000
committer: Brian James Minchau <minchau@apache.org> Tue Mar 07 16:44:53 2006 +0000
tree: f4c27ffa7967d6243665d643316d962001dd47d3
parent: 4a54e5ffff9c553a9a9afda566b1d18a601f5390 [diff]
diff --git a/src/org/apache/xml/serializer/CharInfo.java b/src/org/apache/xml/serializer/CharInfo.java
index 9389ac9..65bb5f4 100644
--- a/src/org/apache/xml/serializer/CharInfo.java
+++ b/src/org/apache/xml/serializer/CharInfo.java

@@ -1,5 +1,5 @@
 /*
- * Copyright 1999-2004 The Apache Software Foundation.
+ * Copyright 1999-2006 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.util.Enumeration;
+import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.PropertyResourceBundle;
 import java.util.ResourceBundle;
@@ -50,7 +51,7 @@
 final class CharInfo
 {
     /** Given a character, lookup a String to output (e.g. a decorated entity reference). */
-    private Hashtable m_charToString = new Hashtable();
+    private HashMap m_charToString;
 
     /**
      * The name of the HTML entities file.
@@ -67,42 +68,50 @@
                 SerializerBase.PKG_NAME+".XMLEntities";
 
     /** The horizontal tab character, which the parser should always normalize. */
-    public static final char S_HORIZONAL_TAB = 0x09;
+    static final char S_HORIZONAL_TAB = 0x09;
 
     /** The linefeed character, which the parser should always normalize. */
-    public static final char S_LINEFEED = 0x0A;
+    static final char S_LINEFEED = 0x0A;
 
     /** The carriage return character, which the parser should always normalize. */
-    public static final char S_CARRIAGERETURN = 0x0D;
+    static final char S_CARRIAGERETURN = 0x0D;
+    static final char S_SPACE = 0x20;
+    static final char S_QUOTE = 0x22;
+    static final char S_LT = 0x3C;
+    static final char S_GT = 0x3E;
+    static final char S_NEL = 0x85;    
+    static final char S_LINE_SEPARATOR = 0x2028;
     
     /** This flag is an optimization for HTML entities. It false if entities 
      * other than quot (34), amp (38), lt (60) and gt (62) are defined
      * in the range 0 to 127.
      * @xsl.usage internal
      */    
-    final boolean onlyQuotAmpLtGt;
+    boolean onlyQuotAmpLtGt;
     
     /** Copy the first 0,1 ... ASCII_MAX values into an array */
-    private static final int ASCII_MAX = 128;
+    static final int ASCII_MAX = 128;
     
     /** Array of values is faster access than a set of bits 
-     * to quickly check ASCII characters in attribute values. 
+     * to quickly check ASCII characters in attribute values,
+     * the value is true if the character in an attribute value
+     * should be mapped to a String. 
      */
-    private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
+    private final boolean[] shouldMapAttrChar_ASCII;
     
     /** Array of values is faster access than a set of bits 
-     * to quickly check ASCII characters in text nodes. 
+     * to quickly check ASCII characters in text nodes, 
+     * the value is true if the character in a text node
+     * should be mapped to a String. 
      */
-    private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
-
-    private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
+    private final boolean[] shouldMapTextChar_ASCII;
 
     /** An array of bits to record if the character is in the set.
      * Although information in this array is complete, the
      * isSpecialAttrASCII array is used first because access to its values
      * is common and faster.
      */   
-    private int array_of_bits[] = createEmptySetOfIntegers(65535);
+    private final int array_of_bits[];
      
     
     // 5 for 32 bit words,  6 for 64 bit words ...
@@ -133,33 +142,38 @@
 
 
     /**
-     * Constructor that reads in a resource file that describes the mapping of
-     * characters to entity references.
-     * This constructor is private, just to force the use
-     * of the getCharInfo(entitiesResource) factory
+     * A base constructor just to explicitly create the fields,
+     * with the exception of m_charToString which is handled
+     * by the constructor that delegates base construction to this one.
+     * <p>
+     * m_charToString is not created here only for performance reasons,
+     * to avoid creating a Hashtable that will be replaced when
+     * making a mutable copy, {@link #mutableCopyOf(CharInfo)}. 
      *
-     * Resource files must be encoded in UTF-8 and can either be properties
-     * files with a .properties extension assumed.  Alternatively, they can
-     * have the following form, with no particular extension assumed:
-     *
-     * <pre>
-     * # First char # is a comment
-     * Entity numericValue
-     * quot 34
-     * amp 38
-     * </pre>
-     *    
-     * @param entitiesResource Name of properties or resource file that should
-     * be loaded, which describes that mapping of characters to entity
-     * references.
      */
-    private CharInfo(String entitiesResource, String method)
+    private CharInfo() 
     {
-        this(entitiesResource, method, false);
-    }
+    	this.array_of_bits = createEmptySetOfIntegers(65535);
+    	this.firstWordNotUsed = 0;
+    	this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX];
+    	this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX];
+    	this.m_charKey = new CharKey();
+    	
+    	// Not set here, but in a constructor that uses this one
+    	// this.m_charToString =  new Hashtable();  
+    	
+    	this.onlyQuotAmpLtGt = true;
+    	
 
+    	return;
+    }
+    
     private CharInfo(String entitiesResource, String method, boolean internal)
     {
+    	// call the default constructor to create the fields
+    	this();
+    	m_charToString = new HashMap();
+
         ResourceBundle entities = null;
         boolean noExtraEntities = true;
 
@@ -185,12 +199,10 @@
                 String name = (String) keys.nextElement();
                 String value = entities.getString(name);
                 int code = Integer.parseInt(value);
-                defineEntity(name, (char) code);
-                if (extraEntity(code))
+                boolean extra = defineEntity(name, (char) code);
+                if (extra)
                     noExtraEntities = false;
             }
-            set(S_LINEFEED);
-            set(S_CARRIAGERETURN);
         } else {
             InputStream is = null;
 
@@ -274,8 +286,8 @@
 
                             int code = Integer.parseInt(value);
 
-                            defineEntity(name, (char) code);
-                            if (extraEntity(code))
+                            boolean extra = defineEntity(name, (char) code);
+                            if (extra)
                                 noExtraEntities = false;
                         }
                     }
@@ -284,8 +296,6 @@
                 }
 
                 is.close();
-                set(S_LINEFEED);
-                set(S_CARRIAGERETURN);
             } catch (Exception e) {
                 throw new RuntimeException(
                     Utils.messages.createMessage(
@@ -302,31 +312,8 @@
                 }
             }
         }
-          
-        /* initialize the array isCleanTextASCII[] with a cache of values
-         * for use by ToStream.character(char[], int , int)
-         * and the array isSpecialTextASCII[] with the opposite values
-         * (all in the name of performance!)
-         */
-        for (int ch = 0; ch <ASCII_MAX; ch++)
-        if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
-             && (!get(ch))) || ('"' == ch))
-        {
-            isCleanTextASCII[ch] = true;
-            isSpecialTextASCII[ch] = false;
-        }
-        else {
-            isCleanTextASCII[ch] = false;
-            isSpecialTextASCII[ch] = true;     
-        }       
-        
-
 
         onlyQuotAmpLtGt = noExtraEntities;
-
-        // initialize the array with a cache of the BitSet values
-        for (int i=0; i<ASCII_MAX; i++)
-            isSpecialAttrASCII[i] = get(i);   
             
         /* Now that we've used get(ch) just above to initialize the
          * two arrays we will change by adding a tab to the set of 
@@ -338,8 +325,19 @@
          * This is the reason for this delay.
          */
         if (Method.XML.equals(method)) 
-        {
-            isSpecialAttrASCII[S_HORIZONAL_TAB] = true;
+        {       
+            // We choose not to escape the quotation mark as &quot; in text nodes
+            shouldMapTextChar_ASCII[S_QUOTE] = false;
+        }
+        
+        if (Method.HTML.equals(method)) {
+        	// The XSLT 1.0 recommendation says 
+        	// "The html output method should not escape < characters occurring in attribute values."
+        	// So we don't escape '<' in an attribute for HTML
+        	shouldMapAttrChar_ASCII['<'] = false;    
+        	
+        	// We choose not to escape the quotation mark as &quot; in text nodes.
+            shouldMapTextChar_ASCII[S_QUOTE] = false;
         }
     }
 
@@ -348,23 +346,37 @@
      * supplied. Nothing happens if the character reference is already defined.
      * <p>Unlike internal entities, character references are a string to single
      * character mapping. They are used to map non-ASCII characters both on
-     * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
+     * parsing and printing, primarily for HTML documents. '&amp;lt;' is an
      * example of a character reference.</p>
      *
      * @param name The entity's name
      * @param value The entity's value
+     * @return true if the mapping is not one of:
+     * <ul>
+     * <li> '<' to "&lt;"
+     * <li> '>' to "&gt;"
+     * <li> '&' to "&amp;"
+     * <li> '"' to "&quot;"
+     * </ul>
      */
-    private void defineEntity(String name, char value)
+    private boolean defineEntity(String name, char value)
     {
         StringBuffer sb = new StringBuffer("&");
         sb.append(name);
         sb.append(';');
         String entityString = sb.toString();
         
-        defineChar2StringMapping(entityString, value);
+        boolean extra = defineChar2StringMapping(entityString, value);
+        return extra;
     }
 
-    private CharKey m_charKey = new CharKey();
+    /**
+     * A utility object, just used to map characters to output Strings,
+     * needed because a HashMap needs to map an object as a key, not a 
+     * Java primitive type, like a char, so this object gets around that
+     * and it is reusable.
+     */
+    private final CharKey m_charKey;
 
     /**
      * Map a character to a String. For example given
@@ -388,7 +400,7 @@
      * @return The String that the character is mapped to, or null if not found.
      * @xsl.usage internal
      */
-    synchronized String getOutputStringForChar(char value)
+    String getOutputStringForChar(char value)
     {
         // CharKey m_charKey = new CharKey(); //Alternative to synchronized
         m_charKey.setChar(value);
@@ -397,21 +409,20 @@
     
     /**
      * Tell if the character argument that is from
-     * an attribute value should have special treatment.
+     * an attribute value has a mapping to a String.
      * 
      * @param value the value of a character that is in an attribute value
      * @return true if the character should have any special treatment, 
-     * such as when writing out attribute values, 
-     * or entity references.
+     * such as when writing out entity references.
      * @xsl.usage internal
      */
-    final boolean isSpecialAttrChar(int value)
+    final boolean shouldMapAttrChar(int value)
     {
         // for performance try the values in the boolean array first,
         // this is faster access than the BitSet for common ASCII values
 
         if (value < ASCII_MAX)
-            return isSpecialAttrASCII[value];
+            return shouldMapAttrChar_ASCII[value];
 
         // rather than java.util.BitSet, our private
         // implementation is faster (and less general).
@@ -420,46 +431,27 @@
 
     /**
      * Tell if the character argument that is from a 
-     * text node should have special treatment.
+     * text node has a mapping to a String, for example
+     * to map '<' to "&lt;".
      * 
      * @param value the value of a character that is in a text node
-     * @return true if the character should have any special treatment, 
-     * such as when writing out attribute values, 
-     * or entity references.
+     * @return true if the character has a mapping to a String, 
+     * such as when writing out entity references.
      * @xsl.usage internal
      */
-    final boolean isSpecialTextChar(int value)
+    final boolean shouldMapTextChar(int value)
     {
         // for performance try the values in the boolean array first,
         // this is faster access than the BitSet for common ASCII values
 
         if (value < ASCII_MAX)
-            return isSpecialTextASCII[value];
+            return shouldMapTextChar_ASCII[value];
 
         // rather than java.util.BitSet, our private
         // implementation is faster (and less general).
         return get(value);
     }
     
-    /**
-     * This method is used to determine if an ASCII character in
-     * a text node (not an attribute value) is "clean".
-     * @param value the character to check (0 to 127).
-     * @return true if the character can go to the writer as-is
-     * @xsl.usage internal
-     */
-    final boolean isTextASCIIClean(int value)
-    {
-        return isCleanTextASCII[value];
-    }
-    
-//  In the future one might want to use the array directly and avoid
-//  the method call, but I think the JIT alreay inlines this well enough
-//  so don't do it (for now) - bjm    
-//    public final boolean[] getASCIIClean()
-//    {
-//        return isCleanTextASCII;
-//    }
 
      
     private static CharInfo getCharInfoBasedOnPrivilege(
@@ -496,15 +488,17 @@
     {
         CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
         if (charInfo != null) {
-            return charInfo;
+        	return mutableCopyOf(charInfo);
         }
 
         // try to load it internally - cache
         try {
             charInfo = getCharInfoBasedOnPrivilege(entitiesFileName, 
                                         method, true);
+            // Put the common copy of charInfo in the cache, but return
+            // a copy of it.
             m_getCharInfoCache.put(entitiesFileName, charInfo);
-            return charInfo;
+            return mutableCopyOf(charInfo);
         } catch (Exception e) {}
 
         // try to load it externally - do not cache
@@ -531,7 +525,41 @@
                                 method, false);
     }
 
-    /** Table of user-specified char infos. */
+    /**
+     * Create a mutable copy of the cached one.
+     * @param charInfo The cached one.
+     * @return
+     */
+    private static CharInfo mutableCopyOf(CharInfo charInfo) {
+    	CharInfo copy = new CharInfo();
+    	
+    	int max = charInfo.array_of_bits.length;
+    	System.arraycopy(charInfo.array_of_bits,0,copy.array_of_bits,0,max);
+    	
+    	copy.firstWordNotUsed = charInfo.firstWordNotUsed;
+    	
+    	max = charInfo.shouldMapAttrChar_ASCII.length;
+    	System.arraycopy(charInfo.shouldMapAttrChar_ASCII,0,copy.shouldMapAttrChar_ASCII,0,max);
+    	
+    	max = charInfo.shouldMapTextChar_ASCII.length;
+    	System.arraycopy(charInfo.shouldMapTextChar_ASCII,0,copy.shouldMapTextChar_ASCII,0,max);
+    	
+    	// utility field copy.m_charKey is already created in the default constructor 
+    	
+    	copy.m_charToString = (HashMap) charInfo.m_charToString.clone();
+    	
+    	copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt;
+    	    	
+		return copy;
+	}
+
+	/** 
+	 * Table of user-specified char infos.
+	 * The table maps entify file names (the name of the
+	 * property file without the .properties extension)
+	 * to CharInfo objects populated with entities defined in 
+	 * corresponding property file.  
+	 */
     private static Hashtable m_getCharInfoCache = new Hashtable();
 
     /**
@@ -573,7 +601,8 @@
      * the creation of the set.
      */
     private final void set(int i) {   
-        setASCIIdirty(i);
+        setASCIItextDirty(i);
+        setASCIIattrDirty(i); 
              
         int j = (i >> SHIFT_PER_WORD); // this word is used
         int k = j + 1;       
@@ -608,24 +637,43 @@
         return in_the_set;
     }
     
-    // record if there are any entities other than
-    // quot, amp, lt, gt  (probably user defined)
     /**
-     * @return true if the entity 
-     * @param code The value of the character that has an entity defined
-     * for it.
+     * This method returns true if there are some non-standard mappings to
+     * entities other than quot, amp, lt, gt, and its only purpose is for
+     * performance.
+     * @param charToMap The value of the character that is mapped to a String
+     * @param outputString The String to which the character is mapped, usually
+     * an entity reference such as "&lt;".
+     * @return true if the mapping is not one of:
+     * <ul>
+     * <li> '<' to "&lt;"
+     * <li> '>' to "&gt;"
+     * <li> '&' to "&amp;"
+     * <li> '"' to "&quot;"
+     * </ul>
      */
-    private boolean extraEntity(int entityValue)
+    private boolean extraEntity(String outputString, int charToMap)
     {
         boolean extra = false;
-        if (entityValue < 128)
+        if (charToMap < ASCII_MAX)
         {
-            switch (entityValue)
+            switch (charToMap)
             {
-                case 34 : // quot
-                case 38 : // amp
-                case 60 : // lt
-                case 62 : // gt
+                case '"' : // quot
+                	if (!outputString.equals("&quot;"))
+                		extra = true;  
+                	break;
+                case '&' : // amp
+                	if (!outputString.equals("&amp;"))
+                		extra = true;
+                	break;
+                case '<' : // lt
+                	if (!outputString.equals("&lt;"))
+                		extra = true;
+                	break;
+                case '>' : // gt
+                	if (!outputString.equals("&gt;"))
+                		extra = true;
                     break;
                 default : // other entity in range 0 to 127  
                     extra = true;
@@ -635,48 +683,61 @@
     }    
     
     /**
-     * If the character is a printable ASCII character then
-     * mark it as not clean and needing replacement with
-     * a String on output.
+     * If the character is in the ASCII range then
+     * mark it as needing replacement with
+     * a String on output if it occurs in a text node.
      * @param ch
      */
-    private void setASCIIdirty(int j) 
+    private void setASCIItextDirty(int j) 
     {
         if (0 <= j && j < ASCII_MAX) 
         {
-            isCleanTextASCII[j] = false;
-            isSpecialTextASCII[j] = true;
+            shouldMapTextChar_ASCII[j] = true;
+        } 
+    }
+    
+    /**
+     * If the character is in the ASCII range then
+     * mark it as needing replacement with
+     * a String on output if it occurs in a attribute value.
+     * @param ch
+     */
+    private void setASCIIattrDirty(int j) 
+    {
+        if (0 <= j && j < ASCII_MAX) 
+        {
+            shouldMapAttrChar_ASCII[j] = true;
         } 
     }
 
-    /**
-     * If the character is a printable ASCII character then
-     * mark it as and not needing replacement with
-     * a String on output.
-     * @param ch
-     */    
-    private void setASCIIclean(int j)
-    {
-        if (0 <= j && j < ASCII_MAX) 
-        {        
-            isCleanTextASCII[j] = true;
-            isSpecialTextASCII[j] = false;
-        }
-    }
     
-    void defineChar2StringMapping(String outputString, char inputChar) 
+    /**
+     * Call this method to register a char to String mapping, for example
+     * to map '<' to "&lt;".
+     * @param outputString The String to map to.
+     * @param inputChar The char to map from.
+     * @return true if the mapping is not one of:
+     * <ul>
+     * <li> '<' to "&lt;"
+     * <li> '>' to "&gt;"
+     * <li> '&' to "&amp;"
+     * <li> '"' to "&quot;"
+     * </ul>
+     */
+    boolean defineChar2StringMapping(String outputString, char inputChar) 
     {
         CharKey character = new CharKey(inputChar);
         m_charToString.put(character, outputString);
-        set(inputChar);        
+        set(inputChar);  // mark the character has having a mapping to a String
+        
+        boolean extraMapping = extraEntity(outputString, inputChar);
+        return extraMapping;
+        	
     }
 
     /**
      * Simple class for fast lookup of char values, when used with
      * hashtables.  You can set the char, then use it as a key.
-     * 
-     * This class is a copy of the one in org.apache.xml.utils. 
-     * It exists to cut the serializers dependancy on that package.
      *  
      * @xsl.usage internal
      */

diff --git a/src/org/apache/xml/serializer/ToHTMLStream.java b/src/org/apache/xml/serializer/ToHTMLStream.java
index 5773a45..fc648a5 100644
--- a/src/org/apache/xml/serializer/ToHTMLStream.java
+++ b/src/org/apache/xml/serializer/ToHTMLStream.java

@@ -54,7 +54,7 @@
      * Map that tells which XML characters should have special treatment, and it
      *  provides character to entity name lookup.
      */
-    private static final CharInfo m_htmlcharInfo =
+    private final CharInfo m_htmlcharInfo =
 //        new CharInfo(CharInfo.HTML_ENTITIES_RESOURCE);
         CharInfo.getCharInfo(CharInfo.HTML_ENTITIES_RESOURCE, Method.HTML);
 
@@ -1377,7 +1377,7 @@
             // System.out.println("ch: "+(int)ch);
             // System.out.println("m_maxCharacter: "+(int)m_maxCharacter);
             // System.out.println("m_attrCharsMap[ch]: "+(int)m_attrCharsMap[ch]);
-            if (escapingNotNeeded(ch) && (!m_charInfo.isSpecialAttrChar(ch)))
+            if (escapingNotNeeded(ch) && (!m_charInfo.shouldMapAttrChar(ch)))
             {
                 cleanLength++;
             }

diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java
index 133eaa6..f0026bb 100644
--- a/src/org/apache/xml/serializer/ToStream.java
+++ b/src/org/apache/xml/serializer/ToStream.java

@@ -1,5 +1,5 @@
 /*
- * Copyright 2001-2005 The Apache Software Foundation.
+ * Copyright 2001-2006 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -905,7 +905,8 @@
         {
             // This is the old/fast code here, but is this 
             // correct for all encodings?
-            if (ch >= 0x20 || (0x0A == ch || 0x0D == ch || 0x09 == ch))
+            if (ch >= CharInfo.S_SPACE || (CharInfo.S_LINEFEED == ch || 
+                    CharInfo.S_CARRIAGERETURN == ch || CharInfo.S_HORIZONAL_TAB == ch))
                 ret= true;
             else
                 ret = false;
@@ -1014,7 +1015,7 @@
      *
      * @throws java.io.IOException
      */
-    protected int accumDefaultEntity(
+    int accumDefaultEntity(
         java.io.Writer writer,
         char ch,
         int i,
@@ -1033,7 +1034,7 @@
         {
             // if this is text node character and a special one of those,
             // or if this is a character from attribute value and a special one of those
-            if ((fromTextNode && m_charInfo.isSpecialTextChar(ch)) || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))
+            if ((fromTextNode && m_charInfo.shouldMapTextChar(ch)) || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch)))
             {
                 String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
 
@@ -1387,8 +1388,7 @@
 
         if (m_cdataTagOpen)
             closeCDATA();
-        // the check with _escaping is a bit of a hack for XLSTC
-
+        
         if (m_disableOutputEscapingStates.peekOrFalse() || (!m_escaping))
         {
             charactersRaw(chars, start, length);
@@ -1410,82 +1410,175 @@
         try
         {
             int i;
-            char ch1;
             int startClean;
             
             // skip any leading whitspace 
             // don't go off the end and use a hand inlined version
             // of isWhitespace(ch)
             final int end = start + length;
-            int lastDirty = start - 1; // last character that needed processing
-            for (i = start;
-                ((i < end)                
-                    && ((ch1 = chars[i]) == 0x20
-                        || (ch1 == 0xA && m_lineSepUse)
-                        || ch1 == 0xD
-                        || ch1 == 0x09));
-                i++)
-            {
-                /*
-                 * We are processing leading whitespace, but are doing the same
-                 * processing for dirty characters here as for non-whitespace.
-                 * 
-                 */
-                if (!m_charInfo.isTextASCIIClean(ch1))
-                {
-                    lastDirty = processDirty(chars,end, i,ch1, lastDirty, true);
-                    i = lastDirty;
+            int lastDirtyCharProcessed = start - 1; // last non-clean character that was processed
+													// that was processed
+            final Writer writer = m_writer;
+            boolean isAllWhitespace = true;
+
+            // process any leading whitspace
+            i = start;
+            while (i < end && isAllWhitespace) {
+                char ch1 = chars[i];
+
+                if (m_charInfo.shouldMapTextChar(ch1)) {
+                    // The character is supposed to be replaced by a String
+                    // so write out the clean whitespace characters accumulated
+                    // so far
+                    // then the String.
+                    writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                    String outputStringForChar = m_charInfo
+                            .getOutputStringForChar(ch1);
+                    writer.write(outputStringForChar);
+                    // We can't say that everything we are writing out is
+                    // all whitespace, we just wrote out a String.
+                    isAllWhitespace = false;
+                    lastDirtyCharProcessed = i; // mark the last non-clean
+                    // character processed
+                    i++;
+                } else {
+                    // The character is clean, but is it a whitespace ?
+                    switch (ch1) {
+                    // TODO: Any other whitespace to consider?
+                    case CharInfo.S_SPACE:
+                        // Just accumulate the clean whitespace
+                        i++;
+                        break;
+                    case CharInfo.S_LINEFEED:
+                        lastDirtyCharProcessed = processLineFeed(chars, i,
+                                lastDirtyCharProcessed, writer);
+                        i++;
+                        break;
+                    case CharInfo.S_CARRIAGERETURN:
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#13;");
+                        lastDirtyCharProcessed = i;
+                        i++;
+                        break;
+                    case CharInfo.S_HORIZONAL_TAB:
+                        // Just accumulate the clean whitespace
+                        i++;
+                        break;
+                    default:
+                        // The character was clean, but not a whitespace
+                        // so break the loop to continue with this character
+                        // (we don't increment index i !!)
+                        isAllWhitespace = false;
+                        break;
+                    }
                 }
             }
+
             /* If there is some non-whitespace, mark that we may need
              * to preserve this. This is only important if we have indentation on.
              */            
-            if (i < end) 
+            if (i < end || !isAllWhitespace) 
                 m_ispreserve = true;
-                
-
-//            int lengthClean;    // number of clean characters in a row
-//            final boolean[] isAsciiClean = m_charInfo.getASCIIClean();
             
-            final boolean isXML10 = XMLVERSION10.equals(getVersion());
-            // we've skipped the leading whitespace, now deal with the rest
+            
             for (; i < end; i++)
-            {                      
-                {
-                    // A tight loop to skip over common clean chars
-                    // This tight loop makes it easier for the JIT
-                    // to optimize.
-                    char ch2;
-                    while (i<end 
-                            && ((ch2 = chars[i])<127)
-                            && m_charInfo.isTextASCIIClean(ch2))
-                            i++;
-                    if (i == end)
-                        break;
-                }  
-                   
-                final char ch = chars[i];
-                /*  The check for isCharacterInC0orC1Ranger and 
-                 *  isNELorLSEPCharacter has been added
-                 *  to support Control Characters in XML 1.1
-                 */     
-                if (!isCharacterInC0orC1Range(ch) && 
-                    (isXML10 || !isNELorLSEPCharacter(ch)) &&
-                    (escapingNotNeeded(ch) && (!m_charInfo.isSpecialTextChar(ch)))
-                        || ('"' == ch))
-                {
-                    ; // a character needing no special processing
+            {
+                char ch = chars[i];
+                
+                if (m_charInfo.shouldMapTextChar(ch)) {
+                    // The character is supposed to be replaced by a String
+                    // e.g.   '&'  -->  "&amp;"
+                    // e.g.   '<'  -->  "&lt;"
+                    writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                    String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
+                    writer.write(outputStringForChar);
+                    lastDirtyCharProcessed = i;
                 }
-                else
-                {
-                    lastDirty = processDirty(chars,end, i, ch, lastDirty, true);
-                    i = lastDirty;
+                else {
+                    if (ch <= 0x1F) {
+                        // Range 0x00 through 0x1F inclusive
+                        //
+                        // This covers the non-whitespace control characters
+                        // in the range 0x1 to 0x1F inclusive.
+                        // It also covers the whitespace control characters in the same way:
+                        // 0x9   TAB
+                        // 0xA   NEW LINE
+                        // 0xD   CARRIAGE RETURN
+                        //
+                        // We also cover 0x0 ... It isn't valid
+                        // but we will output "&#0;" 
+                        
+                        // The default will handle this just fine, but this
+                        // is a little performance boost to handle the more
+                        // common TAB, NEW-LINE, CARRIAGE-RETURN
+                        switch (ch) {
+
+                        case CharInfo.S_HORIZONAL_TAB:
+                            // Leave whitespace TAB as a real character
+                            break;
+                        case CharInfo.S_LINEFEED:
+                            lastDirtyCharProcessed = processLineFeed(chars, i, lastDirtyCharProcessed, writer);
+                            break;
+                        case CharInfo.S_CARRIAGERETURN:
+                        	writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        	writer.write("&#13;");
+                        	lastDirtyCharProcessed = i;
+                            // Carriage return is written as the character reference "&#13;", not as a real character
+                            break;
+                        default:
+                            writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                            writer.write("&#");
+                            writer.write(Integer.toString(ch));
+                            writer.write(';');
+                            lastDirtyCharProcessed = i;
+                            break;
+
+                        }
+                    }
+                    else if (ch < 0x7F) {  
+                        // Range 0x20 through 0x7E inclusive
+                        // Normal ASCII chars, do nothing, just add it to
+                        // the clean characters
+                            
+                    }
+                    else if (ch <= 0x9F){
+                        // Range 0x7F through 0x9F inclusive
+                        // More control characters, including NEL (0x85)
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        lastDirtyCharProcessed = i;
+                    }
+                    else if (ch == CharInfo.S_LINE_SEPARATOR) {
+                        // LINE SEPARATOR
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#8232;");
+                        lastDirtyCharProcessed = i;
+                    }
+                    else if (m_encodingInfo.isInEncoding(ch)) {
+                        // If the character is in the encoding, and
+                        // not in the normal ASCII range, we also
+                        // just leave it get added on to the clean characters
+                        
+                    }
+                    else {
+                        // This is a fallback plan, we should never get here
+                        // but if the character wasn't previously handled
+                        // (i.e. isn't in the encoding, etc.) then what
+                        // should we do?  We choose to write out an entity
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        lastDirtyCharProcessed = i;
+                    }
                 }
             }
             
             // we've reached the end. Any clean characters at the
             // end of the array that need to be written out?
-            startClean = lastDirty + 1;
+            startClean = lastDirtyCharProcessed + 1;
             if (i > startClean)
             {
                 int lengthClean = i - startClean;
@@ -1503,6 +1596,31 @@
         // time to fire off characters generation event
         if (m_tracer != null)
             super.fireCharEvent(chars, start, length);
+    }
+
+	private int processLineFeed(final char[] chars, int i, int lastProcessed, final Writer writer) throws IOException { // called for a line feed at chars[i]; returns the index of the last character already written out
+		if (!m_lineSepUse 
+		|| (m_lineSepLen ==1 && m_lineSep[0] == CharInfo.S_LINEFEED)){
+		    // Line-separator substitution is off, or the separator is itself a
+		    // single LINEFEED: nothing is written here, the character stays in
+			// the pending run of 'clean' characters and lastProcessed is unchanged
+		}
+		else {
+		    writeOutCleanChars(chars, i, lastProcessed); // first write the pending clean characters that precede index i
+		    writer.write(m_lineSep, 0, m_lineSepLen); // write m_lineSep in place of the line feed character
+		    lastProcessed = i; // the character at index i is now accounted for
+		}
+		return lastProcessed;
+	}
+
+    private void writeOutCleanChars(final char[] chars, int i, int lastProcessed) throws IOException { // writes the pending run chars[lastProcessed+1 .. i-1], if it is non-empty
+        int startClean;
+        startClean = lastProcessed + 1; // first character that has not been written yet
+        if (startClean < i) // nothing to write when the run is empty
+        {
+            int lengthClean = i - startClean;
+            m_writer.write(chars, startClean, lengthClean); // NOTE(review): writes to the m_writer field, not a caller-supplied writer
+        }
     }     
     /**
      * This method checks if a given character is between C0 or C1 range
@@ -1623,7 +1741,7 @@
      *
      * @throws org.xml.sax.SAXException
      */
-    protected int accumDefaultEscape(
+    private int accumDefaultEscape(
         Writer writer,
         char ch,
         int i,
@@ -1687,16 +1805,15 @@
                  *  to write it out as Numeric Character Reference(NCR) regardless of XML Version
                  *  being used for output document.
                  */ 
-                if (isCharacterInC0orC1Range(ch) || 
-                        (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
+                if (isCharacterInC0orC1Range(ch) || isNELorLSEPCharacter(ch))
                 {
                     writer.write("&#");
                     writer.write(Integer.toString(ch));
                     writer.write(';');
                 }
                 else if ((!escapingNotNeeded(ch) || 
-                    (  (fromTextNode && m_charInfo.isSpecialTextChar(ch))
-                     || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))) 
+                    (  (fromTextNode && m_charInfo.shouldMapTextChar(ch))
+                     || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch)))) 
                 && m_elemContext.m_currentElemDepth > 0)
                 {
                     writer.write("&#");
@@ -1952,16 +2069,82 @@
         for (int i = 0; i < len; i++)
         {
             char ch = stringChars[i];
-            if (escapingNotNeeded(ch) && (!m_charInfo.isSpecialAttrChar(ch)))
-            {
-                writer.write(ch);
-            }
-            else
-            {
+            
+            if (m_charInfo.shouldMapAttrChar(ch)) {
+                // The character is supposed to be replaced by a String
+                // e.g.   '&'  -->  "&amp;"
+                // e.g.   '<'  -->  "&lt;"
                 accumDefaultEscape(writer, ch, i, stringChars, len, false, true);
             }
-        }
+            else {
+                if (0x0 <= ch && ch <= 0x1F) {
+                    // Range 0x00 through 0x1F inclusive
+                    // This covers the non-whitespace control characters
+                    // in the range 0x1 to 0x1F inclusive.
+                    // It also covers the whitespace control characters in the same way:
+                    // 0x9   TAB
+                    // 0xA   NEW LINE
+                    // 0xD   CARRIAGE RETURN
+                    //
+                    // We also cover 0x0 ... It isn't valid
+                    // but we will output "&#0;" 
+                    
+                    // The default will handle this just fine, but this
+                    // is a little performance boost to handle the more
+                    // common TAB, NEW-LINE, CARRIAGE-RETURN
+                    switch (ch) {
 
+                    case CharInfo.S_HORIZONAL_TAB:
+                        writer.write("&#9;");
+                        break;
+                    case CharInfo.S_LINEFEED:
+                        writer.write("&#10;");
+                        break;
+                    case CharInfo.S_CARRIAGERETURN:
+                        writer.write("&#13;");
+                        break;
+                    default:
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        break;
+
+                    }
+                }
+                else if (ch < 0x7F) {   
+                    // Range 0x20 through 0x7E inclusive
+                    // Normal ASCII chars
+                        writer.write(ch);
+                }
+                else if (ch <= 0x9F){
+                    // Range 0x7F through 0x9F inclusive
+                    // More control characters
+                    writer.write("&#");
+                    writer.write(Integer.toString(ch));
+                    writer.write(';');
+                }
+                else if (ch == CharInfo.S_LINE_SEPARATOR) {
+                    // LINE SEPARATOR
+                    writer.write("&#8232;");
+                }
+                else if (m_encodingInfo.isInEncoding(ch)) {
+                    // If the character is in the encoding, and
+                    // not in the normal ASCII range, we also
+                    // just write it out
+                    writer.write(ch);
+                }
+                else {
+                    // This is a fallback plan, we should never get here
+                    // but if the character wasn't previously handled
+                    // (i.e. isn't in the encoding, etc.) then what
+                    // should we do?  We choose to write out a character ref
+                    writer.write("&#");
+                    writer.write(Integer.toString(ch));
+                    writer.write(';');
+                }
+                    
+            }
+        }
     }
 
     /**
@@ -2739,6 +2922,14 @@
                 closeCDATA();
                 m_cdataTagOpen = false;
             }
+            if (m_writer != null) {
+                try {
+                    m_writer.flush();
+                }
+                catch(IOException e) {
+                    // Intentionally ignored: a failed flush is not reported from here
+                }
+            }
     }
 
     public void setContentHandler(ContentHandler ch)

diff --git a/src/org/apache/xml/serializer/ToXMLStream.java b/src/org/apache/xml/serializer/ToXMLStream.java
index 03729b2..102b97c 100644
--- a/src/org/apache/xml/serializer/ToXMLStream.java
+++ b/src/org/apache/xml/serializer/ToXMLStream.java

@@ -52,7 +52,7 @@
      * Map that tells which XML characters should have special treatment, and it
      *  provides character to entity name lookup.
      */
-    private static CharInfo m_xmlcharInfo =
+    private CharInfo m_xmlcharInfo =
 //      new CharInfo(CharInfo.XML_ENTITIES_RESOURCE);
         CharInfo.getCharInfo(CharInfo.XML_ENTITIES_RESOURCE, Method.XML);
commit	666e9b9df6f9f4bef281d4547d088bfa67009bad	[log] [tgz]
author	Brian James Minchau <minchau@apache.org>	Tue Mar 07 16:44:53 2006 +0000
committer	Brian James Minchau <minchau@apache.org>	Tue Mar 07 16:44:53 2006 +0000
tree	f4c27ffa7967d6243665d643316d962001dd47d3
parent	4a54e5ffff9c553a9a9afda566b1d18a601f5390 [diff]