Committing patch in XALANJ-2271 which fixes a bug in
outputting XML 1.1 attributes.
It is also a general clean-up of code related to whether
particular characters have entities, or should be written
as character entities, etc. The code is tricky because
it all depends on:
> method type (xml, html, text)
> character in a text node?
> character in an XML attribute value?
> character in an HTML URL attribute value?
The old code had a concept that the character was "special"
but put plenty of band-aids on that CharInfo API call.
New code has far fewer band-aids. CharInfo basically knows if
the character is mapped to a String (e.g. '<' mapping to "&lt;")
and leaves it more explicitly to the serializer's (e.g. ToXMLStream)
output method (e.g. characters() or writeAttr() ...) to decide
whether it is a character in a text node, or an attribute
value, or
diff --git a/src/org/apache/xml/serializer/CharInfo.java b/src/org/apache/xml/serializer/CharInfo.java
index 9389ac9..65bb5f4 100644
--- a/src/org/apache/xml/serializer/CharInfo.java
+++ b/src/org/apache/xml/serializer/CharInfo.java
@@ -1,5 +1,5 @@
/*
- * Copyright 1999-2004 The Apache Software Foundation.
+ * Copyright 1999-2006 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Enumeration;
+import java.util.HashMap;
import java.util.Hashtable;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
@@ -50,7 +51,7 @@
final class CharInfo
{
/** Given a character, lookup a String to output (e.g. a decorated entity reference). */
- private Hashtable m_charToString = new Hashtable();
+ private HashMap m_charToString;
/**
* The name of the HTML entities file.
@@ -67,42 +68,50 @@
SerializerBase.PKG_NAME+".XMLEntities";
/** The horizontal tab character, which the parser should always normalize. */
- public static final char S_HORIZONAL_TAB = 0x09;
+ static final char S_HORIZONAL_TAB = 0x09;
/** The linefeed character, which the parser should always normalize. */
- public static final char S_LINEFEED = 0x0A;
+ static final char S_LINEFEED = 0x0A;
/** The carriage return character, which the parser should always normalize. */
- public static final char S_CARRIAGERETURN = 0x0D;
+ static final char S_CARRIAGERETURN = 0x0D;
+ static final char S_SPACE = 0x20;
+ static final char S_QUOTE = 0x22;
+ static final char S_LT = 0x3C;
+ static final char S_GT = 0x3E;
+ static final char S_NEL = 0x85;
+ static final char S_LINE_SEPARATOR = 0x2028;
/** This flag is an optimization for HTML entities. It false if entities
* other than quot (34), amp (38), lt (60) and gt (62) are defined
* in the range 0 to 127.
* @xsl.usage internal
*/
- final boolean onlyQuotAmpLtGt;
+ boolean onlyQuotAmpLtGt;
/** Copy the first 0,1 ... ASCII_MAX values into an array */
- private static final int ASCII_MAX = 128;
+ static final int ASCII_MAX = 128;
/** Array of values is faster access than a set of bits
- * to quickly check ASCII characters in attribute values.
+ * to quickly check ASCII characters in attribute values,
+ * the value is true if the character in an attribute value
+ * should be mapped to a String.
*/
- private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
+ private final boolean[] shouldMapAttrChar_ASCII;
/** Array of values is faster access than a set of bits
- * to quickly check ASCII characters in text nodes.
+ * to quickly check ASCII characters in text nodes,
+ * the value is true if the character in a text node
+ * should be mapped to a String.
*/
- private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
-
- private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
+ private final boolean[] shouldMapTextChar_ASCII;
/** An array of bits to record if the character is in the set.
* Although information in this array is complete, the
* isSpecialAttrASCII array is used first because access to its values
* is common and faster.
*/
- private int array_of_bits[] = createEmptySetOfIntegers(65535);
+ private final int array_of_bits[];
// 5 for 32 bit words, 6 for 64 bit words ...
@@ -133,33 +142,38 @@
/**
- * Constructor that reads in a resource file that describes the mapping of
- * characters to entity references.
- * This constructor is private, just to force the use
- * of the getCharInfo(entitiesResource) factory
+ * A base constructor just to explicitly create the fields,
+ * with the exception of m_charToString which is handled
+ * by the constructor that delegates base construction to this one.
+ * <p>
+ * m_charToString is not created here only for performance reasons,
+ * to avoid creating a Hashtable that will be replaced when
+ * making a mutable copy, {@link #mutableCopyOf(CharInfo)}.
*
- * Resource files must be encoded in UTF-8 and can either be properties
- * files with a .properties extension assumed. Alternatively, they can
- * have the following form, with no particular extension assumed:
- *
- * <pre>
- * # First char # is a comment
- * Entity numericValue
- * quot 34
- * amp 38
- * </pre>
- *
- * @param entitiesResource Name of properties or resource file that should
- * be loaded, which describes that mapping of characters to entity
- * references.
*/
- private CharInfo(String entitiesResource, String method)
+ private CharInfo()
{
- this(entitiesResource, method, false);
- }
+ this.array_of_bits = createEmptySetOfIntegers(65535);
+ this.firstWordNotUsed = 0;
+ this.shouldMapAttrChar_ASCII = new boolean[ASCII_MAX];
+ this.shouldMapTextChar_ASCII = new boolean[ASCII_MAX];
+ this.m_charKey = new CharKey();
+
+ // Not set here, but in a constructor that uses this one
+ // this.m_charToString = new Hashtable();
+
+ this.onlyQuotAmpLtGt = true;
+
+ return;
+ }
+
private CharInfo(String entitiesResource, String method, boolean internal)
{
+ // call the default constructor to create the fields
+ this();
+ m_charToString = new HashMap();
+
ResourceBundle entities = null;
boolean noExtraEntities = true;
@@ -185,12 +199,10 @@
String name = (String) keys.nextElement();
String value = entities.getString(name);
int code = Integer.parseInt(value);
- defineEntity(name, (char) code);
- if (extraEntity(code))
+ boolean extra = defineEntity(name, (char) code);
+ if (extra)
noExtraEntities = false;
}
- set(S_LINEFEED);
- set(S_CARRIAGERETURN);
} else {
InputStream is = null;
@@ -274,8 +286,8 @@
int code = Integer.parseInt(value);
- defineEntity(name, (char) code);
- if (extraEntity(code))
+ boolean extra = defineEntity(name, (char) code);
+ if (extra)
noExtraEntities = false;
}
}
@@ -284,8 +296,6 @@
}
is.close();
- set(S_LINEFEED);
- set(S_CARRIAGERETURN);
} catch (Exception e) {
throw new RuntimeException(
Utils.messages.createMessage(
@@ -302,31 +312,8 @@
}
}
}
-
- /* initialize the array isCleanTextASCII[] with a cache of values
- * for use by ToStream.character(char[], int , int)
- * and the array isSpecialTextASCII[] with the opposite values
- * (all in the name of performance!)
- */
- for (int ch = 0; ch <ASCII_MAX; ch++)
- if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
- && (!get(ch))) || ('"' == ch))
- {
- isCleanTextASCII[ch] = true;
- isSpecialTextASCII[ch] = false;
- }
- else {
- isCleanTextASCII[ch] = false;
- isSpecialTextASCII[ch] = true;
- }
-
-
onlyQuotAmpLtGt = noExtraEntities;
-
- // initialize the array with a cache of the BitSet values
- for (int i=0; i<ASCII_MAX; i++)
- isSpecialAttrASCII[i] = get(i);
/* Now that we've used get(ch) just above to initialize the
* two arrays we will change by adding a tab to the set of
@@ -338,8 +325,19 @@
* This is the reason for this delay.
*/
if (Method.XML.equals(method))
- {
- isSpecialAttrASCII[S_HORIZONAL_TAB] = true;
+ {
+ // We choose not to escape the quotation mark as " in text nodes
+ shouldMapTextChar_ASCII[S_QUOTE] = false;
+ }
+
+ if (Method.HTML.equals(method)) {
+ // The XSLT 1.0 recommendation says
+ // "The html output method should not escape < characters occurring in attribute values."
+ // So we don't escape '<' in an attribute for HTML
+ shouldMapAttrChar_ASCII['<'] = false;
+
+ // We choose not to escape the quotation mark as " in text nodes.
+ shouldMapTextChar_ASCII[S_QUOTE] = false;
}
}
@@ -348,23 +346,37 @@
* supplied. Nothing happens if the character reference is already defined.
* <p>Unlike internal entities, character references are a string to single
* character mapping. They are used to map non-ASCII characters both on
- * parsing and printing, primarily for HTML documents. '<amp;' is an
+ * parsing and printing, primarily for HTML documents. '&lt;' is an
* example of a character reference.</p>
*
* @param name The entity's name
* @param value The entity's value
+ * @return true if the mapping is not one of:
+ * <ul>
+ * <li> '<' to "<"
+ * <li> '>' to ">"
+ * <li> '&' to "&"
+ * <li> '"' to """
+ * </ul>
*/
- private void defineEntity(String name, char value)
+ private boolean defineEntity(String name, char value)
{
StringBuffer sb = new StringBuffer("&");
sb.append(name);
sb.append(';');
String entityString = sb.toString();
- defineChar2StringMapping(entityString, value);
+ boolean extra = defineChar2StringMapping(entityString, value);
+ return extra;
}
- private CharKey m_charKey = new CharKey();
+ /**
+ * A utility object, just used to map characters to output Strings,
+ * needed because a HashMap needs to map an object as a key, not a
+ * Java primitive type, like a char, so this object gets around that
+ * and it is reusable.
+ */
+ private final CharKey m_charKey;
/**
* Map a character to a String. For example given
@@ -388,7 +400,7 @@
* @return The String that the character is mapped to, or null if not found.
* @xsl.usage internal
*/
- synchronized String getOutputStringForChar(char value)
+ String getOutputStringForChar(char value)
{
// CharKey m_charKey = new CharKey(); //Alternative to synchronized
m_charKey.setChar(value);
@@ -397,21 +409,20 @@
/**
* Tell if the character argument that is from
- * an attribute value should have special treatment.
+ * an attribute value has a mapping to a String.
*
* @param value the value of a character that is in an attribute value
* @return true if the character should have any special treatment,
- * such as when writing out attribute values,
- * or entity references.
+ * such as when writing out entity references.
* @xsl.usage internal
*/
- final boolean isSpecialAttrChar(int value)
+ final boolean shouldMapAttrChar(int value)
{
// for performance try the values in the boolean array first,
// this is faster access than the BitSet for common ASCII values
if (value < ASCII_MAX)
- return isSpecialAttrASCII[value];
+ return shouldMapAttrChar_ASCII[value];
// rather than java.util.BitSet, our private
// implementation is faster (and less general).
@@ -420,46 +431,27 @@
/**
* Tell if the character argument that is from a
- * text node should have special treatment.
+ * text node has a mapping to a String, for example
+ * to map '<' to "<".
*
* @param value the value of a character that is in a text node
- * @return true if the character should have any special treatment,
- * such as when writing out attribute values,
- * or entity references.
+ * @return true if the character has a mapping to a String,
+ * such as when writing out entity references.
* @xsl.usage internal
*/
- final boolean isSpecialTextChar(int value)
+ final boolean shouldMapTextChar(int value)
{
// for performance try the values in the boolean array first,
// this is faster access than the BitSet for common ASCII values
if (value < ASCII_MAX)
- return isSpecialTextASCII[value];
+ return shouldMapTextChar_ASCII[value];
// rather than java.util.BitSet, our private
// implementation is faster (and less general).
return get(value);
}
- /**
- * This method is used to determine if an ASCII character in
- * a text node (not an attribute value) is "clean".
- * @param value the character to check (0 to 127).
- * @return true if the character can go to the writer as-is
- * @xsl.usage internal
- */
- final boolean isTextASCIIClean(int value)
- {
- return isCleanTextASCII[value];
- }
-
-// In the future one might want to use the array directly and avoid
-// the method call, but I think the JIT alreay inlines this well enough
-// so don't do it (for now) - bjm
-// public final boolean[] getASCIIClean()
-// {
-// return isCleanTextASCII;
-// }
private static CharInfo getCharInfoBasedOnPrivilege(
@@ -496,15 +488,17 @@
{
CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
if (charInfo != null) {
- return charInfo;
+ return mutableCopyOf(charInfo);
}
// try to load it internally - cache
try {
charInfo = getCharInfoBasedOnPrivilege(entitiesFileName,
method, true);
+ // Put the common copy of charInfo in the cache, but return
+ // a copy of it.
m_getCharInfoCache.put(entitiesFileName, charInfo);
- return charInfo;
+ return mutableCopyOf(charInfo);
} catch (Exception e) {}
// try to load it externally - do not cache
@@ -531,7 +525,41 @@
method, false);
}
- /** Table of user-specified char infos. */
+ /**
+ * Create a mutable copy of the cached one.
+ * @param charInfo The cached one.
+ * @return
+ */
+ private static CharInfo mutableCopyOf(CharInfo charInfo) {
+ CharInfo copy = new CharInfo();
+
+ int max = charInfo.array_of_bits.length;
+ System.arraycopy(charInfo.array_of_bits,0,copy.array_of_bits,0,max);
+
+ copy.firstWordNotUsed = charInfo.firstWordNotUsed;
+
+ max = charInfo.shouldMapAttrChar_ASCII.length;
+ System.arraycopy(charInfo.shouldMapAttrChar_ASCII,0,copy.shouldMapAttrChar_ASCII,0,max);
+
+ max = charInfo.shouldMapTextChar_ASCII.length;
+ System.arraycopy(charInfo.shouldMapTextChar_ASCII,0,copy.shouldMapTextChar_ASCII,0,max);
+
+ // utility field copy.m_charKey is already created in the default constructor
+
+ copy.m_charToString = (HashMap) charInfo.m_charToString.clone();
+
+ copy.onlyQuotAmpLtGt = charInfo.onlyQuotAmpLtGt;
+
+ return copy;
+ }
+
+ /**
+ * Table of user-specified char infos.
+ * The table maps entify file names (the name of the
+ * property file without the .properties extension)
+ * to CharInfo objects populated with entities defined in
+ * corresponding property file.
+ */
private static Hashtable m_getCharInfoCache = new Hashtable();
/**
@@ -573,7 +601,8 @@
* the creation of the set.
*/
private final void set(int i) {
- setASCIIdirty(i);
+ setASCIItextDirty(i);
+ setASCIIattrDirty(i);
int j = (i >> SHIFT_PER_WORD); // this word is used
int k = j + 1;
@@ -608,24 +637,43 @@
return in_the_set;
}
- // record if there are any entities other than
- // quot, amp, lt, gt (probably user defined)
/**
- * @return true if the entity
- * @param code The value of the character that has an entity defined
- * for it.
+ * This method returns true if there are some non-standard mappings to
+ * entities other than quot, amp, lt, gt, and its only purpose is for
+ * performance.
+ * @param charToMap The value of the character that is mapped to a String
+ * @param outputString The String to which the character is mapped, usually
+ * an entity reference such as "<".
+ * @return true if the mapping is not one of:
+ * <ul>
+ * <li> '<' to "<"
+ * <li> '>' to ">"
+ * <li> '&' to "&"
+ * <li> '"' to """
+ * </ul>
*/
- private boolean extraEntity(int entityValue)
+ private boolean extraEntity(String outputString, int charToMap)
{
boolean extra = false;
- if (entityValue < 128)
+ if (charToMap < ASCII_MAX)
{
- switch (entityValue)
+ switch (charToMap)
{
- case 34 : // quot
- case 38 : // amp
- case 60 : // lt
- case 62 : // gt
+ case '"' : // quot
+ if (!outputString.equals("""))
+ extra = true;
+ break;
+ case '&' : // amp
+ if (!outputString.equals("&"))
+ extra = true;
+ break;
+ case '<' : // lt
+ if (!outputString.equals("<"))
+ extra = true;
+ break;
+ case '>' : // gt
+ if (!outputString.equals(">"))
+ extra = true;
break;
default : // other entity in range 0 to 127
extra = true;
@@ -635,48 +683,61 @@
}
/**
- * If the character is a printable ASCII character then
- * mark it as not clean and needing replacement with
- * a String on output.
+ * If the character is in the ASCII range then
+ * mark it as needing replacement with
+ * a String on output if it occurs in a text node.
* @param ch
*/
- private void setASCIIdirty(int j)
+ private void setASCIItextDirty(int j)
{
if (0 <= j && j < ASCII_MAX)
{
- isCleanTextASCII[j] = false;
- isSpecialTextASCII[j] = true;
+ shouldMapTextChar_ASCII[j] = true;
+ }
+ }
+
+ /**
+ * If the character is in the ASCII range then
+ * mark it as needing replacement with
+ * a String on output if it occurs in a attribute value.
+ * @param ch
+ */
+ private void setASCIIattrDirty(int j)
+ {
+ if (0 <= j && j < ASCII_MAX)
+ {
+ shouldMapAttrChar_ASCII[j] = true;
}
}
- /**
- * If the character is a printable ASCII character then
- * mark it as and not needing replacement with
- * a String on output.
- * @param ch
- */
- private void setASCIIclean(int j)
- {
- if (0 <= j && j < ASCII_MAX)
- {
- isCleanTextASCII[j] = true;
- isSpecialTextASCII[j] = false;
- }
- }
- void defineChar2StringMapping(String outputString, char inputChar)
+ /**
+ * Call this method to register a char to String mapping, for example
+ * to map '<' to "<".
+ * @param outputString The String to map to.
+ * @param inputChar The char to map from.
+ * @return true if the mapping is not one of:
+ * <ul>
+ * <li> '<' to "<"
+ * <li> '>' to ">"
+ * <li> '&' to "&"
+ * <li> '"' to """
+ * </ul>
+ */
+ boolean defineChar2StringMapping(String outputString, char inputChar)
{
CharKey character = new CharKey(inputChar);
m_charToString.put(character, outputString);
- set(inputChar);
+ set(inputChar); // mark the character has having a mapping to a String
+
+ boolean extraMapping = extraEntity(outputString, inputChar);
+ return extraMapping;
+
}
/**
* Simple class for fast lookup of char values, when used with
* hashtables. You can set the char, then use it as a key.
- *
- * This class is a copy of the one in org.apache.xml.utils.
- * It exists to cut the serializers dependancy on that package.
*
* @xsl.usage internal
*/
diff --git a/src/org/apache/xml/serializer/ToHTMLStream.java b/src/org/apache/xml/serializer/ToHTMLStream.java
index 5773a45..fc648a5 100644
--- a/src/org/apache/xml/serializer/ToHTMLStream.java
+++ b/src/org/apache/xml/serializer/ToHTMLStream.java
@@ -54,7 +54,7 @@
* Map that tells which XML characters should have special treatment, and it
* provides character to entity name lookup.
*/
- private static final CharInfo m_htmlcharInfo =
+ private final CharInfo m_htmlcharInfo =
// new CharInfo(CharInfo.HTML_ENTITIES_RESOURCE);
CharInfo.getCharInfo(CharInfo.HTML_ENTITIES_RESOURCE, Method.HTML);
@@ -1377,7 +1377,7 @@
// System.out.println("ch: "+(int)ch);
// System.out.println("m_maxCharacter: "+(int)m_maxCharacter);
// System.out.println("m_attrCharsMap[ch]: "+(int)m_attrCharsMap[ch]);
- if (escapingNotNeeded(ch) && (!m_charInfo.isSpecialAttrChar(ch)))
+ if (escapingNotNeeded(ch) && (!m_charInfo.shouldMapAttrChar(ch)))
{
cleanLength++;
}
diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java
index 133eaa6..f0026bb 100644
--- a/src/org/apache/xml/serializer/ToStream.java
+++ b/src/org/apache/xml/serializer/ToStream.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2001-2005 The Apache Software Foundation.
+ * Copyright 2001-2006 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -905,7 +905,8 @@
{
// This is the old/fast code here, but is this
// correct for all encodings?
- if (ch >= 0x20 || (0x0A == ch || 0x0D == ch || 0x09 == ch))
+ if (ch >= CharInfo.S_SPACE || (CharInfo.S_LINEFEED == ch ||
+ CharInfo.S_CARRIAGERETURN == ch || CharInfo.S_HORIZONAL_TAB == ch))
ret= true;
else
ret = false;
@@ -1014,7 +1015,7 @@
*
* @throws java.io.IOException
*/
- protected int accumDefaultEntity(
+ int accumDefaultEntity(
java.io.Writer writer,
char ch,
int i,
@@ -1033,7 +1034,7 @@
{
// if this is text node character and a special one of those,
// or if this is a character from attribute value and a special one of those
- if ((fromTextNode && m_charInfo.isSpecialTextChar(ch)) || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))
+ if ((fromTextNode && m_charInfo.shouldMapTextChar(ch)) || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch)))
{
String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
@@ -1387,8 +1388,7 @@
if (m_cdataTagOpen)
closeCDATA();
- // the check with _escaping is a bit of a hack for XLSTC
-
+
if (m_disableOutputEscapingStates.peekOrFalse() || (!m_escaping))
{
charactersRaw(chars, start, length);
@@ -1410,82 +1410,175 @@
try
{
int i;
- char ch1;
int startClean;
// skip any leading whitspace
// don't go off the end and use a hand inlined version
// of isWhitespace(ch)
final int end = start + length;
- int lastDirty = start - 1; // last character that needed processing
- for (i = start;
- ((i < end)
- && ((ch1 = chars[i]) == 0x20
- || (ch1 == 0xA && m_lineSepUse)
- || ch1 == 0xD
- || ch1 == 0x09));
- i++)
- {
- /*
- * We are processing leading whitespace, but are doing the same
- * processing for dirty characters here as for non-whitespace.
- *
- */
- if (!m_charInfo.isTextASCIIClean(ch1))
- {
- lastDirty = processDirty(chars,end, i,ch1, lastDirty, true);
- i = lastDirty;
+ int lastDirtyCharProcessed = start - 1; // last non-clean character that was processed
+ // that was processed
+ final Writer writer = m_writer;
+ boolean isAllWhitespace = true;
+
+ // process any leading whitspace
+ i = start;
+ while (i < end && isAllWhitespace) {
+ char ch1 = chars[i];
+
+ if (m_charInfo.shouldMapTextChar(ch1)) {
+ // The character is supposed to be replaced by a String
+ // so write out the clean whitespace characters accumulated
+ // so far
+ // then the String.
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ String outputStringForChar = m_charInfo
+ .getOutputStringForChar(ch1);
+ writer.write(outputStringForChar);
+ // We can't say that everything we are writing out is
+ // all whitespace, we just wrote out a String.
+ isAllWhitespace = false;
+ lastDirtyCharProcessed = i; // mark the last non-clean
+ // character processed
+ i++;
+ } else {
+ // The character is clean, but is it a whitespace ?
+ switch (ch1) {
+ // TODO: Any other whitespace to consider?
+ case CharInfo.S_SPACE:
+ // Just accumulate the clean whitespace
+ i++;
+ break;
+ case CharInfo.S_LINEFEED:
+ lastDirtyCharProcessed = processLineFeed(chars, i,
+ lastDirtyCharProcessed, writer);
+ i++;
+ break;
+ case CharInfo.S_CARRIAGERETURN:
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write(" ");
+ lastDirtyCharProcessed = i;
+ i++;
+ break;
+ case CharInfo.S_HORIZONAL_TAB:
+ // Just accumulate the clean whitespace
+ i++;
+ break;
+ default:
+ // The character was clean, but not a whitespace
+ // so break the loop to continue with this character
+ // (we don't increment index i !!)
+ isAllWhitespace = false;
+ break;
+ }
}
}
+
/* If there is some non-whitespace, mark that we may need
* to preserve this. This is only important if we have indentation on.
*/
- if (i < end)
+ if (i < end || !isAllWhitespace)
m_ispreserve = true;
-
-
-// int lengthClean; // number of clean characters in a row
-// final boolean[] isAsciiClean = m_charInfo.getASCIIClean();
- final boolean isXML10 = XMLVERSION10.equals(getVersion());
- // we've skipped the leading whitespace, now deal with the rest
+
for (; i < end; i++)
- {
- {
- // A tight loop to skip over common clean chars
- // This tight loop makes it easier for the JIT
- // to optimize.
- char ch2;
- while (i<end
- && ((ch2 = chars[i])<127)
- && m_charInfo.isTextASCIIClean(ch2))
- i++;
- if (i == end)
- break;
- }
-
- final char ch = chars[i];
- /* The check for isCharacterInC0orC1Ranger and
- * isNELorLSEPCharacter has been added
- * to support Control Characters in XML 1.1
- */
- if (!isCharacterInC0orC1Range(ch) &&
- (isXML10 || !isNELorLSEPCharacter(ch)) &&
- (escapingNotNeeded(ch) && (!m_charInfo.isSpecialTextChar(ch)))
- || ('"' == ch))
- {
- ; // a character needing no special processing
+ {
+ char ch = chars[i];
+
+ if (m_charInfo.shouldMapTextChar(ch)) {
+ // The character is supposed to be replaced by a String
+ // e.g. '&' --> "&"
+ // e.g. '<' --> "<"
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
+ writer.write(outputStringForChar);
+ lastDirtyCharProcessed = i;
}
- else
- {
- lastDirty = processDirty(chars,end, i, ch, lastDirty, true);
- i = lastDirty;
+ else {
+ if (ch <= 0x1F) {
+ // Range 0x00 through 0x1F inclusive
+ //
+ // This covers the non-whitespace control characters
+ // in the range 0x1 to 0x1F inclusive.
+ // It also covers the whitespace control characters in the same way:
+ // 0x9 TAB
+ // 0xA NEW LINE
+ // 0xD CARRIAGE RETURN
+ //
+ // We also cover 0x0 ... It isn't valid
+ // but we will output "�"
+
+ // The default will handle this just fine, but this
+ // is a little performance boost to handle the more
+ // common TAB, NEW-LINE, CARRIAGE-RETURN
+ switch (ch) {
+
+ case CharInfo.S_HORIZONAL_TAB:
+ // Leave whitespace TAB as a real character
+ break;
+ case CharInfo.S_LINEFEED:
+ lastDirtyCharProcessed = processLineFeed(chars, i, lastDirtyCharProcessed, writer);
+ break;
+ case CharInfo.S_CARRIAGERETURN:
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write(" ");
+ lastDirtyCharProcessed = i;
+ // Leave whitespace carriage return as a real character
+ break;
+ default:
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ lastDirtyCharProcessed = i;
+ break;
+
+ }
+ }
+ else if (ch < 0x7F) {
+ // Range 0x20 through 0x7E inclusive
+ // Normal ASCII chars, do nothing, just add it to
+ // the clean characters
+
+ }
+ else if (ch <= 0x9F){
+ // Range 0x7F through 0x9F inclusive
+ // More control characters, including NEL (0x85)
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ lastDirtyCharProcessed = i;
+ }
+ else if (ch == CharInfo.S_LINE_SEPARATOR) {
+ // LINE SEPARATOR
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write("
");
+ lastDirtyCharProcessed = i;
+ }
+ else if (m_encodingInfo.isInEncoding(ch)) {
+ // If the character is in the encoding, and
+ // not in the normal ASCII range, we also
+ // just leave it get added on to the clean characters
+
+ }
+ else {
+ // This is a fallback plan, we should never get here
+ // but if the character wasn't previously handled
+ // (i.e. isn't in the encoding, etc.) then what
+ // should we do? We choose to write out an entity
+ writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ lastDirtyCharProcessed = i;
+ }
}
}
// we've reached the end. Any clean characters at the
// end of the array than need to be written out?
- startClean = lastDirty + 1;
+ startClean = lastDirtyCharProcessed + 1;
if (i > startClean)
{
int lengthClean = i - startClean;
@@ -1503,6 +1596,31 @@
// time to fire off characters generation event
if (m_tracer != null)
super.fireCharEvent(chars, start, length);
+ }
+
+ private int processLineFeed(final char[] chars, int i, int lastProcessed, final Writer writer) throws IOException {
+ if (!m_lineSepUse
+ || (m_lineSepLen ==1 && m_lineSep[0] == CharInfo.S_LINEFEED)){
+ // We are leaving the new-line alone, and it is just
+ // being added to the 'clean' characters,
+ // so the last dirty character processed remains unchanged
+ }
+ else {
+ writeOutCleanChars(chars, i, lastProcessed);
+ writer.write(m_lineSep, 0, m_lineSepLen);
+ lastProcessed = i;
+ }
+ return lastProcessed;
+ }
+
+ private void writeOutCleanChars(final char[] chars, int i, int lastProcessed) throws IOException {
+ int startClean;
+ startClean = lastProcessed + 1;
+ if (startClean < i)
+ {
+ int lengthClean = i - startClean;
+ m_writer.write(chars, startClean, lengthClean);
+ }
}
/**
* This method checks if a given character is between C0 or C1 range
@@ -1623,7 +1741,7 @@
*
* @throws org.xml.sax.SAXException
*/
- protected int accumDefaultEscape(
+ private int accumDefaultEscape(
Writer writer,
char ch,
int i,
@@ -1687,16 +1805,15 @@
* to write it out as Numeric Character Reference(NCR) regardless of XML Version
* being used for output document.
*/
- if (isCharacterInC0orC1Range(ch) ||
- (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
+ if (isCharacterInC0orC1Range(ch) || isNELorLSEPCharacter(ch))
{
writer.write("&#");
writer.write(Integer.toString(ch));
writer.write(';');
}
else if ((!escapingNotNeeded(ch) ||
- ( (fromTextNode && m_charInfo.isSpecialTextChar(ch))
- || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
+ ( (fromTextNode && m_charInfo.shouldMapTextChar(ch))
+ || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch))))
&& m_elemContext.m_currentElemDepth > 0)
{
writer.write("&#");
@@ -1952,16 +2069,82 @@
for (int i = 0; i < len; i++)
{
char ch = stringChars[i];
- if (escapingNotNeeded(ch) && (!m_charInfo.isSpecialAttrChar(ch)))
- {
- writer.write(ch);
- }
- else
- {
+
+ if (m_charInfo.shouldMapAttrChar(ch)) {
+ // The character is supposed to be replaced by a String
+ // e.g. '&' --> "&"
+ // e.g. '<' --> "<"
accumDefaultEscape(writer, ch, i, stringChars, len, false, true);
}
- }
+ else {
+ if (0x0 <= ch && ch <= 0x1F) {
+ // Range 0x00 through 0x1F inclusive
+ // This covers the non-whitespace control characters
+ // in the range 0x1 to 0x1F inclusive.
+ // It also covers the whitespace control characters in the same way:
+ // 0x9 TAB
+ // 0xA NEW LINE
+ // 0xD CARRIAGE RETURN
+ //
+ // We also cover 0x0 ... It isn't valid
+ // but we will output "�"
+
+ // The default will handle this just fine, but this
+ // is a little performance boost to handle the more
+ // common TAB, NEW-LINE, CARRIAGE-RETURN
+ switch (ch) {
+ case CharInfo.S_HORIZONAL_TAB:
+ writer.write("	");
+ break;
+ case CharInfo.S_LINEFEED:
+ writer.write(" ");
+ break;
+ case CharInfo.S_CARRIAGERETURN:
+ writer.write(" ");
+ break;
+ default:
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ break;
+
+ }
+ }
+ else if (ch < 0x7F) {
+ // Range 0x20 through 0x7E inclusive
+ // Normal ASCII chars
+ writer.write(ch);
+ }
+ else if (ch <= 0x9F){
+ // Range 0x7F through 0x9F inclusive
+ // More control characters
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ }
+ else if (ch == CharInfo.S_LINE_SEPARATOR) {
+ // LINE SEPARATOR
+ writer.write("
");
+ }
+ else if (m_encodingInfo.isInEncoding(ch)) {
+ // If the character is in the encoding, and
+ // not in the normal ASCII range, we also
+ // just write it out
+ writer.write(ch);
+ }
+ else {
+ // This is a fallback plan, we should never get here
+ // but if the character wasn't previously handled
+ // (i.e. isn't in the encoding, etc.) then what
+ // should we do? We choose to write out a character ref
+ writer.write("&#");
+ writer.write(Integer.toString(ch));
+ writer.write(';');
+ }
+
+ }
+ }
}
/**
@@ -2739,6 +2922,14 @@
closeCDATA();
m_cdataTagOpen = false;
}
+ if (m_writer != null) {
+ try {
+ m_writer.flush();
+ }
+ catch(IOException e) {
+ // what? me worry?
+ }
+ }
}
public void setContentHandler(ContentHandler ch)
diff --git a/src/org/apache/xml/serializer/ToXMLStream.java b/src/org/apache/xml/serializer/ToXMLStream.java
index 03729b2..102b97c 100644
--- a/src/org/apache/xml/serializer/ToXMLStream.java
+++ b/src/org/apache/xml/serializer/ToXMLStream.java
@@ -52,7 +52,7 @@
* Map that tells which XML characters should have special treatment, and it
* provides character to entity name lookup.
*/
- private static CharInfo m_xmlcharInfo =
+ private CharInfo m_xmlcharInfo =
// new CharInfo(CharInfo.XML_ENTITIES_RESOURCE);
CharInfo.getCharInfo(CharInfo.XML_ENTITIES_RESOURCE, Method.XML);