blob: 4aed29ad5f5b8b5a28b6e67658954340c7faf3f5 [file] [log] [blame]
// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance *
// * with the License. You may obtain a copy of the License at *
// * *
// * http://www.apache.org/licenses/LICENSE-2.0 *
// * *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the *
// * specific language governing permissions and limitations under the License. *
// ***************************************************************************************************************************
package org.apache.juneau.xml;
import static org.apache.juneau.internal.StringUtils.*;
import java.io.*;
import java.util.*;
import javax.xml.stream.*;
import org.apache.juneau.*;
import org.apache.juneau.internal.*;
import org.apache.juneau.xml.annotation.*;
/**
* XML utility methods.
*/
public final class XmlUtils {
//-----------------------------------------------------------------------------------------------------------------
// XML element names
//-----------------------------------------------------------------------------------------------------------------
/**
 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
 *
 * <p>
 * A <jk>null</jk> object is written as <c>_x0000_</c> and an empty name as <c>_xE000_</c>, which is the same
 * result {@link #encodeElementName(Object)} returns for those inputs.
 *
 * @param w The writer to send the output to.
 * @param o The object being encoded.
 * @return The same writer passed in.
 * @throws IOException Thrown by the writer.
 */
public static final Writer encodeElementName(Writer w, Object o) throws IOException {
	if (o == null)
		return w.append("_x0000_");
	String s = o.toString();
	// An empty name must not be written verbatim: it would yield an empty tag name.
	// Use the same marker the String-returning overload uses so decode() can restore it.
	if (s.isEmpty())
		return w.append("_xE000_");
	if (needsElementNameEncoding(s))
		return encodeElementNameInner(w, s);
	w.append(s);
	return w;
}
/**
 * Returns the string form of the specified object with every character that is not usable in an XML element
 * name replaced by a <c>_x####_</c> sequence.
 *
 * @param o The object being encoded.
 * @return
 * 	The encoded element name: <js>"_x0000_"</js> for <jk>null</jk>, <js>"_xE000_"</js> for an empty string,
 * 	or the string itself when the quick pre-check finds nothing to encode.
 */
public static final String encodeElementName(Object o) {
	if (o == null)
		return "_x0000_";
	final String name = o.toString();
	if (name.isEmpty())
		return "_xE000_";
	// Common case: nothing to encode, so hand back the original string untouched.
	if (! needsElementNameEncoding(name))
		return name;
	try (Writer buffer = new StringBuilderWriter(name.length() * 2)) {
		return encodeElementNameInner(buffer, name).toString();
	} catch (IOException e) {
		throw new RuntimeException(e); // Never happens
	}
}
// Copies the name to the writer one character at a time, replacing every character that is not accepted
// at its position with a _x####_ sequence.
private static final Writer encodeElementNameInner(Writer w, String s) throws IOException {
	final int length = s.length();
	for (int pos = 0; pos < length; pos++) {
		final char ch = s.charAt(pos);

		// Characters accepted at any position, including the first.
		final boolean anywhere =
			(ch >= 'A' && ch <= 'Z')
			|| (ch >= 'a' && ch <= 'z')
			|| (ch >= '\u00c0' && ch <= '\u00d6')
			|| (ch >= '\u00d8' && ch <= '\u00f6')
			|| (ch >= '\u00f8' && ch <= '\u02ff')
			|| (ch >= '\u0370' && ch <= '\u037d')
			|| (ch >= '\u037f' && ch <= '\u1fff')
			|| (ch >= '\u200c' && ch <= '\u200d')
			|| (ch >= '\u2070' && ch <= '\u218f')
			|| (ch >= '\u2c00' && ch <= '\u2fef')
			|| (ch >= '\u3001' && ch <= '\ud7ff')
			|| (ch >= '\uf900' && ch <= '\ufdcf')
			|| (ch >= '\ufdf0' && ch <= '\ufffd');

		// Characters accepted only after the first position.
		final boolean afterFirstOnly =
			ch == '-'
			|| ch == '.'
			|| (ch >= '0' && ch <= '9')
			|| ch == '\u00b7'
			|| (ch >= '\u0300' && ch <= '\u036f')
			|| (ch >= '\u203f' && ch <= '\u2040');

		// An underscore is copied as-is unless it starts text that already looks like _x####_;
		// in that case it is encoded so decoding does not misread the original text.
		final boolean plainUnderscore = ch == '_' && ! isEscapeSequence(s, pos);

		if (anywhere || plainUnderscore || (pos != 0 && afterFirstOnly))
			w.append(ch);
		else
			appendPaddedHexChar(w, ch);
	}
	return w;
}
// Cheap pre-check used before the full element-name encoder runs.
// Returns false only for strings made up solely of ASCII letters and digits that do not start with a digit
// (the empty string included); anything else is reported as needing encoding.
// Note that this doesn't need to be perfect, just fast.
private static final boolean needsElementNameEncoding(String s) {
	final int n = s.length();
	if (n == 0)
		return false;
	final char first = s.charAt(0);
	if (first >= '0' && first <= '9')
		return true;
	for (int k = 0; k < n; k++) {
		final char ch = s.charAt(k);
		final boolean alphanumeric =
			(ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
		if (! alphanumeric)
			return true;
	}
	return false;
}
//-----------------------------------------------------------------------------------------------------------------
// XML element text
//-----------------------------------------------------------------------------------------------------------------
/**
 * Escapes invalid XML text characters to <c>_x####_</c> sequences.
 *
 * <p>
 * Whitespace in the first or last position, and any underscore that begins text already shaped like
 * <c>_x####_</c>, are escaped as well.
 * Characters such as <js>'&amp;'</js> and <js>'&lt;'</js> are copied through unchanged by this method.
 *
 * @param o The object being encoded.
 * @return The encoded string, or <js>"_x0000_"</js> if the object is <jk>null</jk>.
 */
public static final String escapeText(Object o) {
	if (o == null)
		return "_x0000_";
	final String text = o.toString();
	if (! needsTextEncoding(text))
		return text;
	final int last = text.length() - 1;
	final StringWriter out = new StringWriter(text.length() * 2);
	try {
		for (int pos = 0; pos <= last; pos++) {
			final char ch = text.charAt(pos);
			final boolean edgeWhitespace = (pos == 0 || pos == last) && Character.isWhitespace(ch);
			final boolean escapeLookalike = ch == '_' && isEscapeSequence(text, pos);
			if (edgeWhitespace || escapeLookalike || ! isValidXmlCharacter(ch))
				appendPaddedHexChar(out, ch);
			else
				out.append(ch);
		}
	} catch (IOException e) {
		throw new RuntimeException(e); // Never happens
	}
	return out.toString();
}
/**
 * Encodes the specified element text and sends the results to the specified writer.
 *
 * <p>
 * A <jk>null</jk> object is written as <c>_x0000_</c> and an empty string as <c>_xE000_</c>.
 * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities, and tab, line-feed and
 * carriage-return as numeric character references.
 * <br>Encodes invalid XML text characters, and underscores that begin text already shaped like
 * <c>_x####_</c>, to <c>_x####_</c> sequences.
 *
 * @param w The writer to send the output to.
 * @param o The object being encoded.
 * @param trim Trim the text before serializing it.
 * @param preserveWhitespace
 * 	Specifies whether we're in preserve-whitespace mode
 * 	(e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}).
 * 	If <jk>false</jk>, whitespace in the first or last position is written as a <c>_x####_</c> sequence.
 * 	If <jk>true</jk>, it is handled like any other character.
 * @return The same writer passed in.
 * @throws IOException Thrown from the writer.
 */
public static final Writer encodeText(Writer w, Object o, boolean trim, boolean preserveWhitespace) throws IOException {
	if (o == null)
		return w.append("_x0000_");
	String text = o.toString();
	if (text.isEmpty())
		return w.append("_xE000_");
	if (trim)
		text = text.trim();

	// Fast path: nothing to encode, write the text as-is.
	if (! needsTextEncoding(text)) {
		w.append(text);
		return w;
	}

	final int last = text.length() - 1;
	for (int pos = 0; pos <= last; pos++) {
		final char ch = text.charAt(pos);
		final boolean atEdge = pos == 0 || pos == last;
		if (atEdge && ! preserveWhitespace && Character.isWhitespace(ch))
			appendPaddedHexChar(w, ch);
		else if (REPLACE_TEXT.contains(ch))
			w.append(REPLACE_TEXT.get(ch));
		else if (ch == '_' && isEscapeSequence(text, pos))
			appendPaddedHexChar(w, ch);
		else if (! isValidXmlCharacter(ch))
			appendPaddedHexChar(w, ch);
		else
			w.append(ch);
	}
	return w;
}
// Pre-scan that decides whether element text has to go through the character-by-character encoder.
// Conversion is somewhat expensive, so callers check this first.
// Returns true as soon as it sees whitespace in the first or last position, a character that has an entity
// replacement, a character that is not valid XML text, or an underscore starting a _x####_ look-alike.
private static final boolean needsTextEncoding(String s) {
	final int last = s.length() - 1;
	for (int pos = 0; pos <= last; pos++) {
		final char ch = s.charAt(pos);
		final boolean edgeWhitespace = (pos == 0 || pos == last) && Character.isWhitespace(ch);
		if (edgeWhitespace || REPLACE_TEXT.contains(ch) || ! isValidXmlCharacter(ch))
			return true;
		if (ch == '_' && isEscapeSequence(s, pos))
			return true;
	}
	return false;
}
// Characters that the element-text encoder replaces with an XML entity or numeric character reference.
// Declared final: this is a shared lookup table that must never be reassigned after class initialization.
private static final AsciiMap REPLACE_TEXT = new AsciiMap()
	.append('&', "&amp;")
	.append('<', "&lt;")
	.append('>', "&gt;")
	.append((char)0x09, "&#x0009;")
	.append((char)0x0A, "&#x000a;")
	.append((char)0x0D, "&#x000d;");
//-----------------------------------------------------------------------------------------------------------------
// XML attribute names
//-----------------------------------------------------------------------------------------------------------------
/**
 * Serializes and encodes the specified object as a valid XML attribute name.
 *
 * <p>
 * ASCII letters and <js>':'</js> are copied as-is at any position, ASCII digits at any position but the
 * first, and underscores unless they begin text already shaped like <c>_x####_</c>.
 * Every other character is written as a <c>_x####_</c> sequence.
 * A <jk>null</jk> object is written as <c>_x0000_</c>.
 *
 * @param w The writer to send the output to.
 * @param o The object being serialized.
 * @return The same writer passed in.
 * @throws IOException If a problem occurred.
 */
public static final Writer encodeAttrName(Writer w, Object o) throws IOException {
	if (o == null)
		return w.append("_x0000_");
	final String name = o.toString();

	// Fast path: plain alphanumeric name, nothing to encode.
	if (! needsAttrNameEncoding(name)) {
		w.append(name);
		return w;
	}

	final int length = name.length();
	for (int pos = 0; pos < length; pos++) {
		final char ch = name.charAt(pos);
		final boolean letterOrColon = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == ':';
		final boolean digitAfterFirst = pos != 0 && ch >= '0' && ch <= '9';
		final boolean plainUnderscore = ch == '_' && ! isEscapeSequence(name, pos);
		if (letterOrColon || digitAfterFirst || plainUnderscore)
			w.append(ch);
		else
			appendPaddedHexChar(w, ch);
	}
	return w;
}
// Cheap pre-check used before the attribute-name encoder runs.
// Returns false only when every character is an ASCII letter or digit and the first one is not a digit.
// Note that this doesn't need to be perfect, just fast.
private static final boolean needsAttrNameEncoding(String s) {
	final int n = s.length();
	for (int k = 0; k < n; k++) {
		final char ch = s.charAt(k);
		final boolean letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
		if (letter)
			continue;
		// Digits are acceptable anywhere except the first position.
		final boolean digit = ch >= '0' && ch <= '9';
		if (digit && k != 0)
			continue;
		return true;
	}
	return false;
}
//-----------------------------------------------------------------------------------------------------------------
// XML attribute values
//-----------------------------------------------------------------------------------------------------------------
/**
 * Encodes the specified attribute value and sends the results to the specified writer.
 *
 * <p>
 * A <jk>null</jk> object is written as <c>_x0000_</c>; an empty string writes nothing.
 * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities,
 * and tab, line-feed and carriage-return as numeric character references.
 * <br>Encodes invalid XML text characters, and underscores that begin text already shaped like
 * <c>_x####_</c>, to <c>_x####_</c> sequences.
 * <br>Whitespace in the first or last position (after any trimming) is always written as a <c>_x####_</c> sequence.
 *
 * @param w The writer to send the output to.
 * @param o The object being encoded.
 * @param trim
 * 	Trim the text before serializing it.
 * @return The same writer passed in.
 * @throws IOException Thrown from the writer.
 */
public static final Writer encodeAttrValue(Writer w, Object o, boolean trim) throws IOException {
	if (o == null)
		return w.append("_x0000_");
	String value = o.toString();
	if (value.isEmpty())
		return w;
	if (trim)
		value = value.trim();

	// Fast path: nothing to encode, write the value as-is.
	if (! needsAttrValueEncoding(value)) {
		w.append(value);
		return w;
	}

	final int last = value.length() - 1;
	for (int pos = 0; pos <= last; pos++) {
		final char ch = value.charAt(pos);
		final boolean atEdge = pos == 0 || pos == last;
		if (atEdge && Character.isWhitespace(ch))
			appendPaddedHexChar(w, ch);
		else if (REPLACE_ATTR_VAL.contains(ch))
			w.append(REPLACE_ATTR_VAL.get(ch));
		else if (ch == '_' && isEscapeSequence(value, pos))
			appendPaddedHexChar(w, ch);
		else if (! isValidXmlCharacter(ch))
			appendPaddedHexChar(w, ch);
		else
			w.append(ch);
	}
	return w;
}
// Pre-scan that decides whether an attribute value has to go through the character-by-character encoder.
// Conversion is somewhat expensive, so callers check this first.
// Returns true as soon as it sees whitespace in the first or last position, a character that has an entity
// replacement, a character that is not valid XML text, or an underscore starting a _x####_ look-alike.
private static final boolean needsAttrValueEncoding(String s) {
	final int last = s.length() - 1;
	for (int pos = 0; pos <= last; pos++) {
		final char ch = s.charAt(pos);
		final boolean edgeWhitespace = (pos == 0 || pos == last) && Character.isWhitespace(ch);
		if (edgeWhitespace || REPLACE_ATTR_VAL.contains(ch) || ! isValidXmlCharacter(ch))
			return true;
		if (ch == '_' && isEscapeSequence(s, pos))
			return true;
	}
	return false;
}
// Characters that the attribute-value encoder replaces with an XML entity or numeric character reference.
// Declared final: this is a shared lookup table that must never be reassigned after class initialization.
private static final AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
	.append('&', "&amp;")
	.append('<', "&lt;")
	.append('>', "&gt;")
	.append('"', "&quot;")
	.append('\'', "&apos;")
	.append((char)0x09, "&#x0009;")
	.append((char)0x0A, "&#x000a;")
	.append((char)0x0D, "&#x000d;");
//-----------------------------------------------------------------------------------------------------------------
// Decode XML text
//-----------------------------------------------------------------------------------------------------------------
/**
 * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
 *
 * <p>
 * Strings that are empty or contain no underscore are returned as-is without touching the scratch pad.
 *
 * @param s The string being decoded.
 * @param sb
 * 	The string builder to use as a scratch pad.
 * 	May be <jk>null</jk>, in which case a new one is created.
 * 	Any existing content is discarded before decoding.
 * @return
 * 	The decoded string, or <jk>null</jk> if the input is <jk>null</jk> or contains a <c>_x0000_</c> sequence.
 */
public static final String decode(String s, StringBuilder sb) {
	if (s == null) return null;
	if (s.length() == 0)
		return s;
	if (s.indexOf('_') == -1)
		return s;
	if (sb == null)
		sb = new StringBuilder(s.length());
	else
		sb.setLength(0);  // Reused scratch pad: drop leftovers so they don't leak into this result.
	for (int i = 0; i < s.length(); i++) {
		char c = s.charAt(i);
		if (c == '_' && isEscapeSequence(s,i)) {
			int x = Integer.parseInt(s.substring(i+2, i+6), 16);
			// If we find _x0000_, then that means a null.
			// If we find _xE000_, then that means an empty string.
			if (x == 0)
				return null;
			else if (x != 0xE000)
				sb.append((char)x);
			// Skip the rest of the 7-character sequence (the loop increment consumes the last one).
			i+=6;
		} else {
			sb.append(c);
		}
	}
	return sb.toString();
}
/**
 * Merges every run of adjacent {@link String} entries in the list into a single concatenated entry.
 *
 * <p>
 * Entries that are not strings are left in place and act as separators between runs.
 * The list is modified in place.
 *
 * @param l The list of text nodes to collapse.
 * @return The same list.
 */
public static LinkedList<Object> collapseTextNodes(LinkedList<Object> l) {
	// Text accumulated for the current run of adjacent strings, or null when not inside a run.
	String run = null;
	final ListIterator<Object> it = l.listIterator();
	while (it.hasNext()) {
		final Object node = it.next();
		if (! (node instanceof String)) {
			run = null;
			continue;
		}
		if (run == null) {
			run = node.toString();
			continue;
		}
		run += node;
		it.remove();    // Drop the string just read...
		it.previous();  // ...step back onto the entry holding the run so far...
		it.set(run);    // ...overwrite it with the merged text...
		it.next();      // ...and move the cursor back past it.
	}
	return l;
}
//-----------------------------------------------------------------------------------------------------------------
// Other methods
//-----------------------------------------------------------------------------------------------------------------
// Returns true if the specified character can safely be used in XML text or an attribute.
// Accepts 0x20-0xD7FF and 0xE000-0xFFFD. Everything below 0x20 (tab, line-feed and carriage-return included)
// and the 0xD800-0xDFFF range are reported as not valid.
private static final boolean isValidXmlCharacter(char c) {
	if (c < 0x20 || c > 0xFFFD)
		return false;
	return c <= 0xD7FF || c >= 0xE000;
}
// Returns true if the seven characters of the string starting at the specified position have the form
// "_x####_", where each '#' is a hexadecimal character.
private static final boolean isEscapeSequence(String s, int i) {
	// The sequence occupies positions i through i+6.
	if (s.length() <= i + 6)
		return false;
	if (s.charAt(i) != '_' || s.charAt(i + 1) != 'x' || s.charAt(i + 6) != '_')
		return false;
	for (int k = i + 2; k <= i + 5; k++)
		if (! isHexCharacter(s.charAt(k)))
			return false;
	return true;
}
// Returns true if the character is a hexadecimal character: '0'-'9' or upper-case 'A'-'F'.
// Lower-case 'a'-'f' are not accepted.
private static final boolean isHexCharacter(char c) {
	if (c >= '0' && c <= '9')
		return true;
	return c >= 'A' && c <= 'F';
}
// Writes the escape sequence for the specified character code to the writer: the literal "_x", then the
// characters produced by toHex4(num), then a closing '_'.
// Returns whatever the final append on the writer returns.
private static final Writer appendPaddedHexChar(Writer out, int num) throws IOException {
	out.append("_x");
	for (char hexDigit : toHex4(num)) {
		out.append(hexDigit);
	}
	return out.append('_');
}
/**
 * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
 *
 * <p>
 * The annotations should be a child-to-parent ordering of annotations found on a class or method.
 * All <ja>@Xml</ja> annotations are consulted first, in list order; the <ja>@XmlSchema</ja> annotations are
 * consulted only if none of them yields a namespace.
 *
 * @param xmls The list of <ja>@Xml</ja> annotations.
 * @param schemas The list of <ja>@XmlSchema</ja> annotations.
 * @return The namespace, or <jk>null</jk> if it couldn't be found.
 */
public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {
	for (Xml candidate : xmls) {
		final Namespace found = findNamespace(candidate.prefix(), candidate.namespace(), xmls, schemas);
		if (found != null) {
			return found;
		}
	}
	// When resolving from a schema annotation, only the schema list is searched for the missing half.
	for (XmlSchema candidate : schemas) {
		final Namespace found = findNamespace(candidate.prefix(), candidate.namespace(), null, schemas);
		if (found != null) {
			return found;
		}
	}
	return null;
}
// Resolves a single prefix/namespace-URI pair into a Namespace.
// - Both given: used directly.
// - Only the prefix given: the URI is looked up in the @Xml list (if any), then in each @XmlSchema and its
//   @XmlNs entries; a prefix with no matching URI is an error.
// - Only the URI given: the prefix is looked up the same way; null is returned if nothing matches.
// - Neither given: null.
private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {
	final boolean hasPrefix = ! prefix.isEmpty();
	final boolean hasNs = ! ns.isEmpty();

	if (hasPrefix && hasNs)
		return Namespace.create(prefix, ns);

	if (hasPrefix) {
		// Need to search for the namespace URI that goes with this prefix.
		if (xmls != null) {
			for (Xml other : xmls) {
				if (other.prefix().equals(prefix) && ! other.namespace().isEmpty())
					return Namespace.create(prefix, other.namespace());
			}
		}
		for (XmlSchema schema : schemas) {
			if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
				return Namespace.create(prefix, schema.namespace());
			for (XmlNs declared : schema.xmlNs()) {
				if (declared.prefix().equals(prefix))
					return Namespace.create(prefix, declared.namespaceURI());
			}
		}
		throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI. prefix='"+prefix+"'");
	}

	if (! hasNs)
		return null;

	// Need to search for the prefix that goes with this namespace URI.
	if (xmls != null) {
		for (Xml other : xmls) {
			if (other.namespace().equals(ns) && ! other.prefix().isEmpty())
				return Namespace.create(other.prefix(), ns);
		}
	}
	for (XmlSchema schema : schemas) {
		if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
			return Namespace.create(schema.prefix(), ns);
		for (XmlNs declared : schema.xmlNs()) {
			if (declared.namespaceURI().equals(ns))
				return Namespace.create(declared.prefix(), ns);
		}
	}
	return null;
}
/**
 * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
 *
 * <p>
 * Element events are rendered as <js>"&lt;name&gt;"</js> / <js>"&lt;/name&gt;"</js>, text-bearing events
 * (characters, comments, space, CDATA) include the text in brackets, and every other known event is rendered
 * as its constant name.
 *
 * @param r The XML stream reader whose current event is to be converted to a readable string.
 * @return The event in human-readable form, or <js>"UNKNOWN"</js> if the event type is not recognized.
 */
public static final String toReadableEvent(XMLStreamReader r) {
	// Named constants instead of the raw event numbers 1..15.
	switch (r.getEventType()) {
		case XMLStreamConstants.START_ELEMENT:
			return "<"+r.getLocalName()+">";
		case XMLStreamConstants.END_ELEMENT:
			return "</"+r.getLocalName()+">";
		case XMLStreamConstants.PROCESSING_INSTRUCTION:
			return "PROCESSING_INSTRUCTION";
		case XMLStreamConstants.CHARACTERS:
			return "CHARACTERS=[" + r.getText() + "]";
		case XMLStreamConstants.COMMENT:
			return "COMMENTS=[" + r.getText() + "]";
		case XMLStreamConstants.SPACE:
			return "SPACE=[" + r.getText() + "]";
		case XMLStreamConstants.START_DOCUMENT:
			return "START_DOCUMENT";
		case XMLStreamConstants.END_DOCUMENT:
			return "END_DOCUMENT";
		case XMLStreamConstants.ENTITY_REFERENCE:
			return "ENTITY_REFERENCE";
		case XMLStreamConstants.ATTRIBUTE:
			return "ATTRIBUTE";
		case XMLStreamConstants.DTD:
			return "DTD";
		case XMLStreamConstants.CDATA:
			return "CDATA=["+r.getText()+"]";
		case XMLStreamConstants.NAMESPACE:
			return "NAMESPACE";
		case XMLStreamConstants.NOTATION_DECLARATION:
			return "NOTATION_DECLARATION";
		case XMLStreamConstants.ENTITY_DECLARATION:
			return "ENTITY_DECLARATION";
		default:
			return "UNKNOWN";
	}
}
}