// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance *
// * with the License. You may obtain a copy of the License at *
// * *
// * http://www.apache.org/licenses/LICENSE-2.0 *
// * *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the *
// * specific language governing permissions and limitations under the License. *
// ***************************************************************************************************************************
package org.apache.juneau.xml; | |
import static org.apache.juneau.internal.StringUtils.*; | |
import java.io.*; | |
import java.util.*; | |
import javax.xml.stream.*; | |
import org.apache.juneau.*; | |
import org.apache.juneau.internal.*; | |
import org.apache.juneau.xml.annotation.*; | |
/** | |
* XML utility methods. | |
*/ | |
public final class XmlUtils { | |
//----------------------------------------------------------------------------------------------------------------- | |
// XML element names | |
//----------------------------------------------------------------------------------------------------------------- | |
/** | |
* Encodes any invalid XML element name characters to <c>_x####_</c> sequences. | |
* | |
* @param w The writer to send the output to. | |
* @param o The object being encoded. | |
* @return The same writer passed in. | |
* @throws IOException Throw by the writer. | |
*/ | |
public static final Writer encodeElementName(Writer w, Object o) throws IOException { | |
if (o == null) | |
return w.append("_x0000_"); | |
String s = o.toString(); | |
if (needsElementNameEncoding(s)) | |
return encodeElementNameInner(w, s); | |
w.append(s); | |
return w; | |
} | |
/** | |
* Encodes any invalid XML element name characters to <c>_x####_</c> sequences. | |
* | |
* @param o The object being encoded. | |
* @return The encoded element name string. | |
*/ | |
public static final String encodeElementName(Object o) { | |
if (o == null) | |
return "_x0000_"; | |
String s = o.toString(); | |
if (s.isEmpty()) | |
return "_xE000_"; | |
try { | |
if (needsElementNameEncoding(s)) | |
try (Writer w = new StringBuilderWriter(s.length() * 2)) { | |
return encodeElementNameInner(w, s).toString(); | |
} | |
} catch (IOException e) { | |
throw new RuntimeException(e); // Never happens | |
} | |
return s; | |
} | |
private static final Writer encodeElementNameInner(Writer w, String s) throws IOException { | |
for (int i = 0; i < s.length(); i++) { | |
char c = s.charAt(i); | |
if ((c >= 'A' && c <= 'Z') | |
|| (c == '_' && ! isEscapeSequence(s,i)) | |
|| (c >= 'a' && c <= 'z') | |
|| (i != 0 && ( | |
c == '-' | |
|| c == '.' | |
|| (c >= '0' && c <= '9') | |
|| c == '\u00b7' | |
|| (c >= '\u0300' && c <= '\u036f') | |
|| (c >= '\u203f' && c <= '\u2040') | |
)) | |
|| (c >= '\u00c0' && c <= '\u00d6') | |
|| (c >= '\u00d8' && c <= '\u00f6') | |
|| (c >= '\u00f8' && c <= '\u02ff') | |
|| (c >= '\u0370' && c <= '\u037d') | |
|| (c >= '\u037f' && c <= '\u1fff') | |
|| (c >= '\u200c' && c <= '\u200d') | |
|| (c >= '\u2070' && c <= '\u218f') | |
|| (c >= '\u2c00' && c <= '\u2fef') | |
|| (c >= '\u3001' && c <= '\ud7ff') | |
|| (c >= '\uf900' && c <= '\ufdcf') | |
|| (c >= '\ufdf0' && c <= '\ufffd')) { | |
w.append(c); | |
} else { | |
appendPaddedHexChar(w, c); | |
} | |
} | |
return w; | |
} | |
private static final boolean needsElementNameEncoding(String s) { | |
// Note that this doesn't need to be perfect, just fast. | |
for (int i = 0; i < s.length(); i++) { | |
char c = s.charAt(i); | |
if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')) | |
return true; | |
if (i == 0 && (c >= '0' && c <= '9')) | |
return true; | |
} | |
return false; | |
} | |
//----------------------------------------------------------------------------------------------------------------- | |
// XML element text | |
//----------------------------------------------------------------------------------------------------------------- | |
/** | |
* Escapes invalid XML text characters to <c>_x####_</c> sequences. | |
* | |
* @param o The object being encoded. | |
* @return The encoded string. | |
*/ | |
public static final String escapeText(Object o) { | |
if (o == null) | |
return "_x0000_"; | |
String s = o.toString(); | |
try { | |
if (! needsTextEncoding(s)) | |
return s; | |
final int len = s.length(); | |
StringWriter sw = new StringWriter(s.length()*2); | |
for (int i = 0; i < len; i++) { | |
char c = s.charAt(i); | |
if ((i == 0 || i == len-1) && Character.isWhitespace(c)) | |
appendPaddedHexChar(sw, c); | |
else if (c == '_' && isEscapeSequence(s,i)) | |
appendPaddedHexChar(sw, c); | |
else if (isValidXmlCharacter(c)) | |
sw.append(c); | |
else | |
appendPaddedHexChar(sw, c); | |
} | |
return sw.toString(); | |
} catch (IOException e) { | |
throw new RuntimeException(e); // Never happens | |
} | |
} | |
/** | |
* Encodes the specified element text and sends the results to the specified writer. | |
* | |
* <p> | |
* Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified | |
* writer. | |
* <br>Encodes <js>'&'</js>, <js>'<'</js>, and <js>'>'</js> as XML entities. | |
* <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. | |
* | |
* @param w The writer to send the output to. | |
* @param o The object being encoded. | |
* @param trim Trim the text before serializing it. | |
* @param preserveWhitespace | |
* Specifies whether we're in preserve-whitespace mode. | |
* (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}. | |
* If <jk>true</jk>, leading and trailing whitespace characters will be encoded. | |
* @return The same writer passed in. | |
* @throws IOException Thrown from the writer. | |
*/ | |
public static final Writer encodeText(Writer w, Object o, boolean trim, boolean preserveWhitespace) throws IOException { | |
if (o == null) | |
return w.append("_x0000_"); | |
String s = o.toString(); | |
if (s.isEmpty()) | |
return w.append("_xE000_"); | |
if (trim) | |
s = s.trim(); | |
if (needsTextEncoding(s)) { | |
final int len = s.length(); | |
for (int i = 0; i < len; i++) { | |
char c = s.charAt(i); | |
if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace) | |
appendPaddedHexChar(w, c); | |
else if (REPLACE_TEXT.contains(c)) | |
w.append(REPLACE_TEXT.get(c)); | |
else if (c == '_' && isEscapeSequence(s,i)) | |
appendPaddedHexChar(w, c); | |
else if (isValidXmlCharacter(c)) | |
w.append(c); | |
else | |
appendPaddedHexChar(w, c); | |
} | |
} else { | |
w.append(s); | |
} | |
return w; | |
} | |
private static final boolean needsTextEncoding(String s) { | |
// See if we need to convert the string. | |
// Conversion is somewhat expensive, so make sure we need to do so before hand. | |
final int len = s.length(); | |
for (int i = 0; i < len; i++) { | |
char c = s.charAt(i); | |
if ((i == 0 || i == len-1) && Character.isWhitespace(c)) | |
return true; | |
if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i))) | |
return true; | |
} | |
return false; | |
} | |
private static AsciiMap REPLACE_TEXT = new AsciiMap() | |
.append('&', "&") | |
.append('<', "<") | |
.append('>', ">") | |
.append((char)0x09, "	") | |
.append((char)0x0A, "
") | |
.append((char)0x0D, "
"); | |
//----------------------------------------------------------------------------------------------------------------- | |
// XML attribute names | |
//----------------------------------------------------------------------------------------------------------------- | |
/** | |
* Serializes and encodes the specified object as valid XML attribute name. | |
* | |
* @param w The writer to send the output to. | |
* @param o The object being serialized. | |
* @return This object (for method chaining). | |
* @throws IOException If a problem occurred. | |
*/ | |
public static final Writer encodeAttrName(Writer w, Object o) throws IOException { | |
if (o == null) | |
return w.append("_x0000_"); | |
String s = o.toString(); | |
if (needsAttrNameEncoding(s)) { | |
for (int i = 0; i < s.length(); i++) { | |
char c = s.charAt(i); | |
if (i == 0) { | |
if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':') | |
w.append(c); | |
else if (c == '_' && ! isEscapeSequence(s,i)) | |
w.append(c); | |
else | |
appendPaddedHexChar(w, c); | |
} else { | |
if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')) | |
w.append(c); | |
else if (c == '_' && ! isEscapeSequence(s,i)) | |
w.append(c); | |
else | |
appendPaddedHexChar(w, c); | |
} | |
} | |
} else { | |
w.append(s); | |
} | |
return w; | |
} | |
private static final boolean needsAttrNameEncoding(String s) { | |
// Note that this doesn't need to be perfect, just fast. | |
for (int i = 0; i < s.length(); i++) { | |
char c = s.charAt(i); | |
if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')) | |
return true; | |
if (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')) | |
return true; | |
} | |
return false; | |
} | |
//----------------------------------------------------------------------------------------------------------------- | |
// XML attribute values | |
//----------------------------------------------------------------------------------------------------------------- | |
/** | |
* Encodes the specified attribute value and sends the results to the specified writer. | |
* | |
* <p> | |
* Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified | |
* writer. | |
* <br>Encodes <js>'&'</js>, <js>'<'</js>, <js>'>'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities. | |
* <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. | |
* | |
* @param w The writer to send the output to. | |
* @param o The object being encoded. | |
* @param trim | |
* Trim the text before serializing it. | |
* If <jk>true</jk>, leading and trailing whitespace characters will be encoded. | |
* @return The same writer passed in. | |
* @throws IOException Thrown from the writer. | |
*/ | |
public static final Writer encodeAttrValue(Writer w, Object o, boolean trim) throws IOException { | |
if (o == null) | |
return w.append("_x0000_"); | |
String s = o.toString(); | |
if (s.isEmpty()) | |
return w; | |
if (trim) | |
s = s.trim(); | |
if (needsAttrValueEncoding(s)) { | |
final int len = s.length(); | |
for (int i = 0; i < len; i++) { | |
char c = s.charAt(i); | |
if ((i == 0 || i == len-1) && Character.isWhitespace(c)) | |
appendPaddedHexChar(w, c); | |
else if (REPLACE_ATTR_VAL.contains(c)) | |
w.append(REPLACE_ATTR_VAL.get(c)); | |
else if (c == '_' && isEscapeSequence(s,i)) | |
appendPaddedHexChar(w, c); | |
else if (isValidXmlCharacter(c)) | |
w.append(c); | |
else | |
appendPaddedHexChar(w, c); | |
} | |
} else { | |
w.append(s); | |
} | |
return w; | |
} | |
private static final boolean needsAttrValueEncoding(String s) { | |
// See if we need to convert the string. | |
// Conversion is somewhat expensive, so make sure we need to do so before hand. | |
final int len = s.length(); | |
for (int i = 0; i < len; i++) { | |
char c = s.charAt(i); | |
if ((i == 0 || i == len-1) && Character.isWhitespace(c)) | |
return true; | |
if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(s,i))) | |
return true; | |
} | |
return false; | |
} | |
private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap() | |
.append('&', "&") | |
.append('<', "<") | |
.append('>', ">") | |
.append('"', """) | |
.append('\'', "'") | |
.append((char)0x09, "	") | |
.append((char)0x0A, "
") | |
.append((char)0x0D, "
"); | |
//----------------------------------------------------------------------------------------------------------------- | |
// Decode XML text | |
//----------------------------------------------------------------------------------------------------------------- | |
/** | |
* Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters. | |
* | |
* @param s The string being decoded. | |
* @param sb The string builder to use as a scratch pad. | |
* @return The decoded string. | |
*/ | |
public static final String decode(String s, StringBuilder sb) { | |
if (s == null) return null; | |
if (s.length() == 0) | |
return s; | |
if (s.indexOf('_') == -1) | |
return s; | |
if (sb == null) | |
sb = new StringBuilder(s.length()); | |
for (int i = 0; i < s.length(); i++) { | |
char c = s.charAt(i); | |
if (c == '_' && isEscapeSequence(s,i)) { | |
int x = Integer.parseInt(s.substring(i+2, i+6), 16); | |
// If we find _x0000_, then that means a null. | |
// If we find _xE000_, then that means an empty string. | |
if (x == 0) | |
return null; | |
else if (x != 0xE000) | |
sb.append((char)x); | |
i+=6; | |
} else { | |
sb.append(c); | |
} | |
} | |
return sb.toString(); | |
} | |
/** | |
* Given a list of Strings and other Objects, combines Strings that are next to each other in the list. | |
* | |
* @param l The list of text nodes to collapse. | |
* @return The same list. | |
*/ | |
public static LinkedList<Object> collapseTextNodes(LinkedList<Object> l) { | |
String prev = null; | |
for (ListIterator<Object> i = l.listIterator(); i.hasNext();) { | |
Object o = i.next(); | |
if (o instanceof String) { | |
if (prev == null) | |
prev = o.toString(); | |
else { | |
prev += o; | |
i.remove(); | |
i.previous(); | |
i.remove(); | |
i.add(prev); | |
} | |
} else { | |
prev = null; | |
} | |
} | |
return l; | |
} | |
//----------------------------------------------------------------------------------------------------------------- | |
// Other methods | |
//----------------------------------------------------------------------------------------------------------------- | |
// Returns true if the specified character can safely be used in XML text or an attribute. | |
private static final boolean isValidXmlCharacter(char c) { | |
return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD); | |
} | |
// Returns true if the string at the specified position is of the form "_x####_" | |
// where '#' are hexadecimal characters. | |
private static final boolean isEscapeSequence(String s, int i) { | |
return s.length() > i+6 | |
&& s.charAt(i) == '_' | |
&& s.charAt(i+1) == 'x' | |
&& isHexCharacter(s.charAt(i+2)) | |
&& isHexCharacter(s.charAt(i+3)) | |
&& isHexCharacter(s.charAt(i+4)) | |
&& isHexCharacter(s.charAt(i+5)) | |
&& s.charAt(i+6) == '_'; | |
} | |
// Returns true if the character is a hexadecimal character | |
private static final boolean isHexCharacter(char c) { | |
return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'); | |
} | |
// Converts an integer to a hexadecimal string padded to 4 places. | |
private static final Writer appendPaddedHexChar(Writer out, int num) throws IOException { | |
out.append("_x"); | |
for (char c : toHex4(num)) | |
out.append(c); | |
return out.append('_'); | |
} | |
/** | |
* Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations. | |
* | |
* <p> | |
* The annotations should be a child-to-parent ordering of annotations found on a class or method. | |
* | |
* @param xmls The list of <ja>@Xml</ja> annotations. | |
* @param schemas The list of <ja>@XmlSchema</ja> annotations. | |
* @return The namespace, or <jk>null</jk> if it couldn't be found. | |
*/ | |
public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) { | |
for (Xml xml : xmls) { | |
Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas); | |
if (ns != null) | |
return ns; | |
} | |
for (XmlSchema schema : schemas) { | |
Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas); | |
if (ns != null) | |
return ns; | |
} | |
return null; | |
} | |
private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) { | |
// If both prefix and namespace specified, use that Namespace mapping. | |
if (! (prefix.isEmpty() || ns.isEmpty())) | |
return Namespace.create(prefix, ns); | |
// If only prefix specified, need to search for namespaceURI. | |
if (! prefix.isEmpty()) { | |
if (xmls != null) | |
for (Xml xml2 : xmls) | |
if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty()) | |
return Namespace.create(prefix, xml2.namespace()); | |
for (XmlSchema schema : schemas) { | |
if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty()) | |
return Namespace.create(prefix, schema.namespace()); | |
for (XmlNs xmlNs : schema.xmlNs()) | |
if (xmlNs.prefix().equals(prefix)) | |
return Namespace.create(prefix, xmlNs.namespaceURI()); | |
} | |
throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI. prefix='"+prefix+"'"); | |
} | |
// If only namespaceURI specified, need to search for prefix. | |
if (! ns.isEmpty()) { | |
if (xmls != null) | |
for (Xml xml2 : xmls) | |
if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty()) | |
return Namespace.create(xml2.prefix(), ns); | |
for (XmlSchema schema : schemas) { | |
if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty()) | |
return Namespace.create(schema.prefix(), ns); | |
for (XmlNs xmlNs : schema.xmlNs()) | |
if (xmlNs.namespaceURI().equals(ns)) | |
return Namespace.create(xmlNs.prefix(), ns); | |
} | |
} | |
return null; | |
} | |
/** | |
* Utility method that converts the current event on the XML stream to something human-readable for debug purposes. | |
* | |
* @param r The XML stream reader whose current event is to be converted to a readable string. | |
* @return The event in human-readable form. | |
*/ | |
public static final String toReadableEvent(XMLStreamReader r) { | |
int t = r.getEventType(); | |
if (t == 1) | |
return "<"+r.getLocalName()+">"; | |
if (t == 2) | |
return "</"+r.getLocalName()+">"; | |
if (t == 3) | |
return "PROCESSING_INSTRUCTION"; | |
if (t == 4) | |
return "CHARACTERS=[" + r.getText() + "]"; | |
if (t == 5) | |
return "COMMENTS=[" + r.getText() + "]"; | |
if (t == 6) | |
return "SPACE=[" + r.getText() + "]"; | |
if (t == 7) | |
return "START_DOCUMENT"; | |
if (t == 8) | |
return "END_DOCUMENT"; | |
if (t == 9) | |
return "ENTITY_REFERENCE"; | |
if (t == 10) | |
return "ATTRIBUTE"; | |
if (t == 11) | |
return "DTD"; | |
if (t == 12) | |
return "CDATA=["+r.getText()+"]"; | |
if (t == 13) | |
return "NAMESPACE"; | |
if (t == 14) | |
return "NOTATION_DECLARATION"; | |
if (t == 15) | |
return "ENTITY_DECLARATION"; | |
return "UNKNOWN"; | |
} | |
} |