TEXT-27: Adding StringEscapeUtils from commons-lang:3.5
diff --git a/pom.xml b/pom.xml
index 0c3a10a..9113d23 100644
--- a/pom.xml
+++ b/pom.xml
@@ -101,6 +101,12 @@
<version>1.4</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.5</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<distributionManagement>
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index e60b364..1274672 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
<body>
<release version="1.0" date="tba" description="tba">
+ <action issue="TEXT-27" type="add" dev="chtompki">Move org.apache.commons.lang3.StringEscapeUtils.java into text</action>
<action issue="TEXT-23" type="add" dev="chtompki">Moving from commons-lang, the package org.apache.commons.lang3.text</action>
<action issue="TEXT-10" type="add" dev="kinow" due-to="Don Jeba">A more complex Levenshtein distance</action>
<action issue="TEXT-24" type="add" dev="chtompki">Add coveralls and Travis.ci integration</action>
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
new file mode 100644
index 0000000..6b88275
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -0,0 +1,811 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.lang3.CharUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.commons.text.translate.AggregateTranslator;
+import org.apache.commons.text.translate.CharSequenceTranslator;
+import org.apache.commons.text.translate.EntityArrays;
+import org.apache.commons.text.translate.JavaUnicodeEscaper;
+import org.apache.commons.text.translate.LookupTranslator;
+import org.apache.commons.text.translate.NumericEntityEscaper;
+import org.apache.commons.text.translate.NumericEntityUnescaper;
+import org.apache.commons.text.translate.OctalUnescaper;
+import org.apache.commons.text.translate.UnicodeUnescaper;
+import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover;
+
+/**
+ * <p>Escapes and unescapes {@code String}s for
+ * Java, JavaScript, JSON, HTML, XML and CSV.</p>
+ *
+ * <p>#ThreadSafe#</p>
+ *
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.5.
+ * </p>
+ */
+public class StringEscapeUtils {
+
+ /* ESCAPE TRANSLATORS */
+
+ /**
+ * Translator object for escaping Java.
+ *
+ * <p>While {@link #escapeJava(String)} is the expected method of use, this
+ * object allows the Java escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator ESCAPE_JAVA =
+ new LookupTranslator(
+ new String[][] {
+ {"\"", "\\\""}, // double quote
+ {"\\", "\\\\"}, // backslash
+ }).with(
+ new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
+ ).with(
+ JavaUnicodeEscaper.outsideOf(32, 0x7f) // code points outside the range 32..0x7f
+ );
+
+ /**
+ * Translator object for escaping EcmaScript/JavaScript.
+ *
+ * <p>While {@link #escapeEcmaScript(String)} is the expected method of use, this
+ * object allows the EcmaScript escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator ESCAPE_ECMASCRIPT =
+ new AggregateTranslator(
+ new LookupTranslator(
+ new String[][] {
+ {"'", "\\'"}, // single quote
+ {"\"", "\\\""}, // double quote
+ {"\\", "\\\\"}, // backslash
+ {"/", "\\/"} // forward slash
+ }),
+ new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
+ JavaUnicodeEscaper.outsideOf(32, 0x7f)
+ );
+
+ /**
+ * Translator object for escaping Json.
+ *
+ * <p>While {@link #escapeJson(String)} is the expected method of use, this
+ * object allows the Json escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.2
+ */
+ public static final CharSequenceTranslator ESCAPE_JSON =
+ new AggregateTranslator(
+ new LookupTranslator(
+ new String[][] {
+ {"\"", "\\\""}, // double quote
+ {"\\", "\\\\"}, // backslash
+ {"/", "\\/"} // forward slash; unlike ESCAPE_ECMASCRIPT, the single quote is not listed
+ }),
+ new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
+ JavaUnicodeEscaper.outsideOf(32, 0x7f)
+ );
+
+ /**
+ * Translator object for escaping XML.
+ *
+ * <p>While {@link #escapeXml(String)} is the expected method of use, this
+ * object allows the XML escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ * @deprecated use {@link #ESCAPE_XML10} or {@link #ESCAPE_XML11} instead.
+ */
+ @Deprecated
+ public static final CharSequenceTranslator ESCAPE_XML =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+ new LookupTranslator(EntityArrays.APOS_ESCAPE())
+ );
+
+ /**
+ * Translator object for escaping XML 1.0.
+ *
+ * <p>While {@link #escapeXml10(String)} is the expected method of use, this
+ * object allows the XML escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.3
+ */
+ public static final CharSequenceTranslator ESCAPE_XML10 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+ new LookupTranslator(EntityArrays.APOS_ESCAPE()),
+ new LookupTranslator(
+ new String[][] {
+ { "\u0000", StringUtils.EMPTY }, // every character in this table maps to StringUtils.EMPTY
+ { "\u0001", StringUtils.EMPTY },
+ { "\u0002", StringUtils.EMPTY },
+ { "\u0003", StringUtils.EMPTY },
+ { "\u0004", StringUtils.EMPTY },
+ { "\u0005", StringUtils.EMPTY },
+ { "\u0006", StringUtils.EMPTY },
+ { "\u0007", StringUtils.EMPTY },
+ { "\u0008", StringUtils.EMPTY }, // U+0009 and U+000A are not listed
+ { "\u000b", StringUtils.EMPTY },
+ { "\u000c", StringUtils.EMPTY }, // U+000D is not listed
+ { "\u000e", StringUtils.EMPTY },
+ { "\u000f", StringUtils.EMPTY },
+ { "\u0010", StringUtils.EMPTY },
+ { "\u0011", StringUtils.EMPTY },
+ { "\u0012", StringUtils.EMPTY },
+ { "\u0013", StringUtils.EMPTY },
+ { "\u0014", StringUtils.EMPTY },
+ { "\u0015", StringUtils.EMPTY },
+ { "\u0016", StringUtils.EMPTY },
+ { "\u0017", StringUtils.EMPTY },
+ { "\u0018", StringUtils.EMPTY },
+ { "\u0019", StringUtils.EMPTY },
+ { "\u001a", StringUtils.EMPTY },
+ { "\u001b", StringUtils.EMPTY },
+ { "\u001c", StringUtils.EMPTY },
+ { "\u001d", StringUtils.EMPTY },
+ { "\u001e", StringUtils.EMPTY },
+ { "\u001f", StringUtils.EMPTY },
+ { "\ufffe", StringUtils.EMPTY },
+ { "\uffff", StringUtils.EMPTY }
+ }),
+ NumericEntityEscaper.between(0x7f, 0x84),
+ NumericEntityEscaper.between(0x86, 0x9f),
+ new UnicodeUnpairedSurrogateRemover()
+ );
+
+    /**
+     * Translator object for escaping XML 1.1.
+     *
+     * <p>While {@link #escapeXml11(String)} is the expected method of use, this
+     * object allows the XML escaping functionality to be used as the foundation
+     * for a custom translator. U+000B and U+000C are escaped, not removed.</p>
+     *
+     * @since 3.3
+     */
+    public static final CharSequenceTranslator ESCAPE_XML11 =
+        new AggregateTranslator(
+            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+            new LookupTranslator(EntityArrays.APOS_ESCAPE()),
+            new LookupTranslator(
+                    new String[][] {
+                            { "\u0000", StringUtils.EMPTY },
+                            { "\u000b", "&#11;" },
+                            { "\u000c", "&#12;" },
+                            { "\ufffe", StringUtils.EMPTY },
+                            { "\uffff", StringUtils.EMPTY }
+                    }),
+            NumericEntityEscaper.between(0x1, 0x8),
+            NumericEntityEscaper.between(0xe, 0x1f),
+            NumericEntityEscaper.between(0x7f, 0x84),
+            NumericEntityEscaper.between(0x86, 0x9f),
+            new UnicodeUnpairedSurrogateRemover()
+        );
+
+ /**
+ * Translator object for escaping HTML version 3.0.
+ *
+ * <p>While {@link #escapeHtml3(String)} is the expected method of use, this
+ * object allows the HTML escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator ESCAPE_HTML3 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+ new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
+ );
+
+ /**
+ * Translator object for escaping HTML version 4.0.
+ *
+ * <p>While {@link #escapeHtml4(String)} is the expected method of use, this
+ * object allows the HTML escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator ESCAPE_HTML4 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
+ new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
+ new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
+ );
+
+ /**
+ * Translator object for escaping individual Comma Separated Values.
+ *
+ * <p>While {@link #escapeCsv(String)} is the expected method of use, this
+ * object allows the CSV escaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
+
+ // TODO: Create a parent class - 'SinglePassTranslator' ?
+ // It would handle the index checking + length returning,
+ // and could also have an optimization check method.
+    /** Escapes one CSV column: wraps it in double quotes, doubling embedded quotes, only when needed. */
+    static class CsvEscaper extends CharSequenceTranslator {
+        private static final char CSV_DELIMITER = ',';
+        private static final char CSV_QUOTE = '"';
+        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
+        private static final char[] CSV_SEARCH_CHARS =
+            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
+        /** Translates the whole input in one call; {@code index} must be 0. Returns the input's code point count. */
+        @Override
+        public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
+            if (index != 0) {
+                throw new IllegalStateException("CsvEscaper should never reach the [1] index");
+            }
+            final String column = input.toString();
+            if (!StringUtils.containsNone(column, CSV_SEARCH_CHARS)) {
+                // a comma, quote, CR or LF is present: enclose in quotes and double every embedded quote
+                out.write(CSV_QUOTE);
+                out.write(StringUtils.replace(column, CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
+                out.write(CSV_QUOTE);
+            } else {
+                out.write(column);
+            }
+            return Character.codePointCount(input, 0, input.length());
+        }
+    }
+
+ /* UNESCAPE TRANSLATORS */
+
+ /**
+ * Translator object for unescaping escaped Java.
+ *
+ * <p>While {@link #unescapeJava(String)} is the expected method of use, this
+ * object allows the Java unescaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
+ public static final CharSequenceTranslator UNESCAPE_JAVA =
+ new AggregateTranslator(
+ new OctalUnescaper(), // .between('\1', '\377'),
+ new UnicodeUnescaper(),
+ new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
+ new LookupTranslator(
+ new String[][] {
+ {"\\\\", "\\"}, // escaped backslash
+ {"\\\"", "\""}, // escaped double quote
+ {"\\'", "'"}, // escaped single quote
+ {"\\", ""} // a lone backslash maps to the empty string
+ })
+ );
+
+ /**
+ * Translator object for unescaping escaped EcmaScript.
+ *
+ * <p>While {@link #unescapeEcmaScript(String)} is the expected method of use, this
+ * object allows the EcmaScript unescaping functionality to be used as the foundation
+ * for a custom translator. It is the same instance as {@link #UNESCAPE_JAVA}.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
+
+ /**
+ * Translator object for unescaping escaped Json.
+ *
+ * <p>While {@link #unescapeJson(String)} is the expected method of use, this
+ * object allows the Json unescaping functionality to be used as the foundation
+ * for a custom translator. It is the same instance as {@link #UNESCAPE_JAVA}.</p>
+ *
+ * @since 3.2
+ */
+ public static final CharSequenceTranslator UNESCAPE_JSON = UNESCAPE_JAVA;
+
+ /**
+ * Translator object for unescaping escaped HTML 3.0.
+ *
+ * <p>While {@link #unescapeHtml3(String)} is the expected method of use, this
+ * object allows the HTML unescaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator UNESCAPE_HTML3 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
+ new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
+ new NumericEntityUnescaper()
+ );
+
+ /**
+ * Translator object for unescaping escaped HTML 4.0.
+ *
+ * <p>While {@link #unescapeHtml4(String)} is the expected method of use, this
+ * object allows the HTML unescaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator UNESCAPE_HTML4 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
+ new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
+ new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
+ new NumericEntityUnescaper()
+ );
+
+ /**
+ * Translator object for unescaping escaped XML.
+ *
+ * <p>While {@link #unescapeXml(String)} is the expected method of use, this
+ * object allows the XML unescaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator UNESCAPE_XML =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
+ new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
+ new NumericEntityUnescaper()
+ );
+
+ /**
+ * Translator object for unescaping escaped Comma Separated Value entries.
+ *
+ * <p>While {@link #unescapeCsv(String)} is the expected method of use, this
+ * object allows the CSV unescaping functionality to be used
+ * as the foundation for a custom translator.</p>
+ *
+ * @since 3.0
+ */
+ public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
+
+    /** Unescapes one CSV column: strips enclosing quotes and collapses doubled quotes when quoting was needed. */
+    static class CsvUnescaper extends CharSequenceTranslator {
+        private static final char CSV_DELIMITER = ',';
+        private static final char CSV_QUOTE = '"';
+        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
+        private static final char[] CSV_SEARCH_CHARS =
+            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
+        /** Translates the whole input in one call; {@code index} must be 0. Returns the input's code point count. */
+        @Override
+        public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
+            if (index != 0) {
+                throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
+            }
+            final int length = input.length();
+            final int consumed = Character.codePointCount(input, 0, length);
+            // A column needs at least two characters to be enclosed in quotes. Checking the length
+            // first keeps a lone '"' (or an empty input) from failing in charAt/subSequence below.
+            if (length < 2 || input.charAt(0) != CSV_QUOTE || input.charAt(length - 1) != CSV_QUOTE) {
+                out.write(input.toString());
+                return consumed;
+            }
+            // strip the enclosing quotes
+            final String quoteless = input.subSequence(1, length - 1).toString();
+            if (StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS)) {
+                // the quotes were required: drop them and collapse each escaped quote ("") to one
+                out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
+            } else {
+                out.write(input.toString()); // quoting was not required, so the column is left unchanged
+            }
+            return consumed;
+        }
+    }
+
+ /* Helper functions */
+
+ /**
+ * <p>{@code StringEscapeUtils} instances should NOT be constructed in
+ * standard programming.</p>
+ *
+ * <p>Instead, the static methods should be called directly, for example:</p>
+ * <pre>StringEscapeUtils.escapeJava("foo");</pre>
+ *
+ * <p>This constructor is public to permit tools that require a JavaBean
+ * instance to operate.</p>
+ */
+ public StringEscapeUtils() {
+ super();
+ }
+
+ // Java and JavaScript
+ //--------------------------------------------------------------------------
+ /**
+ * <p>Escapes the characters in a {@code String} using Java String rules, via {@link #ESCAPE_JAVA}.</p>
+ *
+ * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.).</p>
+ *
+ * <p>So a tab becomes the characters {@code '\\'} and
+ * {@code 't'}.</p>
+ *
+ * <p>The only difference between Java strings and JavaScript strings is that in JavaScript
+ * (see {@link #escapeEcmaScript(String)}), a single quote and forward-slash (/) are escaped.</p>
+ *
+ * <p>Example:</p>
+ * <pre>
+ * input string: He didn't say, "Stop!"
+ * output string: He didn't say, \"Stop!\"
+ * </pre>
+ *
+ * @param input String to escape values in, may be null
+ * @return String with escaped values, {@code null} if null string input
+ */
+ public static final String escapeJava(final String input) {
+ return ESCAPE_JAVA.translate(input);
+ }
+
+ /**
+ * <p>Escapes the characters in a {@code String} using EcmaScript String rules,
+ * via {@link #ESCAPE_ECMASCRIPT}.</p>
+ * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.).</p>
+ *
+ * <p>So a tab becomes the characters {@code '\\'} and
+ * {@code 't'}.</p>
+ *
+ * <p>The only difference between Java strings and EcmaScript strings
+ * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
+ *
+ * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects.</p>
+ *
+ * <p>Example:</p>
+ * <pre>
+ * input string: He didn't say, "Stop!"
+ * output string: He didn\'t say, \"Stop!\"
+ * </pre>
+ *
+ * @param input String to escape values in, may be null
+ * @return String with escaped values, {@code null} if null string input
+ *
+ * @since 3.0
+ */
+ public static final String escapeEcmaScript(final String input) {
+ return ESCAPE_ECMASCRIPT.translate(input);
+ }
+
+ /**
+ * <p>Escapes the characters in a {@code String} using Json String rules,
+ * via {@link #ESCAPE_JSON}.</p>
+ * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.).</p>
+ *
+ * <p>So a tab becomes the characters {@code '\\'} and
+ * {@code 't'}.</p>
+ *
+ * <p>The only difference between Java strings and Json strings
+ * is that in Json, forward-slash (/) is escaped.</p>
+ *
+ * <p>See <a href="http://www.ietf.org/rfc/rfc4627.txt">RFC 4627</a> for further details.</p>
+ *
+ * <p>Example:</p>
+ * <pre>
+ * input string: He didn't say, "Stop!"
+ * output string: He didn't say, \"Stop!\"
+ * </pre>
+ *
+ * @param input String to escape values in, may be null
+ * @return String with escaped values, {@code null} if null string input
+ *
+ * @since 3.2
+ */
+ public static final String escapeJson(final String input) {
+ return ESCAPE_JSON.translate(input);
+ }
+
+ /**
+ * <p>Unescapes any Java literals found in the {@code String}, via {@link #UNESCAPE_JAVA}.
+ * For example, it will turn a sequence of {@code '\'} and
+ * {@code 'n'} into a newline character, unless the {@code '\'}
+ * is preceded by another {@code '\'}.</p>
+ *
+ * @param input the {@code String} to unescape, may be null
+ * @return a new unescaped {@code String}, {@code null} if null string input
+ */
+ public static final String unescapeJava(final String input) {
+ return UNESCAPE_JAVA.translate(input);
+ }
+
+ /**
+ * <p>Unescapes any EcmaScript literals found in the {@code String}, via {@link #UNESCAPE_ECMASCRIPT}.</p>
+ *
+ * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
+ * into a newline character, unless the {@code '\'} is preceded by another
+ * {@code '\'}.</p>
+ *
+ * @see #unescapeJava(String)
+ * @param input the {@code String} to unescape, may be null
+ * @return A new unescaped {@code String}, {@code null} if null string input
+ *
+ * @since 3.0
+ */
+ public static final String unescapeEcmaScript(final String input) {
+ return UNESCAPE_ECMASCRIPT.translate(input);
+ }
+
+ /**
+ * <p>Unescapes any Json literals found in the {@code String}, via {@link #UNESCAPE_JSON}.</p>
+ *
+ * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
+ * into a newline character, unless the {@code '\'} is preceded by another
+ * {@code '\'}.</p>
+ *
+ * @see #unescapeJava(String)
+ * @param input the {@code String} to unescape, may be null
+ * @return A new unescaped {@code String}, {@code null} if null string input
+ *
+ * @since 3.2
+ */
+ public static final String unescapeJson(final String input) {
+ return UNESCAPE_JSON.translate(input);
+ }
+
+ // HTML and XML
+ //--------------------------------------------------------------------------
+ /**
+ * <p>Escapes the characters in a {@code String} using HTML entities, via {@link #ESCAPE_HTML4}.</p>
+ *
+ * <p>
+ * For example:
+ * </p>
+ * <p><code>"bread" &amp; "butter"</code></p>
+ * becomes:
+ * <p>
+ * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
+ * </p>
+ *
+ * <p>Supports all known HTML 4.0 entities, including funky accents.
+ * Note that the commonly used apostrophe escape character (&amp;apos;)
+ * is not a legal entity and so is not supported.</p>
+ *
+ * @param input the {@code String} to escape, may be null
+ * @return a new escaped {@code String}, {@code null} if null string input
+ *
+ * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
+ * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
+ * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
+ *
+ * @since 3.0
+ */
+ public static final String escapeHtml4(final String input) {
+ return ESCAPE_HTML4.translate(input);
+ }
+
+ /**
+ * <p>Escapes the characters in a {@code String} using HTML entities.</p>
+ * <p>Supports only the HTML 3.0 entities; delegates to {@link #ESCAPE_HTML3}.</p>
+ *
+ * @param input the {@code String} to escape, may be null
+ * @return a new escaped {@code String}, {@code null} if null string input
+ *
+ * @since 3.0
+ */
+ public static final String escapeHtml3(final String input) {
+ return ESCAPE_HTML3.translate(input);
+ }
+
+ //-----------------------------------------------------------------------
+ /**
+ * <p>Unescapes a string containing entity escapes to a string
+ * containing the actual Unicode characters corresponding to the
+ * escapes. Supports HTML 4.0 entities.</p>
+ *
+ * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"}
+ * will become {@code "<Français>"}</p>
+ *
+ * <p>If an entity is unrecognized, it is left alone, and inserted
+ * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will
+ * become {@code ">&zzzz;x"}.</p>
+ *
+ * @param input the {@code String} to unescape, may be null
+ * @return a new unescaped {@code String}, {@code null} if null string input
+ *
+ * @since 3.0
+ */
+ public static final String unescapeHtml4(final String input) {
+ return UNESCAPE_HTML4.translate(input);
+ }
+
+ /**
+ * <p>Unescapes a string containing entity escapes to a string
+ * containing the actual Unicode characters corresponding to the
+ * escapes. Supports only HTML 3.0 entities; delegates to {@link #UNESCAPE_HTML3}.</p>
+ *
+ * @param input the {@code String} to unescape, may be null
+ * @return a new unescaped {@code String}, {@code null} if null string input
+ *
+ * @since 3.0
+ */
+ public static final String unescapeHtml3(final String input) {
+ return UNESCAPE_HTML3.translate(input);
+ }
+
+ //-----------------------------------------------------------------------
+ /**
+ * <p>Escapes the characters in a {@code String} using XML entities.</p>
+ *
+ * <p>For example: {@code "bread" & "butter"} =&gt;
+ * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
+ * </p>
+ *
+ * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
+ * Does not support DTDs or external entities.</p>
+ *
+ * <p>Note that Unicode characters greater than 0x7f are as of 3.0, no longer
+ * escaped. If you still wish this functionality, you can achieve it
+ * via the following:
+ * {@code StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );}</p>
+ *
+ * @param input the {@code String} to escape, may be null
+ * @return a new escaped {@code String}, {@code null} if null string input
+ * @see #unescapeXml(java.lang.String)
+ * @deprecated use {@link #escapeXml10(java.lang.String)} or {@link #escapeXml11(java.lang.String)} instead.
+ */
+ @Deprecated
+ public static final String escapeXml(final String input) {
+ return ESCAPE_XML.translate(input);
+ }
+
+ /**
+ * <p>Escapes the characters in a {@code String} using XML entities.</p>
+ *
+ * <p>For example: {@code "bread" & "butter"} =&gt;
+ * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
+ * </p>
+ *
+ * <p>Note that XML 1.0 is a text-only format: it cannot represent control
+ * characters or unpaired Unicode surrogate codepoints, even after escaping.
+ * {@code escapeXml10} will remove characters that do not fit in the
+ * following ranges:</p>
+ *
+ * <p>{@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
+ *
+ * <p>Though not strictly necessary, {@code escapeXml10} will escape
+ * characters in the following ranges:</p>
+ *
+ * <p>{@code [#x7F-#x84] | [#x86-#x9F]}</p>
+ *
+ * <p>The returned string can be inserted into a valid XML 1.0 or XML 1.1
+ * document. If you want to allow more non-text characters in an XML 1.1
+ * document, use {@link #escapeXml11(String)}.</p>
+ *
+ * @param input the {@code String} to escape, may be null
+ * @return a new escaped {@code String}, {@code null} if null string input
+ * @see #unescapeXml(java.lang.String)
+ * @since 3.3
+ */
+ public static String escapeXml10(final String input) {
+ return ESCAPE_XML10.translate(input);
+ }
+
+ /**
+ * <p>Escapes the characters in a {@code String} using XML entities.</p>
+ *
+ * <p>For example: {@code "bread" & "butter"} =&gt;
+ * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
+ * </p>
+ *
+ * <p>XML 1.1 can represent certain control characters, but it cannot represent
+ * the null byte or unpaired Unicode surrogate codepoints, even after escaping.
+ * {@code escapeXml11} will remove characters that do not fit in the following
+ * ranges:</p>
+ *
+ * <p>{@code [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}</p>
+ *
+ * <p>{@code escapeXml11} will escape characters in the following ranges:</p>
+ *
+ * <p>{@code [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]}</p>
+ *
+ * <p>The returned string can be inserted into a valid XML 1.1 document. Do not
+ * use it for XML 1.0 documents.</p>
+ *
+ * @param input the {@code String} to escape, may be null
+ * @return a new escaped {@code String}, {@code null} if null string input
+ * @see #unescapeXml(java.lang.String)
+ * @since 3.3
+ */
+ public static String escapeXml11(final String input) {
+ return ESCAPE_XML11.translate(input);
+ }
+
+ //-----------------------------------------------------------------------
+ /**
+ * <p>Unescapes a string containing XML entity escapes to a string
+ * containing the actual Unicode characters corresponding to the
+ * escapes. Delegates to {@link #UNESCAPE_XML}.</p>
+ *
+ * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
+ * Does not support DTDs or external entities.</p>
+ *
+ * <p>Note that numerical \\u Unicode codes are unescaped to their respective
+ * Unicode characters. This may change in future releases. </p>
+ *
+ * @param input the {@code String} to unescape, may be null
+ * @return a new unescaped {@code String}, {@code null} if null string input
+ * @see #escapeXml(String)
+ * @see #escapeXml10(String)
+ * @see #escapeXml11(String)
+ */
+ public static final String unescapeXml(final String input) {
+ return UNESCAPE_XML.translate(input);
+ }
+
+ //-----------------------------------------------------------------------
+
+ /**
+ * <p>Returns a {@code String} value for a CSV column enclosed in double quotes,
+ * if required. Delegates to {@link #ESCAPE_CSV}.</p>
+ *
+ * <p>If the value contains a comma, newline or double quote, then the
+ * String value is returned enclosed in double quotes.</p>
+ *
+ * <p>Any double quote characters in the value are escaped with another double quote.</p>
+ *
+ * <p>If the value does not contain a comma, newline or double quote, then the
+ * String value is returned unchanged.</p>
+ *
+ * <p>See <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
+ * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p>
+ *
+ * @param input the input CSV column String, may be null
+ * @return the input String, enclosed in double quotes if the value contains a comma,
+ * newline or double quote, {@code null} if null string input
+ * @since 2.4
+ */
+ public static final String escapeCsv(final String input) {
+ return ESCAPE_CSV.translate(input);
+ }
+
+ /**
+ * <p>Returns a {@code String} value for an unescaped CSV column, via {@link #UNESCAPE_CSV}.</p>
+ *
+ * <p>If the value is enclosed in double quotes, and contains a comma, newline
+ * or double quote, then quotes are removed.
+ * </p>
+ *
+ * <p>Any double quote escaped characters (a pair of double quotes) are unescaped
+ * to just one double quote.</p>
+ *
+ * <p>If the value is not enclosed in double quotes, or is and does not contain a
+ * comma, newline or double quote, then the String value is returned unchanged.</p>
+ *
+ * <p>See <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
+ * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p>
+ *
+ * @param input the input CSV column String, may be null
+ * @return the input String, with enclosing double quotes removed and embedded double
+ * quotes unescaped, {@code null} if null string input
+ * @since 2.4
+ */
+ public static final String unescapeCsv(final String input) {
+ return UNESCAPE_CSV.translate(input);
+ }
+
+}
\ No newline at end of file
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
new file mode 100644
index 0000000..c86f769
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -0,0 +1,621 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Modifier;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.text.translate.CharSequenceTranslator;
+import org.apache.commons.text.translate.NumericEntityEscaper;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link StringEscapeUtils}.
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.5.
+ * </p>
+ *
+ */
+public class StringEscapeUtilsTest {
+ private final static String FOO = "foo";
+
+ @Test
+ public void testConstructor() {
+ assertNotNull(new StringEscapeUtils());
+ final Constructor<?>[] cons = StringEscapeUtils.class.getDeclaredConstructors();
+ assertEquals(1, cons.length);
+ assertTrue(Modifier.isPublic(cons[0].getModifiers()));
+ assertTrue(Modifier.isPublic(StringEscapeUtils.class.getModifiers()));
+ assertFalse(Modifier.isFinal(StringEscapeUtils.class.getModifiers()));
+ }
+
+ @Test
+ public void testEscapeJava() throws IOException {
+ assertEquals(null, StringEscapeUtils.escapeJava(null));
+ try {
+ StringEscapeUtils.ESCAPE_JAVA.translate(null, null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+ try {
+ StringEscapeUtils.ESCAPE_JAVA.translate("", null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+
+ assertEscapeJava("empty string", "", "");
+ assertEscapeJava(FOO, FOO);
+ assertEscapeJava("tab", "\\t", "\t");
+ assertEscapeJava("backslash", "\\\\", "\\");
+ assertEscapeJava("single quote should not be escaped", "'", "'");
+ assertEscapeJava("\\\\\\b\\t\\r", "\\\b\t\r");
+ assertEscapeJava("\\u1234", "\u1234");
+ assertEscapeJava("\\u0234", "\u0234");
+ assertEscapeJava("\\u00EF", "\u00ef");
+ assertEscapeJava("\\u0001", "\u0001");
+ assertEscapeJava("Should use capitalized Unicode hex", "\\uABCD", "\uabcd");
+
+ assertEscapeJava("He didn't say, \\\"stop!\\\"",
+ "He didn't say, \"stop!\"");
+ assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00A0",
+ "This space is non-breaking:\u00a0");
+ assertEscapeJava("\\uABCD\\u1234\\u012C",
+ "\uABCD\u1234\u012C");
+ }
+
+    /**
+     * Tests that escapeJava leaves '/' untouched; see https://issues.apache.org/jira/browse/LANG-421
+     */
+    @Test
+    public void testEscapeJavaWithSlash() {
+        final String input = "String with a slash (/) in it";
+
+        final String expected = input;
+        final String actual = StringEscapeUtils.escapeJava(input);
+
+        /*
+         * In 2.4 StringEscapeUtils.escapeJava(String) escaped '/' characters, which are not a valid character to
+         * escape in a Java string, so the input is expected to come back unchanged.
+         */
+        assertEquals(expected, actual);
+    }
+
+ private void assertEscapeJava(final String escaped, final String original) throws IOException {
+ assertEscapeJava(null, escaped, original);
+ }
+
+    private void assertEscapeJava(String message, final String expected, final String original) throws IOException {
+        final String detail = message == null ? "" : (": " + message);
+        final String failure = "escapeJava(String) failed" + detail;
+        assertEquals(failure, expected, StringEscapeUtils.escapeJava(original));
+        // The Writer-based translate call is checked against the same expected text.
+        final StringWriter sink = new StringWriter();
+        StringEscapeUtils.ESCAPE_JAVA.translate(original, sink);
+        assertEquals(expected, sink.toString());
+    }
+
+    @Test
+    public void testUnescapeJava() throws IOException {
+        assertEquals(null, StringEscapeUtils.unescapeJava(null));
+        try {
+            StringEscapeUtils.UNESCAPE_JAVA.translate(null, null); // null Writer: only IllegalArgumentException is accepted
+            fail();
+        } catch (final IOException ex) {
+            fail();
+        } catch (final IllegalArgumentException ex) { // expected
+        }
+        try {
+            StringEscapeUtils.UNESCAPE_JAVA.translate("", null); // null Writer with non-null input: same expectation
+            fail();
+        } catch (final IOException ex) {
+            fail();
+        } catch (final IllegalArgumentException ex) { // expected
+        }
+        try {
+            StringEscapeUtils.unescapeJava("\\u02-3"); // malformed Unicode escape must be rejected
+            fail();
+        } catch (final RuntimeException ex) { // expected
+        }
+
+        assertUnescapeJava("", "");
+        assertUnescapeJava("test", "test");
+        assertUnescapeJava("\ntest\b", "\\ntest\\b");
+        assertUnescapeJava("\u123425foo\ntest\b", "\\u123425foo\\ntest\\b");
+        assertUnescapeJava("'\foo\teste\r", "\\'\\foo\\teste\\r");
+        assertUnescapeJava("", "\\");
+        // Unicode escapes: lowercase and uppercase hex digits, followed by text and as the final character
+        assertUnescapeJava("lowercase Unicode", "\uABCDx", "\\uabcdx");
+        assertUnescapeJava("uppercase Unicode", "\uABCDx", "\\uABCDx");
+        assertUnescapeJava("Unicode as final character", "\uABCD", "\\uabcd");
+    }
+
+ private void assertUnescapeJava(final String unescaped, final String original) throws IOException {
+ assertUnescapeJava(null, unescaped, original);
+ }
+
+ private void assertUnescapeJava(final String message, final String unescaped, final String original) throws IOException {
+ final String expected = unescaped;
+ final String actual = StringEscapeUtils.unescapeJava(original);
+
+ assertEquals("unescape(String) failed" +
+ (message == null ? "" : (": " + message)) +
+ ": expected '" + StringEscapeUtils.escapeJava(expected) +
+ // we escape this so we can see it in the error message
+ "' actual '" + StringEscapeUtils.escapeJava(actual) + "'",
+ expected, actual);
+
+ final StringWriter writer = new StringWriter();
+ StringEscapeUtils.UNESCAPE_JAVA.translate(original, writer);
+ assertEquals(unescaped, writer.toString());
+
+ }
+
+ @Test
+ public void testEscapeEcmaScript() {
+ assertEquals(null, StringEscapeUtils.escapeEcmaScript(null));
+ try {
+ StringEscapeUtils.ESCAPE_ECMASCRIPT.translate(null, null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+ try {
+ StringEscapeUtils.ESCAPE_ECMASCRIPT.translate("", null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+
+ assertEquals("He didn\\'t say, \\\"stop!\\\"", StringEscapeUtils.escapeEcmaScript("He didn't say, \"stop!\""));
+ assertEquals("document.getElementById(\\\"test\\\").value = \\'<script>alert(\\'aaa\\');<\\/script>\\';",
+ StringEscapeUtils.escapeEcmaScript("document.getElementById(\"test\").value = '<script>alert('aaa');</script>';"));
+ }
+
+
+ // HTML and XML
+ //--------------------------------------------------------------
+
+    private static final String[][] HTML_ESCAPES = { // each row: {message, escaped HTML, unescaped text}
+        {"no escaping", "plain text", "plain text"},
+        {"no escaping", "plain text", "plain text"},
+        {"empty string", "", ""},
+        {"null", null, null},
+        {"ampersand", "bread &amp; butter", "bread & butter"},
+        {"quotes", "&quot;bread&quot; &amp; butter", "\"bread\" & butter"},
+        {"final character only", "greater than &gt;", "greater than >"},
+        {"first character only", "&lt; less than", "< less than"},
+        {"apostrophe", "Huntington's chorea", "Huntington's chorea"},
+        {"languages", "English,Fran&ccedil;ais,\u65E5\u672C\u8A9E (nihongo)", "English,Fran\u00E7ais,\u65E5\u672C\u8A9E (nihongo)"},
+        {"8-bit ascii shouldn't number-escape", "\u0080\u009F", "\u0080\u009F"},
+    };
+
+ @Test
+ public void testEscapeHtml() {
+ for (final String[] element : HTML_ESCAPES) {
+ final String message = element[0];
+ final String expected = element[1];
+ final String original = element[2];
+ assertEquals(message, expected, StringEscapeUtils.escapeHtml4(original));
+ final StringWriter sw = new StringWriter();
+ try {
+ StringEscapeUtils.ESCAPE_HTML4.translate(original, sw);
+ } catch (final IOException e) {
+ }
+ final String actual = original == null ? null : sw.toString();
+ assertEquals(message, expected, actual);
+ }
+ }
+
+ @Test
+ public void testUnescapeHtml4() {
+ for (final String[] element : HTML_ESCAPES) {
+ final String message = element[0];
+ final String expected = element[2];
+ final String original = element[1];
+ assertEquals(message, expected, StringEscapeUtils.unescapeHtml4(original));
+
+ final StringWriter sw = new StringWriter();
+ try {
+ StringEscapeUtils.UNESCAPE_HTML4.translate(original, sw);
+ } catch (final IOException e) {
+ }
+ final String actual = original == null ? null : sw.toString();
+ assertEquals(message, expected, actual);
+ }
+ // \u00E7 is a cedilla (c with wiggle under)
+ // note that the test string must be 7-bit-clean (Unicode escaped) or else it will compile incorrectly
+ // on some locales
+ assertEquals("funny chars pass through OK", "Fran\u00E7ais", StringEscapeUtils.unescapeHtml4("Fran\u00E7ais"));
+
+ assertEquals("Hello&;World", StringEscapeUtils.unescapeHtml4("Hello&;World"));
+ assertEquals("Hello&#;World", StringEscapeUtils.unescapeHtml4("Hello&#;World"));
+ assertEquals("Hello&# ;World", StringEscapeUtils.unescapeHtml4("Hello&# ;World"));
+ assertEquals("Hello&##;World", StringEscapeUtils.unescapeHtml4("Hello&##;World"));
+ }
+
+    @Test
+    public void testUnescapeHexCharsHtml() {
+        // Simple easy to grok test: hex references with a lowercase 'x' and then an uppercase 'X' prefix
+        assertEquals("hex number unescape", "\u0080\u009F", StringEscapeUtils.unescapeHtml4("&#x80;&#x9F;"));
+        assertEquals("hex number unescape", "\u0080\u009F", StringEscapeUtils.unescapeHtml4("&#X80;&#X9F;"));
+        // Test all Character values:
+        for (char i = Character.MIN_VALUE; i < Character.MAX_VALUE; i++) {
+            final Character c1 = new Character(i);
+            final Character c2 = new Character((char)(i+1));
+            final String expected = c1.toString() + c2.toString();
+            final String escapedC1 = "&#x" + Integer.toHexString((c1.charValue())) + ";";
+            final String escapedC2 = "&#x" + Integer.toHexString((c2.charValue())) + ";";
+            assertEquals("hex number unescape index " + (int)i, expected, StringEscapeUtils.unescapeHtml4(escapedC1 + escapedC2));
+        }
+    }
+
+ @Test
+ public void testUnescapeUnknownEntity() throws Exception {
+ assertEquals("&zzzz;", StringEscapeUtils.unescapeHtml4("&zzzz;"));
+ }
+
+    @Test
+    public void testEscapeHtmlVersions() throws Exception {
+        assertEquals("&Beta;", StringEscapeUtils.escapeHtml4("\u0392")); // Greek capital beta has a named HTML 4 entity
+        assertEquals("\u0392", StringEscapeUtils.unescapeHtml4("&Beta;"));
+
+        // TODO: refine API for escaping/unescaping specific HTML versions
+    }
+
+    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
+    public void testEscapeXml() throws Exception {
+        assertEquals("&lt;abc&gt;", StringEscapeUtils.escapeXml("<abc>"));
+        assertEquals("<abc>", StringEscapeUtils.unescapeXml("&lt;abc&gt;"));
+
+        assertEquals("XML should not escape >0x7f values",
+                "\u00A1", StringEscapeUtils.escapeXml("\u00A1"));
+        assertEquals("XML should be able to unescape >0x7f values",
+                "\u00A0", StringEscapeUtils.unescapeXml("&#160;"));
+        assertEquals("XML should be able to unescape >0x7f values with one leading 0",
+                "\u00A0", StringEscapeUtils.unescapeXml("&#0160;"));
+        assertEquals("XML should be able to unescape >0x7f values with two leading 0s",
+                "\u00A0", StringEscapeUtils.unescapeXml("&#00160;"));
+        assertEquals("XML should be able to unescape >0x7f values with three leading 0s",
+                "\u00A0", StringEscapeUtils.unescapeXml("&#000160;"));
+
+        assertEquals("ain't", StringEscapeUtils.unescapeXml("ain&apos;t"));
+        assertEquals("ain&apos;t", StringEscapeUtils.escapeXml("ain't"));
+        assertEquals("", StringEscapeUtils.escapeXml(""));
+        assertEquals(null, StringEscapeUtils.escapeXml(null));
+        assertEquals(null, StringEscapeUtils.unescapeXml(null));
+
+        StringWriter sw = new StringWriter();
+        try {
+            StringEscapeUtils.ESCAPE_XML.translate("<abc>", sw);
+        } catch (final IOException e) { // not expected: the target is an in-memory StringWriter
+        }
+        assertEquals("XML was escaped incorrectly", "&lt;abc&gt;", sw.toString() );
+
+        sw = new StringWriter();
+        try {
+            StringEscapeUtils.UNESCAPE_XML.translate("&lt;abc&gt;", sw);
+        } catch (final IOException e) { // not expected: the target is an in-memory StringWriter
+        }
+        assertEquals("XML was unescaped incorrectly", "<abc>", sw.toString() );
+    }
+
+    @Test
+    public void testEscapeXml10() throws Exception {
+        assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml10("a<b>c\"d'e&f"));
+        assertEquals("XML 1.0 should not escape \t \n \r",
+                "a\tb\rc\nd", StringEscapeUtils.escapeXml10("a\tb\rc\nd"));
+        assertEquals("XML 1.0 should omit most #x0-x8 | #xb | #xc | #xe-#x19",
+                "ab", StringEscapeUtils.escapeXml10("a\u0000\u0001\u0008\u000b\u000c\u000e\u001fb"));
+        assertEquals("XML 1.0 should omit #xd800-#xdfff", // both spaces of the input survive, only the surrogates go
+                "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml10("a\ud7ff\ud800 \udfff \ue000b"));
+        assertEquals("XML 1.0 should omit #xfffe | #xffff",
+                "a\ufffdb", StringEscapeUtils.escapeXml10("a\ufffd\ufffe\uffffb"));
+        assertEquals("XML 1.0 should escape #x7f-#x84 | #x86 - #x9f, for XML 1.1 compatibility",
+                "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml10("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
+    }
+
+    @Test
+    public void testEscapeXml11() throws Exception {
+        assertEquals("a&lt;b&gt;c&quot;d&apos;e&amp;f", StringEscapeUtils.escapeXml11("a<b>c\"d'e&f"));
+        assertEquals("XML 1.1 should not escape \t \n \r",
+                "a\tb\rc\nd", StringEscapeUtils.escapeXml11("a\tb\rc\nd"));
+        assertEquals("XML 1.1 should omit #x0",
+                "ab", StringEscapeUtils.escapeXml11("a\u0000b"));
+        assertEquals("XML 1.1 should escape #x1-x8 | #xb | #xc | #xe-#x19",
+                "a&#1;&#8;&#11;&#12;&#14;&#31;b", StringEscapeUtils.escapeXml11("a\u0001\u0008\u000b\u000c\u000e\u001fb"));
+        assertEquals("XML 1.1 should escape #x7F-#x84 | #x86-#x9F",
+                "a\u007e&#127;&#132;\u0085&#134;&#159;\u00a0b", StringEscapeUtils.escapeXml11("a\u007e\u007f\u0084\u0085\u0086\u009f\u00a0b"));
+        assertEquals("XML 1.1 should omit #xd800-#xdfff", // both spaces of the input survive, only the surrogates go
+                "a\ud7ff  \ue000b", StringEscapeUtils.escapeXml11("a\ud7ff\ud800 \udfff \ue000b"));
+        assertEquals("XML 1.1 should omit #xfffe | #xffff",
+                "a\ufffdb", StringEscapeUtils.escapeXml11("a\ufffd\ufffe\uffffb"));
+    }
+
+    /**
+     * Tests Supplementary characters.
+     * <p>
+     * From http://www.w3.org/International/questions/qa-escapes
+     * </p>
+     * <blockquote>
+     * Supplementary characters are those Unicode characters that have code points higher than the characters in
+     * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate code points from the
+     * BMP. Because of this, some people think that supplementary characters need to be represented using two escapes, but this is incorrect
+     * - you must use the single, code point value for that character. For example, use &amp;#x233B4; rather than
+     * &amp;#xD84C;&amp;#xDFB4;.
+     * </blockquote>
+     * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
+     * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
+     */
+    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
+    public void testEscapeXmlSupplementaryCharacters() {
+        final CharSequenceTranslator escapeXml =
+            StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );
+
+        assertEquals("Supplementary character must be represented using a single escape", "&#144308;",
+                escapeXml.translate("\uD84C\uDFB4")); // U+233B4 == 144308 decimal
+
+        assertEquals("Supplementary characters mixed with basic characters should be encoded correctly", "a b c &#144308;",
+                        escapeXml.translate("a b c \uD84C\uDFB4"));
+    }
+
+    @Test
+    @SuppressWarnings( "deprecation" ) // ESCAPE_XML has been replaced by ESCAPE_XML10 and ESCAPE_XML11 in 3.3
+    public void testEscapeXmlAllCharacters() {
+        // http://www.w3.org/TR/xml/#charsets says:
+        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character,
+        // excluding the surrogate blocks, FFFE, and FFFF. */
+        final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML
+                .with(NumericEntityEscaper.below(9), NumericEntityEscaper.between(0xB, 0xC), NumericEntityEscaper.between(0xE, 0x19),
+                        NumericEntityEscaper.between(0xD800, 0xDFFF), NumericEntityEscaper.between(0xFFFE, 0xFFFF), NumericEntityEscaper.above(0x110000));
+
+        assertEquals("&#0;&#1;&#2;&#3;&#4;&#5;&#6;&#7;&#8;", escapeXml.translate("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008"));
+        assertEquals("\t", escapeXml.translate("\t")); // 0x9
+        assertEquals("\n", escapeXml.translate("\n")); // 0xA
+        assertEquals("&#11;&#12;", escapeXml.translate("\u000B\u000C"));
+        assertEquals("\r", escapeXml.translate("\r")); // 0xD
+        assertEquals("Hello World! Ain&apos;t this great?", escapeXml.translate("Hello World! Ain't this great?"));
+        assertEquals("&#14;&#15;&#24;&#25;", escapeXml.translate("\u000E\u000F\u0018\u0019"));
+    }
+
+    /**
+     * Reverse of the above: a single numeric reference above the BMP must unescape to one surrogate pair.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/LANG-729">LANG-729</a>
+     */
+    @Test
+    public void testUnescapeXmlSupplementaryCharacters() {
+        assertEquals("Supplementary character must be represented using a single escape", "\uD84C\uDFB4",
+                StringEscapeUtils.unescapeXml("&#144308;") ); // 144308 decimal == U+233B4
+
+        assertEquals("Supplementary characters mixed with basic characters should be decoded correctly", "a b c \uD84C\uDFB4",
+                StringEscapeUtils.unescapeXml("a b c &#144308;") );
+    }
+
+    // Tests issue #38569: a bare '&' that starts no entity must pass through while real entities are decoded
+    // http://issues.apache.org/bugzilla/show_bug.cgi?id=38569
+    @Test
+    public void testStandaloneAmphersand() {
+        assertEquals("<P&O>", StringEscapeUtils.unescapeHtml4("&lt;P&O&gt;"));
+        assertEquals("test & <", StringEscapeUtils.unescapeHtml4("test & &lt;"));
+        assertEquals("<P&O>", StringEscapeUtils.unescapeXml("&lt;P&O&gt;"));
+        assertEquals("test & <", StringEscapeUtils.unescapeXml("test & &lt;"));
+    }
+
+    @Test
+    public void testLang313() {
+        assertEquals("& &", StringEscapeUtils.unescapeHtml4("& &amp;")); // bare '&' kept, "&amp;" decoded
+    }
+
+ @Test
+ public void testEscapeCsvString() throws Exception {
+ assertEquals("foo.bar", StringEscapeUtils.escapeCsv("foo.bar"));
+ assertEquals("\"foo,bar\"", StringEscapeUtils.escapeCsv("foo,bar"));
+ assertEquals("\"foo\nbar\"", StringEscapeUtils.escapeCsv("foo\nbar"));
+ assertEquals("\"foo\rbar\"", StringEscapeUtils.escapeCsv("foo\rbar"));
+ assertEquals("\"foo\"\"bar\"", StringEscapeUtils.escapeCsv("foo\"bar"));
+ assertEquals("foo\uD84C\uDFB4bar", StringEscapeUtils.escapeCsv("foo\uD84C\uDFB4bar"));
+ assertEquals("", StringEscapeUtils.escapeCsv(""));
+ assertEquals(null, StringEscapeUtils.escapeCsv(null));
+ }
+
+ @Test
+ public void testEscapeCsvWriter() throws Exception {
+ checkCsvEscapeWriter("foo.bar", "foo.bar");
+ checkCsvEscapeWriter("\"foo,bar\"", "foo,bar");
+ checkCsvEscapeWriter("\"foo\nbar\"", "foo\nbar");
+ checkCsvEscapeWriter("\"foo\rbar\"", "foo\rbar");
+ checkCsvEscapeWriter("\"foo\"\"bar\"", "foo\"bar");
+ checkCsvEscapeWriter("foo\uD84C\uDFB4bar", "foo\uD84C\uDFB4bar");
+ checkCsvEscapeWriter("", null);
+ checkCsvEscapeWriter("", "");
+ }
+
+    private void checkCsvEscapeWriter(final String expected, final String value) {
+        final StringWriter out = new StringWriter();
+        try {
+            StringEscapeUtils.ESCAPE_CSV.translate(value, out);
+        } catch (final IOException ioe) {
+            fail("Threw: " + ioe);
+        }
+        assertEquals(expected, out.toString());
+    }
+
+ @Test
+ public void testUnescapeCsvString() throws Exception {
+ assertEquals("foo.bar", StringEscapeUtils.unescapeCsv("foo.bar"));
+ assertEquals("foo,bar", StringEscapeUtils.unescapeCsv("\"foo,bar\""));
+ assertEquals("foo\nbar", StringEscapeUtils.unescapeCsv("\"foo\nbar\""));
+ assertEquals("foo\rbar", StringEscapeUtils.unescapeCsv("\"foo\rbar\""));
+ assertEquals("foo\"bar", StringEscapeUtils.unescapeCsv("\"foo\"\"bar\""));
+ assertEquals("foo\uD84C\uDFB4bar", StringEscapeUtils.unescapeCsv("foo\uD84C\uDFB4bar"));
+ assertEquals("", StringEscapeUtils.unescapeCsv(""));
+ assertEquals(null, StringEscapeUtils.unescapeCsv(null));
+
+ assertEquals("\"foo.bar\"", StringEscapeUtils.unescapeCsv("\"foo.bar\""));
+ }
+
+ @Test
+ public void testUnescapeCsvWriter() throws Exception {
+ checkCsvUnescapeWriter("foo.bar", "foo.bar");
+ checkCsvUnescapeWriter("foo,bar", "\"foo,bar\"");
+ checkCsvUnescapeWriter("foo\nbar", "\"foo\nbar\"");
+ checkCsvUnescapeWriter("foo\rbar", "\"foo\rbar\"");
+ checkCsvUnescapeWriter("foo\"bar", "\"foo\"\"bar\"");
+ checkCsvUnescapeWriter("foo\uD84C\uDFB4bar", "foo\uD84C\uDFB4bar");
+ checkCsvUnescapeWriter("", null);
+ checkCsvUnescapeWriter("", "");
+
+ checkCsvUnescapeWriter("\"foo.bar\"", "\"foo.bar\"");
+ }
+
+    private void checkCsvUnescapeWriter(final String expected, final String value) {
+        final StringWriter out = new StringWriter();
+        try {
+            StringEscapeUtils.UNESCAPE_CSV.translate(value, out);
+        } catch (final IOException ioe) {
+            fail("Threw: " + ioe);
+        }
+        assertEquals(expected, out.toString());
+    }
+
+    /**
+     * Tests https://issues.apache.org/jira/browse/LANG-480
+     */
+    @Test
+    public void testEscapeHtmlHighUnicode() {
+        // this is the utf8 representation of the character:
+        // COUNTING ROD UNIT DIGIT THREE
+        // in Unicode
+        // codepoint: U+1D362
+        final byte[] data = new byte[] { (byte)0xF0, (byte)0x9D, (byte)0x8D, (byte)0xA2 };
+
+        final String original = new String(data, Charset.forName("UTF8"));
+
+        final String escaped = StringEscapeUtils.escapeHtml4( original );
+        assertEquals( "High Unicode should not have been escaped", original, escaped);
+
+        final String unescaped = StringEscapeUtils.unescapeHtml4( escaped );
+        assertEquals( "High Unicode should have been unchanged", original, unescaped);
+
+        // TODO: I think this should hold, needs further investigation (119650 decimal == U+1D362)
+        //        String unescapedFromEntity = StringEscapeUtils.unescapeHtml4( "&#119650;" );
+        //        assertEquals( "High Unicode should have been unescaped", original, unescapedFromEntity);
+    }
+
+ /**
+ * Tests https://issues.apache.org/jira/browse/LANG-339
+ */
+ @Test
+ public void testEscapeHiragana() {
+ // Some random Japanese Unicode characters
+ final String original = "\u304B\u304C\u3068";
+ final String escaped = StringEscapeUtils.escapeHtml4(original);
+ assertEquals( "Hiragana character Unicode behaviour should not be being escaped by escapeHtml4",
+ original, escaped);
+
+ final String unescaped = StringEscapeUtils.unescapeHtml4( escaped );
+
+ assertEquals( "Hiragana character Unicode behaviour has changed - expected no unescaping", escaped, unescaped);
+ }
+
+    /**
+     * Tests https://issues.apache.org/jira/browse/LANG-708
+     *
+     * @throws IOException
+     *             if an I/O error occurs
+     */
+    @Test
+    public void testLang708() throws IOException {
+        final FileInputStream fis = new FileInputStream("src/test/resources/lang-708-input.txt");
+        try {
+            final String escaped = StringEscapeUtils.escapeEcmaScript(IOUtils.toString(fis, "UTF-8"));
+            assertTrue(escaped, escaped.endsWith("}]")); // just the end
+            assertTrue(escaped, escaped.endsWith("\"valueCode\\\":\\\"\\\"}]")); // a little more
+        } finally {
+            fis.close(); // release the file handle even when reading or an assertion fails
+        }
+    }
+
+ /**
+ * Tests https://issues.apache.org/jira/browse/LANG-720
+ */
+ @Test
+ @SuppressWarnings( "deprecation" ) // escapeXml(String) has been replaced by escapeXml10(String) and escapeXml11(String) in 3.3
+ public void testLang720() {
+ final String input = "\ud842\udfb7" + "A";
+ final String escaped = StringEscapeUtils.escapeXml(input);
+ assertEquals(input, escaped);
+ }
+
+ /**
+ * Tests https://issues.apache.org/jira/browse/LANG-911
+ */
+ @Test
+ public void testLang911() {
+ final String bellsTest = "\ud83d\udc80\ud83d\udd14";
+ final String value = StringEscapeUtils.escapeJava(bellsTest);
+ final String valueTest = StringEscapeUtils.unescapeJava(value);
+ assertEquals(bellsTest, valueTest);
+ }
+
+ @Test
+ public void testEscapeJson() {
+ assertEquals(null, StringEscapeUtils.escapeJson(null));
+ try {
+ StringEscapeUtils.ESCAPE_JSON.translate(null, null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+ try {
+ StringEscapeUtils.ESCAPE_JSON.translate("", null);
+ fail();
+ } catch (final IOException ex) {
+ fail();
+ } catch (final IllegalArgumentException ex) {
+ }
+
+ assertEquals("He didn't say, \\\"stop!\\\"", StringEscapeUtils.escapeJson("He didn't say, \"stop!\""));
+
+ final String expected = "\\\"foo\\\" isn't \\\"bar\\\". specials: \\b\\r\\n\\f\\t\\\\\\/";
+ final String input ="\"foo\" isn't \"bar\". specials: \b\r\n\f\t\\/";
+
+ assertEquals(expected, StringEscapeUtils.escapeJson(input));
+ }
+
+}
\ No newline at end of file