blob: fa9959cad4c86ac822be7d5bf70ee17678812f46 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.myfaces.renderkit.html.util;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
/**
* Converts Strings so that they can be used within HTML-Code.
*/
public abstract class HTMLEncoder
{
/**
* Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
*/
public static String encode(String string)
{
return encode(string, false, true);
}
/**
* Variant of {@link #encode} where encodeNbsp is true.
*/
public static String encode(String string, boolean encodeNewline)
{
return encode(string, encodeNewline, true);
}
/**
* Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
*/
public static String encode(String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
{
return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
}
/**
* Encodes the given string, so that it can be used within a html page.
* @param string the string to convert
* @param encodeNewline if true newline characters are converted to <br>'s
* @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to  's
* @param encodeNonLatin if true encode non-latin characters as numeric character references
*/
public static String encode(String string,
boolean encodeNewline,
boolean encodeSubsequentBlanksToNbsp,
boolean encodeNonLatin)
{
if (string == null)
{
return "";
}
StringBuilder sb = null; //create later on demand
String app;
char c = ' ';
char prevC;
int length = string.length();
for (int i = 0; i < length; ++i)
{
app = null;
prevC = c;
c = string.charAt(i);
// All characters before letters
if ((int)c < 0x41)
{
switch (c)
{
case '"': app = "&quot;"; break; //"
case '&': app = "&amp;"; break; //&
case '<': app = "&lt;"; break; //<
case '>': app = "&gt;"; break; //>
case ' ':
if (encodeSubsequentBlanksToNbsp && prevC == ' ')
{
//Space at beginning or after another space
app = "&#160;";
}
break;
case '\n':
if (encodeNewline)
{
app = "<br/>";
}
break;
default:
break;
}
// http://www.w3.org/MarkUp/html3/specialchars.html
// From C0 extension U+0000-U+001F only U+0009, U+000A and
// U+000D are valid control characters
if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
{
// Ignore escape character
app = "";
}
}
else if (encodeNonLatin && (int)c > 0x80)
{
switch(c)
{
//german umlauts
case '\u00E4' : app = "&auml;"; break;
case '\u00C4' : app = "&Auml;"; break;
case '\u00F6' : app = "&ouml;"; break;
case '\u00D6' : app = "&Ouml;"; break;
case '\u00FC' : app = "&uuml;"; break;
case '\u00DC' : app = "&Uuml;"; break;
case '\u00DF' : app = "&szlig;"; break;
//misc
//case 0x80: app = "&euro;"; break; sometimes euro symbol is ascii 128, should we suport it?
case '\u20AC': app = "&euro;"; break;
case '\u00AB': app = "&laquo;"; break;
case '\u00BB': app = "&raquo;"; break;
case '\u00A0': app = "&#160;"; break;
default :
//encode all non basic latin characters
app = "&#" + ((int)c) + ';';
break;
}
}
if (app != null)
{
if (sb == null)
{
sb = new StringBuilder(string.substring(0, i));
}
sb.append(app);
}
else
{
if (sb != null)
{
sb.append(c);
}
}
}
if (sb == null)
{
return string;
}
else
{
return sb.toString();
}
}
/**
* Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
*/
public static void encode(Writer writer, String string) throws IOException
{
encode(writer, string, false, true);
}
/**
* Variant of {@link #encode} where encodeNbsp is true.
*/
public static void encode(Writer writer, String string, boolean encodeNewline) throws IOException
{
encode(writer, string, encodeNewline, true);
}
/**
* Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
*/
public static void encode(Writer writer, String string,
boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException
{
encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
}
public static void encode(Writer writer, String string,
boolean encodeNewline,
boolean encodeSubsequentBlanksToNbsp,
boolean encodeNonLatin) throws IOException
{
if (string == null)
{
return;
}
int start = 0;
String app;
char c = ' ';
char prevC;
int length = string.length();
for (int i = 0; i < length; ++i)
{
app = null;
prevC = c;
c = string.charAt(i);
// All characters before letters
if ((int)c < 0x41)
{
switch (c)
{
case '"': app = "&quot;"; break; //"
case '&': app = "&amp;"; break; //&
case '<': app = "&lt;"; break; //<
case '>': app = "&gt;"; break; //>
case ' ':
if (encodeSubsequentBlanksToNbsp &&
prevC == ' ')
{
//Space at beginning or after another space
app = "&#160;";
}
break;
case '\n':
if (encodeNewline)
{
app = "<br/>";
}
break;
default:
break;
}
// http://www.w3.org/MarkUp/html3/specialchars.html
// From C0 extension U+0000-U+001F only U+0009, U+000A and
// U+000D are valid control characters
if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
{
// Ignore escape character
app = "";
}
}
else if (encodeNonLatin && (int)c > 0x80)
{
switch(c)
{
//german umlauts
case '\u00E4' : app = "&auml;"; break;
case '\u00C4' : app = "&Auml;"; break;
case '\u00F6' : app = "&ouml;"; break;
case '\u00D6' : app = "&Ouml;"; break;
case '\u00FC' : app = "&uuml;"; break;
case '\u00DC' : app = "&Uuml;"; break;
case '\u00DF' : app = "&szlig;"; break;
//misc
//case 0x80: app = "&euro;"; break; sometimes euro symbol is ascii 128, should we suport it?
case '\u20AC': app = "&euro;"; break;
case '\u00AB': app = "&laquo;"; break;
case '\u00BB': app = "&raquo;"; break;
case '\u00A0': app = "&#160;"; break;
default :
//encode all non basic latin characters
app = "&#" + ((int)c) + ';';
break;
}
}
if (app != null)
{
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
writer.write(app);
}
}
if (start == 0)
{
writer.write(string);
}
else if (start < length)
{
writer.write(string,start,length-start);
}
}
/**
* Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
*/
public static void encode(char[] string, int offset, int length, Writer writer) throws IOException
{
encode(string, offset, length, false, true, writer);
}
/**
* Variant of {@link #encode} where encodeNbsp is true.
*/
public static void encode(char[] string, int offset, int length, boolean encodeNewline, Writer writer)
throws IOException
{
encode(string, offset, length, encodeNewline, true, writer);
}
/**
* Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
*/
public static void encode(char[] string, int offset, int length, boolean encodeNewline,
boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
{
encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
}
/**
* Encodes the given string, so that it can be used within a html page.
* @param string the string to convert
* @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
* @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
* @param encodeNonLatin if true encode non-latin characters as numeric character references
*/
public static void encode(char[] string, int offset, int length,
boolean encodeNewline,
boolean encodeSubsequentBlanksToNbsp,
boolean encodeNonLatin, Writer writer) throws IOException
{
if (string == null || length < 0 || offset >= string.length)
{
return;
}
offset = Math.max(0, offset);
int realLength = Math.min(length, string.length - offset);
//StringBuilder sb = null; //create later on demand
String app;
char c = ' ';
char prevC;
int start = offset;
for (int i = offset; i < offset + realLength; ++i)
{
app = null;
prevC = c;
c = string[i];
// All characters before letters
if ((int)c < 0x41)
{
switch (c)
{
case '"': app = "&quot;"; break; //"
case '&': app = "&amp;"; break; //&
case '<': app = "&lt;"; break; //<
case '>': app = "&gt;"; break; //>
case ' ':
if (encodeSubsequentBlanksToNbsp &&
prevC == ' ')
{
//Space at beginning or after another space
app = "&#160;";
}
break;
case '\n':
if (encodeNewline)
{
app = "<br/>";
}
break;
default:
break;
}
// http://www.w3.org/MarkUp/html3/specialchars.html
// From C0 extension U+0000-U+001F only U+0009, U+000A and
// U+000D are valid control characters
if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
{
// Ignore escape character
app = "";
}
}
else if (encodeNonLatin && (int)c > 0x80)
{
switch(c)
{
//german umlauts
case '\u00E4' : app = "&auml;"; break;
case '\u00C4' : app = "&Auml;"; break;
case '\u00F6' : app = "&ouml;"; break;
case '\u00D6' : app = "&Ouml;"; break;
case '\u00FC' : app = "&uuml;"; break;
case '\u00DC' : app = "&Uuml;"; break;
case '\u00DF' : app = "&szlig;"; break;
//misc
//case 0x80: app = "&euro;"; break; sometimes euro symbol is ascii 128, should we suport it?
case '\u20AC': app = "&euro;"; break;
case '\u00AB': app = "&laquo;"; break;
case '\u00BB': app = "&raquo;"; break;
case '\u00A0': app = "&#160;"; break;
default :
//encode all non basic latin characters
app = "&#" + ((int)c) + ';';
break;
}
}
if (app != null)
{
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
writer.write(app);
}
}
if (start == offset)
{
writer.write(string, offset, realLength);
}
else if (start < offset+realLength)
{
writer.write(string,start,offset+realLength-start);
}
}
private static final String HEX_CHARSET = "0123456789ABCDEF";
private static final String UTF8 = "UTF-8";
/**
* Encode an URI, escaping or percent-encoding all required characters and
* following the rules mentioned on RFC 3986.
*
* @param string
* @param characterEncoding
* @return
* @throws IOException
*/
public static String encodeURIAttribute(final String string, final String characterEncoding)
throws IOException
{
StringBuilder sb = null; //create later on demand
String app;
char c;
boolean endLoop = false;
int length = string.length();
for (int i = 0; i < length; ++i)
{
app = null;
c = string.charAt(i);
// This are the guidelines to be taken into account by this algorithm to encode:
// RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
//
// control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
// space = <US-ASCII coded character 20 hexadecimal>
// delims = "<" | ">" | "#" | "%" | <">
// %3C %3E %23 %25 %22
// unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
// %7D %7B %7C %5C %5E %5B %5D %60
//
// ".... Data corresponding to excluded characters must be escaped in order to
// be properly represented within a URI....."
// RFC 3986 Section 3. Syntax Components
//
// "... The generic URI syntax consists of a hierarchical sequence of
// components referred to as the scheme, authority, path, query, and
// fragment.
//
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
//
// hier-part = "//" authority path-abempty
// / path-absolute
// / path-rootless
// / path-empty
// ...."
// RFC 3986 Section 2.2:
// Reserved characters (should not be percent-encoded)
// reserved = gen-delims / sub-delims
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// %3A %2F %3F %23 %5B %5D %40
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D
// Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
// but on the part D. Changes from RFC 2396 says about this chars (used on IPv6)
// "...those rules were redefined to directly specify the characters allowed...."
// There is also other characters moved from excluded list to reserved:
// "[" / "]" / "#"
// RFC 3986 Section 2.3:
// "... for consistency, percent-encoded octets in the ranges of ALPHA
// (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
// underscore (%5F), or tilde (%7E) should not be created by URI
// producers...."
// RFC 3986 Section 3.2.2. Host
// host = IP-literal / IPv4address / reg-name
// The reg-name syntax allows percent-encoded octets in order to
// represent non-ASCII registered names in a uniform way that is
// independent of the underlying name resolution technology. Non-ASCII
// characters must first be encoded according to UTF-8 [STD63], and then
// each octet of the corresponding UTF-8 sequence must be percent-
// encoded to be represented as URI characters. URI producing
// applications must not use percent-encoding in host unless it is used
// to represent a UTF-8 character sequence.
// RFC 3986 Section 3.4 Query
// query = *( pchar / "/" / "?" )
//
// "... However, as query components are often used to carry identifying information
// in the form of "key=value" pairs and one frequently used value is a reference to
// another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
//
// RFC 3986 Section 2.5 Identifying Data (Apply to query section)
//
// When a new URI scheme defines a component that represents textual
// data consisting of characters from the Universal Character Set [UCS],
// the data should first be encoded as octets according to the UTF-8
// character encoding [STD63]; then only those octets that do not
// correspond to characters in the unreserved set should be percent-
// encoded. For example, the character A would be represented as "A",
// the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
// as "%C3%80", and the character KATAKANA LETTER A would be represented
// as "%E3%82%A2".
//
// RFC 3986 Section 3.5 Fragment
// fragment = *( pchar / "/" / "?" )
//
// Note that follows the same as query
// Based on the extracts the strategy to apply on this method is:
//
// On scheme ":" hier-part
//
// Escape or percent encode chars inside :
//
// - From %00 to %20,
// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
// duplicate encoding, encode it when we are sure
// that there are not encoded twice)
// - "<" %3C, ">" %3E
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
// part of an URI, but it is preferred to encode it that omit it).
//
// The remaining characters must not be encoded
//
// Characters after ? or # should be percent encoding but only the necessary ones:
//
// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
// duplicate encoding, encode it when we are sure
// that there are not encoded twice)
// - "<" %3C, ">" %3E,
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (each character as many bytes as necessary but take into account
// that a single char should contain 2,3 or more bytes!. This data should be encoded
// translating from the document character encoding to percent encoding, because this values
// could be retrieved from httpRequest.getParameter() and it uses the current character encoding
// for decode values)
//
// "&" should be encoded as "&amp;" because this link is inside an html page, and
// put only & is invalid in this context.
if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
c == '"' || c == '<' ||
c == '>' || c == '\\' || c == '^' || c == '`' ||
c == '{' || c == '|' || c == '}')
{
// The percent encoding on this part should be done using UTF-8 charset
// as RFC 3986 Section 3.2.2 says.
// Also there is a reference on
// http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
// that recommend use of UTF-8 instead the document character encoding.
// Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
app = percentEncode(c, "UTF-8");
}
else if (c == '%')
{
if (i + 2 < length)
{
char c1 = string.charAt(i+1);
char c2 = string.charAt(i+2);
if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
(( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
{
// do not percent encode, because it could be already encoded
// and we don't want encode it twice
}
else
{
app = percentEncode(c, UTF8);
}
}
else
{
app = percentEncode(c, UTF8);
}
}
else if (c == '?' || c == '#')
{
if (i+1 < length)
{
// The remaining part of the URI are data that should be encoded
// using the document character encoding.
app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
endLoop = true;
}
}
else
{
//No encoding, just do nothing, char will be added later.
}
if (app != null)
{
if (sb == null)
{
sb = new StringBuilder(string.substring(0, i));
}
sb.append(app);
}
else
{
if (sb != null)
{
sb.append(c);
}
}
if (endLoop)
{
break;
}
}
if (sb == null)
{
return string;
}
else
{
return sb.toString();
}
}
/**
* Encode a unicode char value in percentEncode, decoding its bytes using a specified
* characterEncoding.
*
* @param c
* @param characterEncoding
* @return
*/
private static String percentEncode(char c, String characterEncoding)
{
String app = null;
if (c > (char)((short)0x007F))
{
//percent encode in the proper encoding to be consistent
app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
}
else
{
//percent encode US-ASCII char (0x00-0x7F range)
app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
}
return app;
}
private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
{
ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
StringBuilder builder = new StringBuilder();
try
{
OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
writer.write(c);
writer.flush();
}
catch(IOException e)
{
baos.reset();
return null;
}
byte [] byteArray = baos.toByteArray();
for (int i=0; i < byteArray.length; i++)
{
builder.append('%');
builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
}
return builder.toString();
}
/**
* Encode the query part using the document charset encoding provided.
*
*
* @param string
* @param characterEncoding
* @return
*/
private static String encodeURIQuery(final String string, final String characterEncoding)
{
StringBuilder sb = null; //create later on demand
String app;
char c;
boolean endLoop = false;
int length = string.length();
for (int i = 0; i < length; ++i)
{
app = null;
c = string.charAt(i);
// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
// - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so
// we make easier and omit this one)
// - "<" %3C, ">" %3E,
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (each character as many bytes as necessary but take into account
// that a single char should contain 2,3 or more bytes!. This data should be encoded
// translating from the document character encoding to percent encoding)
//
// "&" should be encoded as "&amp;" because this link is inside an html page, and
// put & is invalid in this context
if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
c == '"' || c == '<' ||
c == '>' || c == '\\' || c == '^' || c == '`' ||
c == '{' || c == '|' || c == '}')
{
// The percent encoding on this part should be done using UTF-8 charset
// as RFC 3986 Section 3.2.2 says
app = percentEncode(c, characterEncoding);
}
else if (c == '%')
{
if (i + 2 < length)
{
char c1 = string.charAt(i+1);
char c2 = string.charAt(i+2);
if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
(( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
{
// do not percent encode, because it could be already encoded
}
else
{
app = percentEncode(c, characterEncoding);
}
}
else
{
app = percentEncode(c, characterEncoding);
}
}
else if (c == '&')
{
if (i+4 < length )
{
if ('a' == string.charAt(i+1) &&
'm' == string.charAt(i+2) &&
'p' == string.charAt(i+3) &&
';' == string.charAt(i+4))
{
//Skip
}
else
{
app = "&amp;";
}
}
else
{
app = "&amp;";
}
}
else
{
//No encoding, just do nothing, char will be added later.
}
if (app != null)
{
if (sb == null)
{
sb = new StringBuilder(string.substring(0, i));
}
sb.append(app);
}
else
{
if (sb != null)
{
sb.append(c);
}
}
if (endLoop)
{
break;
}
}
if (sb == null)
{
return string;
}
else
{
return sb.toString();
}
}
/**
* Encode an URI, escaping or percent-encoding all required characters and
* following the rules mentioned on RFC 3986.
*
* @param writer
* @param string
* @param characterEncoding
* @throws IOException
*/
public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding)
throws IOException
{
//StringBuilder sb = null; //create later on demand
int start = 0;
String app;
char c;
boolean endLoop = false;
int length = string.length();
for (int i = 0; i < length; ++i)
{
app = null;
c = string.charAt(i);
// This are the guidelines to be taken into account by this algorithm to encode:
// RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
//
// control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
// space = <US-ASCII coded character 20 hexadecimal>
// delims = "<" | ">" | "#" | "%" | <">
// %3C %3E %23 %25 %22
// unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
// %7D %7B %7C %5C %5E %5B %5D %60
//
// ".... Data corresponding to excluded characters must be escaped in order to
// be properly represented within a URI....."
// RFC 3986 Section 3. Syntax Components
//
// "... The generic URI syntax consists of a hierarchical sequence of
// components referred to as the scheme, authority, path, query, and
// fragment.
//
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
//
// hier-part = "//" authority path-abempty
// / path-absolute
// / path-rootless
// / path-empty
// ...."
// RFC 3986 Section 2.2:
// Reserved characters (should not be percent-encoded)
// reserved = gen-delims / sub-delims
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// %3A %2F %3F %23 %5B %5D %40
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D
// Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
// but on the part D. Changes from RFC 2396 says about this chars (used on IPv6)
// "...those rules were redefined to directly specify the characters allowed...."
// There is also other characters moved from excluded list to reserved:
// "[" / "]" / "#"
// RFC 3986 Section 2.3:
// "... for consistency, percent-encoded octets in the ranges of ALPHA
// (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
// underscore (%5F), or tilde (%7E) should not be created by URI
// producers...."
// RFC 3986 Section 3.2.2. Host
// host = IP-literal / IPv4address / reg-name
// The reg-name syntax allows percent-encoded octets in order to
// represent non-ASCII registered names in a uniform way that is
// independent of the underlying name resolution technology. Non-ASCII
// characters must first be encoded according to UTF-8 [STD63], and then
// each octet of the corresponding UTF-8 sequence must be percent-
// encoded to be represented as URI characters. URI producing
// applications must not use percent-encoding in host unless it is used
// to represent a UTF-8 character sequence.
// RFC 3986 Section 3.4 Query
// query = *( pchar / "/" / "?" )
//
// "... However, as query components are often used to carry identifying information
// in the form of "key=value" pairs and one frequently used value is a reference to
// another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
//
// RFC 3986 Section 2.5 Identifying Data (Apply to query section)
//
// When a new URI scheme defines a component that represents textual
// data consisting of characters from the Universal Character Set [UCS],
// the data should first be encoded as octets according to the UTF-8
// character encoding [STD63]; then only those octets that do not
// correspond to characters in the unreserved set should be percent-
// encoded. For example, the character A would be represented as "A",
// the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
// as "%C3%80", and the character KATAKANA LETTER A would be represented
// as "%E3%82%A2".
//
// RFC 3986 Section 3.5 Fragment
// fragment = *( pchar / "/" / "?" )
//
// Note that follows the same as query
// Based on the extracts the strategy to apply on this method is:
//
// On scheme ":" hier-part
//
// Escape or percent encode chars inside :
//
// - From %00 to %20,
// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
// duplicate encoding, encode it when we are sure
// that there are not encoded twice)
// - "<" %3C, ">" %3E
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
// part of an URI, but it is preferred to encode it that omit it).
//
// The remaining characters must not be encoded
//
// Characters after ? or # should be percent encoding but only the necessary ones:
//
// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
// duplicate encoding, encode it when we are sure
// that there are not encoded twice)
// - "<" %3C, ">" %3E,
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (each character as many bytes as necessary but take into account
// that a single char should contain 2,3 or more bytes!. This data should be encoded
// translating from the document character encoding to percent encoding, because this values
// could be retrieved from httpRequest.getParameter() and it uses the current character encoding
// for decode values)
//
// "&" should be encoded as "&amp;" because this link is inside an html page, and
// put only & is invalid in this context.
if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
c == '"' || c == '<' ||
c == '>' || c == '\\' || c == '^' || c == '`' ||
c == '{' || c == '|' || c == '}')
{
// The percent encoding on this part should be done using UTF-8 charset
// as RFC 3986 Section 3.2.2 says.
// Also there is a reference on
// http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
// that recommend use of UTF-8 instead the document character encoding.
// Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
//app = percentEncode(c, "UTF-8");
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, "UTF-8");
}
else if (c == '%')
{
if (i + 2 < length)
{
char c1 = string.charAt(i+1);
char c2 = string.charAt(i+2);
if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
(( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
{
// do not percent encode, because it could be already encoded
// and we don't want encode it twice
}
else
{
//app = percentEncode(c, UTF8);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, UTF8);
}
}
else
{
//app = percentEncode(c, UTF8);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, UTF8);
}
}
else if (c == '?' || c == '#')
{
if (i+1 < length)
{
// The remaining part of the URI are data that should be encoded
// using the document character encoding.
//app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
writer.write(c);
//encodeURIQuery(writer, string.substring(i+1), characterEncoding);
encodeURIQuery(writer, string, i+1, characterEncoding);
endLoop = true;
}
}
else
{
//No encoding, just do nothing, char will be added later.
}
if (app != null)
{
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
writer.write(app);
}
if (endLoop)
{
start = length;
break;
}
}
if (start == 0)
{
writer.write(string);
}
else if (start < length)
{
writer.write(string,start,length-start);
}
}
/**
* Encode a unicode char value in percentEncode, decoding its bytes using a specified
* characterEncoding.
*
* @param c
* @param characterEncoding
* @return
*/
private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException
{
if (c > (char)((short)0x007F))
{
//percent encode in the proper encoding to be consistent
percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding);
}
else
{
//percent encode US-ASCII char (0x00-0x7F range)
writer.write('%');
writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)));
writer.write(HEX_CHARSET.charAt(c % 0x10));
}
}
private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding)
throws IOException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
try
{
OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
writer.write(c);
writer.flush();
}
catch(IOException e)
{
baos.reset();
return;
}
byte [] byteArray = baos.toByteArray();
for (int i=0; i < byteArray.length; i++)
{
//builder.append('%');
//builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
//builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
currentWriter.write('%');
currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
}
//return builder.toString();
}
/**
* Encode the query part using the document charset encoding provided.
*
*
* @param string
* @param characterEncoding
* @return
*/
private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding)
throws IOException
{
//StringBuilder sb = null; //create later on demand
int start = offset;
int length = string.length();
int realLength = length-offset;
String app;
char c;
//boolean endLoop = false;
for (int i = offset; i < length; ++i)
{
app = null;
c = string.charAt(i);
// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
// - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so
// we make easier and omit this one)
// - "<" %3C, ">" %3E,
// - "\" %5C, "^" %5E, "`" %60
// - "{" %7B, "|" %7C, "}" %7D
// - From %7F ad infinitum (each character as many bytes as necessary but take into account
// that a single char should contain 2,3 or more bytes!. This data should be encoded
// translating from the document character encoding to percent encoding)
//
// "&" should be encoded as "&amp;" because this link is inside an html page, and
// put & is invalid in this context
if ( (c <= (char)0x20) || (c >= (char)0x7F) ||
c == '"' || c == '<' ||
c == '>' || c == '\\' || c == '^' || c == '`' ||
c == '{' || c == '|' || c == '}')
{
// The percent encoding on this part should be done using UTF-8 charset
// as RFC 3986 Section 3.2.2 says
//app = percentEncode(c, characterEncoding);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, characterEncoding);
}
else if (c == '%')
{
if (i + 2 < length)
{
char c1 = string.charAt(i+1);
char c2 = string.charAt(i+2);
if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
(( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
{
// do not percent encode, because it could be already encoded
}
else
{
//app = percentEncode(c, characterEncoding);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, characterEncoding);
}
}
else
{
//app = percentEncode(c, characterEncoding);
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
percentEncode(writer, c, characterEncoding);
}
}
else if (c == '&')
{
if (i+4 < length )
{
if ('a' == string.charAt(i+1) &&
'm' == string.charAt(i+2) &&
'p' == string.charAt(i+3) &&
';' == string.charAt(i+4))
{
//Skip
}
else
{
app = "&amp;";
}
}
else
{
app = "&amp;";
}
}
else
{
//No encoding, just do nothing, char will be added later.
}
if (app != null)
{
if (start < i)
{
writer.write(string, start, i-start);
}
start = i+1;
writer.write(app);
}
}
if (start == offset)
{
writer.write(string, offset, realLength);
}
else if (start < length)
{
writer.write(string,start,length-start);
}
}
}