blob: 2da117764c432c9f9c0813c00458d8d86b0e7a3b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tomcat.util.buf;
import java.io.IOException;
import java.util.BitSet;
/**
* Efficient implementation of a URL (percent) encoder that uses UTF-8.
* This class is not thread safe - you need one encoder per thread.
* The encoder will save and recycle the internal objects, avoiding
* garbage.
*
* You can add extra characters that you want preserved, for example
* while encoding a URL you can add "/".
*
* @author Costin Manolache
*/
public final class UEncoder {

    /**
     * Predefined sets of characters that are copied to the output without
     * being percent-encoded. Each constant owns a single {@link BitSet} that
     * is shared by every encoder created from it, which is why encoders built
     * from a constant refuse {@link UEncoder#addSafeCharacter(char)}.
     */
    public enum SafeCharsSet {
        WITH_SLASH("/"), DEFAULT("");

        private final BitSet safeChars;

        private BitSet getSafeChars() {
            return this.safeChars;
        }

        /**
         * @param additionalSafeChars characters to treat as safe in addition
         *                            to the initial safe set
         */
        private SafeCharsSet(String additionalSafeChars) {
            safeChars = initialSafeChars();
            for (char c : additionalSafeChars.toCharArray()) {
                safeChars.set(c);
            }
        }
    }

    /** Name of the character encoding used to turn characters into bytes. */
    private static final String ENCODING = "UTF8";

    // Per instance rather than static: each encoder may allow a different set
    // of safe characters, which is better than adding an extra check for
    // "/", "+", etc. while encoding.
    private final BitSet safeChars;

    // Working objects, created on first use by encodeURL() and recycled on
    // every later call.
    private C2BConverter c2b;
    private ByteChunk bb;
    private CharChunk cb;
    private CharChunk output;

    private final boolean readOnlySafeChars;

    /**
     * Create a UEncoder with its own, modifiable copy of the initial safe
     * character set.
     */
    public UEncoder() {
        this.safeChars = initialSafeChars();
        readOnlySafeChars = false;
    }

    /**
     * Create a UEncoder with an unmodifiable safe character set.
     * <p>
     * Calls to {@link UEncoder#addSafeCharacter(char) addSafeCharacter(char)}
     * on instances created by this constructor will throw an
     * {@link IllegalStateException}.
     *
     * @param safeCharsSet
     *            safe characters for this encoder
     */
    public UEncoder(SafeCharsSet safeCharsSet) {
        this.safeChars = safeCharsSet.getSafeChars();
        readOnlySafeChars = true;
    }

    /**
     * Mark an additional character as safe, i.e. one that is copied to the
     * output unchanged instead of being percent-encoded.
     *
     * @param c the character to add to the safe set
     * @throws IllegalStateException if this encoder was created with a
     *                               {@link SafeCharsSet}
     */
    public void addSafeCharacter(char c) {
        if (readOnlySafeChars) {
            throw new IllegalStateException("UEncoders safeChararacters are read only");
        }
        safeChars.set(c);
    }

    /**
     * URL encode part of a string. Safe characters are copied as-is; every
     * other character is converted to bytes using the fixed encoding of this
     * class and each resulting byte is written as {@code %xx}.
     *
     * @param s     string to be encoded
     * @param start the beginning index, inclusive
     * @param end   the ending index, exclusive
     * @return the encoded characters. This is the encoder's internal buffer:
     *         it is recycled and overwritten by the next call to this method,
     *         so callers must consume or copy it before encoding again.
     * @throws IOException If an I/O error occurs
     */
    public CharChunk encodeURL(String s, int start, int end)
            throws IOException {
        if (c2b == null) {
            bb = new ByteChunk(8); // small enough.
            cb = new CharChunk(2); // small enough.
            output = new CharChunk(64); // small enough.
            c2b = new C2BConverter(ENCODING);
        } else {
            bb.recycle();
            cb.recycle();
            output.recycle();
        }

        for (int i = start; i < end; i++) {
            char c = s.charAt(i);
            if (safeChars.get(c)) {
                output.append(c);
            } else {
                cb.append(c);
                c2b.convert(cb, bb);

                // A high surrogate followed by a low surrogate forms a single
                // code point, so feed the second char to the converter as well
                // and skip over it in the input.
                if (Character.isHighSurrogate(c) && (i + 1) < end) {
                    char d = s.charAt(i + 1);
                    if (Character.isLowSurrogate(d)) {
                        cb.append(d);
                        c2b.convert(cb, bb);
                        i++;
                    }
                }

                urlEncode(output, bb);
                cb.recycle();
                bb.recycle();
            }
        }

        return output;
    }

    /**
     * Append every byte currently held in {@code bb} to {@code out} as a
     * percent sign followed by two hexadecimal digits.
     *
     * @param out destination for the encoded characters
     * @param bb  bytes to encode, read from its start to its end position
     * @throws IOException If an I/O error occurs
     */
    protected void urlEncode(CharChunk out, ByteChunk bb)
            throws IOException {
        byte[] bytes = bb.getBuffer();
        int limit = bb.getEnd();
        for (int j = bb.getStart(); j < limit; j++) {
            int value = bytes[j];
            out.append('%');
            // High nibble first, then low nibble.
            out.append(Character.forDigit((value >> 4) & 0xF, 16));
            out.append(Character.forDigit(value & 0xF, 16));
        }
    }

    // -------------------- Internal implementation --------------------

    /**
     * Build a new bit set containing the characters that are safe by default:
     * ASCII letters, ASCII digits and a small set of punctuation.
     *
     * @return a fresh, independently modifiable set
     */
    private static BitSet initialSafeChars() {
        BitSet initialSafeChars = new BitSet(128);

        // The upper bound of BitSet.set(from, to) is exclusive, hence the + 1.
        initialSafeChars.set('a', 'z' + 1);
        initialSafeChars.set('A', 'Z' + 1);
        initialSafeChars.set('0', '9' + 1);

        // safe
        initialSafeChars.set('$');
        initialSafeChars.set('-');
        initialSafeChars.set('_');
        initialSafeChars.set('.');

        // '+' is deliberately left out. It is dangerous: someone may treat it
        // as " ". RFC1738 does allow it, it's not reserved.

        // extra
        initialSafeChars.set('!');
        initialSafeChars.set('*');
        initialSafeChars.set('\'');
        initialSafeChars.set('(');
        initialSafeChars.set(')');
        initialSafeChars.set(',');

        return initialSafeChars;
    }
}