| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tomcat.util.buf; |
| |
| import java.io.IOException; |
| import java.util.BitSet; |
| |
| /** |
| * Efficient implementation of an UTF-8 encoder. |
| * This class is not thread safe - you need one encoder per thread. |
| * The encoder will save and recycle the internal objects, avoiding |
| * garbage. |
| * |
| * You can add extra characters that you want preserved, for example |
| * while encoding a URL you can add "/". |
| * |
| * @author Costin Manolache |
| */ |
| public final class UEncoder { |
| |
| public enum SafeCharsSet { |
| WITH_SLASH("/"), DEFAULT(""); |
| private final BitSet safeChars; |
| |
| private BitSet getSafeChars() { |
| return this.safeChars; |
| } |
| |
| private SafeCharsSet(String additionalSafeChars) { |
| safeChars = initialSafeChars(); |
| for (char c : additionalSafeChars.toCharArray()) { |
| safeChars.set(c); |
| } |
| } |
| } |
| |
| // Not static - the set may differ ( it's better than adding |
| // an extra check for "/", "+", etc |
| private BitSet safeChars=null; |
| private C2BConverter c2b=null; |
| private ByteChunk bb=null; |
| private CharChunk cb=null; |
| private CharChunk output=null; |
| private final boolean readOnlySafeChars; |
| |
| private final String ENCODING = "UTF8"; |
| |
| public UEncoder() { |
| this.safeChars = initialSafeChars(); |
| readOnlySafeChars = false; |
| } |
| |
| /** |
| * Create a UEncoder with an unmodifiable safe character set. |
| * <p> |
| * Calls to {@link UEncoder#addSafeCharacter(char) addSafeCharacter(char)} |
| * on instances created by this constructor will throw an |
| * {@link IllegalStateException}. |
| * |
| * @param safeCharsSet |
| * safe characters for this encoder |
| */ |
| public UEncoder(SafeCharsSet safeCharsSet) { |
| this.safeChars = safeCharsSet.getSafeChars(); |
| readOnlySafeChars = true; |
| } |
| |
| public void addSafeCharacter( char c ) { |
| if (readOnlySafeChars) { |
| throw new IllegalStateException("UEncoders safeChararacters are read only"); |
| } |
| safeChars.set( c ); |
| } |
| |
| |
| /** |
| * URL Encode string, using a specified encoding. |
| * |
| * @param s string to be encoded |
| * @param start the beginning index, inclusive |
| * @param end the ending index, exclusive |
| * @throws IOException If an I/O error occurs |
| */ |
| public CharChunk encodeURL(String s, int start, int end) |
| throws IOException { |
| if (c2b == null) { |
| bb = new ByteChunk(8); // small enough. |
| cb = new CharChunk(2); // small enough. |
| output = new CharChunk(64); // small enough. |
| c2b = new C2BConverter(ENCODING); |
| } else { |
| bb.recycle(); |
| cb.recycle(); |
| output.recycle(); |
| } |
| |
| for (int i = start; i < end; i++) { |
| char c = s.charAt(i); |
| if (safeChars.get(c)) { |
| output.append(c); |
| } else { |
| cb.append(c); |
| c2b.convert(cb, bb); |
| |
| // "surrogate" - UTF is _not_ 16 bit, but 21 !!!! |
| // ( while UCS is 31 ). Amazing... |
| if (c >= 0xD800 && c <= 0xDBFF) { |
| if ((i+1) < end) { |
| char d = s.charAt(i+1); |
| if (d >= 0xDC00 && d <= 0xDFFF) { |
| cb.append(d); |
| c2b.convert(cb, bb); |
| i++; |
| } |
| } |
| } |
| |
| urlEncode(output, bb); |
| cb.recycle(); |
| bb.recycle(); |
| } |
| } |
| |
| return output; |
| } |
| |
| protected void urlEncode(CharChunk out, ByteChunk bb) |
| throws IOException { |
| byte[] bytes = bb.getBuffer(); |
| for (int j = bb.getStart(); j < bb.getEnd(); j++) { |
| out.append('%'); |
| char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16); |
| out.append(ch); |
| ch = Character.forDigit(bytes[j] & 0xF, 16); |
| out.append(ch); |
| } |
| } |
| |
| // -------------------- Internal implementation -------------------- |
| |
| private static BitSet initialSafeChars() { |
| BitSet initialSafeChars=new BitSet(128); |
| int i; |
| for (i = 'a'; i <= 'z'; i++) { |
| initialSafeChars.set(i); |
| } |
| for (i = 'A'; i <= 'Z'; i++) { |
| initialSafeChars.set(i); |
| } |
| for (i = '0'; i <= '9'; i++) { |
| initialSafeChars.set(i); |
| } |
| //safe |
| initialSafeChars.set('$'); |
| initialSafeChars.set('-'); |
| initialSafeChars.set('_'); |
| initialSafeChars.set('.'); |
| |
| // Dangerous: someone may treat this as " " |
| // RFC1738 does allow it, it's not reserved |
| // initialSafeChars.set('+'); |
| //extra |
| initialSafeChars.set('!'); |
| initialSafeChars.set('*'); |
| initialSafeChars.set('\''); |
| initialSafeChars.set('('); |
| initialSafeChars.set(')'); |
| initialSafeChars.set(','); |
| return initialSafeChars; |
| } |
| } |