blob: 11a0569cc00e17286689201cba4514f7294a61a5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tomcat.util.buf;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.tomcat.util.res.StringManager;
/**
* NIO based character decoder.
*/
public class B2CConverter {
private static final StringManager sm =
StringManager.getManager(Constants.Package);
private static final Map<String, Charset> encodingToCharsetCache =
new HashMap<>();
// Protected so unit tests can use it
protected static final int LEFTOVER_SIZE = 9;
static {
for (Charset charset: Charset.availableCharsets().values()) {
encodingToCharsetCache.put(
charset.name().toLowerCase(Locale.ENGLISH), charset);
for (String alias : charset.aliases()) {
encodingToCharsetCache.put(
alias.toLowerCase(Locale.ENGLISH), charset);
}
}
}
public static Charset getCharset(String enc)
throws UnsupportedEncodingException {
// Encoding names should all be ASCII
String lowerCaseEnc = enc.toLowerCase(Locale.ENGLISH);
return getCharsetLower(lowerCaseEnc);
}
/**
* Only to be used when it is known that the encoding name is in lower case.
*/
public static Charset getCharsetLower(String lowerCaseEnc)
throws UnsupportedEncodingException {
Charset charset = encodingToCharsetCache.get(lowerCaseEnc);
if (charset == null) {
// Pre-population of the cache means this must be invalid
throw new UnsupportedEncodingException(
sm.getString("b2cConverter.unknownEncoding", lowerCaseEnc));
}
return charset;
}
private final CharsetDecoder decoder;
private ByteBuffer bb = null;
private CharBuffer cb = null;
/**
* Leftover buffer used for incomplete characters.
*/
private final ByteBuffer leftovers;
public B2CConverter(String encoding) throws IOException {
this(encoding, false);
}
public B2CConverter(String encoding, boolean replaceOnError)
throws IOException {
byte[] left = new byte[LEFTOVER_SIZE];
leftovers = ByteBuffer.wrap(left);
CodingErrorAction action;
if (replaceOnError) {
action = CodingErrorAction.REPLACE;
} else {
action = CodingErrorAction.REPORT;
}
Charset charset = getCharset(encoding);
// Special case. Use the Apache Harmony based UTF-8 decoder because it
// - a) rejects invalid sequences that the JVM decoder does not
// - b) fails faster for some invalid sequences
if (charset.equals(StandardCharsets.UTF_8)) {
decoder = new Utf8Decoder();
} else {
decoder = charset.newDecoder();
}
decoder.onMalformedInput(action);
decoder.onUnmappableCharacter(action);
}
/**
* Reset the decoder state.
*/
public void recycle() {
decoder.reset();
leftovers.position(0);
}
/**
* Convert the given bytes to characters.
*
* @param bc byte input
* @param cc char output
* @param endOfInput Is this all of the available data
*/
public void convert(ByteChunk bc, CharChunk cc, boolean endOfInput)
throws IOException {
if ((bb == null) || (bb.array() != bc.getBuffer())) {
// Create a new byte buffer if anything changed
bb = ByteBuffer.wrap(bc.getBuffer(), bc.getStart(), bc.getLength());
} else {
// Initialize the byte buffer
bb.limit(bc.getEnd());
bb.position(bc.getStart());
}
if ((cb == null) || (cb.array() != cc.getBuffer())) {
// Create a new char buffer if anything changed
cb = CharBuffer.wrap(cc.getBuffer(), cc.getEnd(),
cc.getBuffer().length - cc.getEnd());
} else {
// Initialize the char buffer
cb.limit(cc.getBuffer().length);
cb.position(cc.getEnd());
}
CoderResult result = null;
// Parse leftover if any are present
if (leftovers.position() > 0) {
int pos = cb.position();
// Loop until one char is decoded or there is a decoder error
do {
leftovers.put(bc.substractB());
leftovers.flip();
result = decoder.decode(leftovers, cb, endOfInput);
leftovers.position(leftovers.limit());
leftovers.limit(leftovers.array().length);
} while (result.isUnderflow() && (cb.position() == pos));
if (result.isError() || result.isMalformed()) {
result.throwException();
}
bb.position(bc.getStart());
leftovers.position(0);
}
// Do the decoding and get the results into the byte chunk and the char
// chunk
result = decoder.decode(bb, cb, endOfInput);
if (result.isError() || result.isMalformed()) {
result.throwException();
} else if (result.isOverflow()) {
// Propagate current positions to the byte chunk and char chunk, if
// this continues the char buffer will get resized
bc.setOffset(bb.position());
cc.setEnd(cb.position());
} else if (result.isUnderflow()) {
// Propagate current positions to the byte chunk and char chunk
bc.setOffset(bb.position());
cc.setEnd(cb.position());
// Put leftovers in the leftovers byte buffer
if (bc.getLength() > 0) {
leftovers.limit(leftovers.array().length);
leftovers.position(bc.getLength());
bc.substract(leftovers.array(), 0, bc.getLength());
}
}
}
}