// blob: b480a5178c290f7b75adfb874a1ff01b913d652a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.expr.fn.impl;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
import io.netty.buffer.DrillBuf;
/**
 * Adapts the UTF-8 encoded bytes held in a Drill {@link DrillBuf} to the
 * {@link CharSequence} interface, so that APIs consuming a CharSequence can be
 * used without first creating String objects.<br>
 * The wrapper is meant to be re-used: allocate it once, then call
 * {@link #setBuffer(int, int, DrillBuf)} for every value to adapt.<br>
 * Two modes are used internally:
 * <ul>
 * <li>if every byte of the value is US-ASCII, chars are read straight from the
 * DrillBuf (one byte is one char) and nothing is copied;</li>
 * <li>otherwise the bytes are decoded as UTF-8 into an internal
 * {@link CharBuffer} that is kept and re-used across calls.</li>
 * </ul>
 * Instances hold mutable state and perform no synchronization.
 */
public class CharSequenceWrapper implements CharSequence {

  // Initial capacity (in chars) of the buffer used for decoded, non ASCII, values
  private static final int INITIAL_CHAR_BUF = 1024;

  // The adapted drill buffer (only read in the US-ASCII case)
  private DrillBuf buffer;

  // The decoded chars in the non ASCII case; position 0, limit = decoded length
  private CharBuffer charBuffer;

  // The decoder used in the non ASCII case
  private CharsetDecoder decoder;

  // US-ASCII: start offset into the drill buffer. Non ASCII: always 0.
  private int start;

  // US-ASCII: end offset (exclusive) into the drill buffer. Non ASCII: decoded length.
  private int end;

  // Indicates that the current value contains only US-ASCII bytes
  private boolean usAscii;

  /**
   * Creates an empty wrapper; call {@link #setBuffer(int, int, DrillBuf)}
   * before reading chars from it.
   */
  public CharSequenceWrapper() {
  }

  /**
   * Creates a wrapper already adapted to the given byte range.
   *
   * @param start offset of the first byte of the value in {@code buffer}
   * @param end offset one past the last byte of the value in {@code buffer}
   * @param buffer the buffer holding the UTF-8 encoded value
   */
  public CharSequenceWrapper(int start, int end, DrillBuf buffer) {
    setBuffer(start, end, buffer);
  }

  /**
   * @return the number of chars of the current value: the number of bytes for
   *         a US-ASCII value, the number of decoded chars otherwise
   */
  @Override
  public int length() {
    return end - start;
  }

  /**
   * Returns the char at the given index of the current value.
   *
   * @param index zero-based index relative to the start of the value
   * @return the char at that index
   * @throws IndexOutOfBoundsException if {@code index} is negative or not less
   *         than {@link #length()}
   */
  @Override
  public char charAt(int index) {
    // Without this check the ASCII path would silently read bytes outside of
    // the value, and a wrapper that was never set would fail on a null buffer.
    if (index < 0 || index >= end - start) {
      throw new IndexOutOfBoundsException("index " + index + ", length " + (end - start));
    }
    if (usAscii) {
      // Each byte is a char, the index is relative to the start of the value
      return (char) (buffer.getByte(start + index) & 0x00FF);
    }
    // The char buffer is a decoded copy so the index directly corresponds
    return charBuffer.charAt(index);
  }

  /**
   * Returns the chars in the range {@code [start, end)} of the current value.
   * Both indexes are relative to this sequence, as required by
   * {@link CharSequence#subSequence(int, int)}.<br>
   * For a US-ASCII value the result is a new wrapper viewing the same
   * DrillBuf, so nothing is copied. For a decoded value the result is a String
   * copy of the requested chars, which stays valid when this wrapper is
   * re-used for another value.<br>
   * Per the original note on this method, the Java regex
   * {@link java.util.regex.Matcher} only calls subSequence when capturing
   * groups, so this is not expected to be on the hot path.
   *
   * @param start index of the first char to include
   * @param end index one past the last char to include
   * @return the requested sub sequence
   * @throws IndexOutOfBoundsException if {@code start} or {@code end} is
   *         negative, {@code end} is greater than {@link #length()} or
   *         {@code start} is greater than {@code end}
   */
  @Override
  public CharSequence subSequence(int start, int end) {
    final int length = length();
    if (start < 0 || end > length || start > end) {
      throw new IndexOutOfBoundsException(
          "start " + start + ", end " + end + ", length " + length);
    }
    if (start == end) {
      // Also covers a wrapper on which setBuffer was never called
      return "";
    }
    if (usAscii) {
      // setBuffer expects absolute offsets into the drill buffer, whereas the
      // arguments are relative to this sequence: shift them by our own start.
      return new CharSequenceWrapper(this.start + start, this.start + end, buffer);
    }
    // The char buffer sits at position 0 with its limit at the decoded length,
    // so the indexes map directly. Copy the chars out because the char buffer
    // is overwritten by the next call to setBuffer.
    return charBuffer.subSequence(start, end).toString();
  }

  /**
   * Set the DrillBuf to adapt to a CharSequence. This method can be used to
   * replace any previous DrillBuf, thus re-using the CharSequenceWrapper
   * object instead of recreating it.
   *
   * @param start offset of the first byte of the value in {@code buffer}
   * @param end offset one past the last byte of the value in {@code buffer}
   * @param buffer the buffer holding the UTF-8 encoded value
   * @throws RuntimeException if the bytes are not valid UTF-8 (the cause is
   *         the {@link CharacterCodingException}); the wrapper is then left
   *         as an empty sequence
   * @throws IllegalStateException if the decoded value cannot fit in the
   *         largest possible char buffer
   */
  public void setBuffer(int start, int end, DrillBuf buffer) {
    // Test if buffer is an ASCII string or not.
    usAscii = isAscii(start, end, buffer);
    if (usAscii) {
      // each byte equals one char
      this.start = start;
      this.end = end;
      this.buffer = buffer;
      return;
    }

    // Present an empty sequence until decoding succeeded, so that a decoding
    // failure cannot expose stale offsets paired with a half written buffer.
    this.start = 0;
    this.end = 0;

    initCharBuffer();
    // Wrap with java byte buffer
    ByteBuffer byteBuf = buffer.nioBuffer(start, end - start);
    // Remember the start of the input: it is reprocessed after each growth
    byteBuf.mark();
    while (!decodeUT8(byteBuf)) {
      // Failed to convert because the char buffer was not large enough
      if (charBuffer.capacity() == Integer.MAX_VALUE) {
        throw new IllegalStateException(
            "Value is too large to be decoded into a char buffer");
      }
      growCharBuffer();
      byteBuf.reset();
    }
    // Position back to 0 and limit at the number of decoded chars, so the
    // indexes are relative to the start of the char buffer.
    charBuffer.flip();
    this.end = charBuffer.limit();
  }

  /**
   * Test if the given byte range contains only US-ASCII bytes.
   *
   * @param start offset of the first byte to test
   * @param end offset one past the last byte to test
   * @param buffer the buffer to read from
   * @return true if no byte in the range has its high bit set (an empty range
   *         is considered ASCII)
   */
  private boolean isAscii(int start, int end, DrillBuf buffer) {
    for (int i = start; i < end; i++) {
      // Bytes are signed: anything above 0x7F shows up as a negative value
      if (buffer.getByte(i) < 0) {
        return false;
      }
    }
    return true;
  }

  /**
   * Initialize the char buffer and decoder if they are not yet initialized.
   */
  private void initCharBuffer() {
    if (charBuffer == null) {
      charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF);
    }
    if (decoder == null) {
      decoder = StandardCharsets.UTF_8.newDecoder();
    }
  }

  /**
   * Decode the whole byte buffer as UTF-8 into the char buffer, writing from
   * the start of the char buffer.
   *
   * @param byteBuf the bytes to decode
   * @return false if it failed because the char buffer was not big enough
   * @throws RuntimeException if it fails for encoding errors
   */
  private boolean decodeUT8(ByteBuffer byteBuf) {
    decoder.reset();
    // Position 0 and limit back at capacity: a previous value may have left
    // the limit at its own decoded length.
    charBuffer.clear();
    // All of the input is supplied in this single call, hence endOfInput = true
    CoderResult result = decoder.decode(byteBuf, charBuffer, true);
    if (result.isOverflow()) {
      // Not enough space in the charBuffer.
      return false;
    }
    if (result.isError()) {
      // Malformed or unmappable input
      try {
        result.throwException();
      } catch (CharacterCodingException e) {
        throw new RuntimeException(e);
      }
    }
    return true;
  }

  /**
   * Replace the char buffer by a larger, empty one, making sure not to
   * overflow the int capacity. The capacity grows by 50% of the current size.
   */
  private void growCharBuffer() {
    // overflow-conscious code
    int oldCapacity = charBuffer.capacity();
    int newCapacity = oldCapacity + (oldCapacity >> 1);
    if (newCapacity < 0) {
      newCapacity = Integer.MAX_VALUE;
    }
    charBuffer = CharBuffer.allocate(newCapacity);
  }

  /**
   * Builds a String holding the chars of the current value. Per the original
   * note on this method, the regexp_replace function is implemented in a way
   * that avoids calling toString(), so as not to uselessly create a string
   * object.
   *
   * @return the current value as a String
   */
  @Override
  public String toString() {
    final int length = length();
    // length can be negative if the wrapper was set with end < start
    StringBuilder sb = new StringBuilder(Math.max(length, 0));
    for (int i = 0; i < length; i++) {
      sb.append(charAt(i));
    }
    return sb.toString();
  }
}