exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.expr.fn.impl;

 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.StandardCharsets;

 import io.netty.buffer.DrillBuf;

 /**
  * A CharSequence is a readable sequence of char values. This interface provides
  * uniform, read-only access to many different kinds of char sequences. A char
  * value represents a character in the Basic Multilingual Plane (BMP) or a
  * surrogate. Refer to Unicode Character Representation for details.<br>
  * Specifically, this implementation of CharSequence adapts a Drill
  * {@link DrillBuf} to the CharSequence interface. The wrapper is meant to be
  * re-used: it is allocated once and then handed each DrillBuf to adapt. This
  * is handy for exploiting APIs that consume a CharSequence, avoiding the need
  * to create string objects.
  *
  */
 public class CharSequenceWrapper implements CharSequence {

   // Initial capacity, in chars, of the buffer holding decoded non-ASCII text
   private static final int INITIAL_CHAR_BUF = 1024;

   // The most recently supplied drill buffer; charAt() reads it directly only in the US-ASCII case
   private DrillBuf buffer;
   // The decoded chars in the non-ASCII case; allocated lazily and re-used between calls
   private CharBuffer charBuffer;
   // The decoder used in the non-ASCII case; created lazily and re-used between calls
   private CharsetDecoder decoder;

   // US-ASCII: absolute start offset into the drill buffer. Non-ASCII: always 0.
   private int start;
   // US-ASCII: absolute end offset (exclusive) into the drill buffer.
   // Non-ASCII: number of chars decoded into charBuffer.
   private int end;
   // Indicates that the current byte range contains only ASCII bytes
   private boolean usAscii;

   /**
    * Creates an empty wrapper. Call {@link #setBuffer(int, int, DrillBuf)}
    * before reading characters from it.
    */
   public CharSequenceWrapper() {
   }

   /**
    * Creates a wrapper adapting the bytes {@code [start, end)} of the given buffer.
    *
    * @param start offset of the first byte to adapt
    * @param end offset just past the last byte to adapt
    * @param buffer the buffer holding the bytes
    */
   public CharSequenceWrapper(int start, int end, DrillBuf buffer) {
     setBuffer(start, end, buffer);
   }

   /**
    * @return the number of chars in the sequence: the byte count in the
    *         US-ASCII case, the number of decoded chars otherwise
    */
   @Override
   public int length() {
     return end - start;
   }

   /**
    * Returns the char at the given index. No bounds check against
    * {@link #length()} is performed here; the index is passed straight to the
    * underlying buffer.
    *
    * @param index index of the char, relative to the start of the sequence
    * @return the char at that index
    */
   @Override
   public char charAt(int index) {
     if (usAscii) {
       // Each byte is a char, the index is relative to the start of the adapted range
       return (char) (buffer.getByte(start + index) & 0x00FF);
     }
     // The char buffer is a decoded copy positioned at 0, so the index directly corresponds
     return charBuffer.charAt(index);
   }

   /**
    * Returns the chars {@code [start, end)} of this sequence; both indices are
    * relative to the start of this sequence, as required by {@link CharSequence}.
    * <br>
    * In the US-ASCII case the result is a new CharSequenceWrapper over the same
    * DrillBuf. Otherwise the result is a String copy of the decoded chars, so
    * it stays valid when this wrapper is later re-used with another buffer.
    *
    * @param start index of the first char, inclusive
    * @param end index of the last char, exclusive
    * @return the requested sub-sequence
    * @throws IndexOutOfBoundsException if start or end is negative, end is
    *         greater than length(), or start is greater than end
    */
   @Override
   public CharSequence subSequence(int start, int end) {
     int len = length();
     if (start < 0 || end > len || start > end) {
       throw new IndexOutOfBoundsException("start=" + start + ", end=" + end + ", length=" + len);
     }
     if (usAscii) {
       // Translate the sequence-relative indices into absolute buffer offsets
       return new CharSequenceWrapper(this.start + start, this.start + end, buffer);
     }
     if (start == end) {
       // Also covers a wrapper that never decoded anything (charBuffer still null)
       return "";
     }
     return charBuffer.subSequence(start, end).toString();
   }

   /**
    * Set the DrillBuf to adapt to a CharSequence. This method can be used to
    * replace any previous DrillBuf, re-using this CharSequenceWrapper object
    * instead of creating a new one.
    * <br>
    * If every byte in the range is ASCII the buffer is read in place; otherwise
    * the range is decoded as UTF-8 into an internal char buffer.
    *
    * @param start offset of the first byte to adapt
    * @param end offset just past the last byte to adapt
    * @param buffer the buffer holding the bytes
    * @throws RuntimeException if the bytes are not ASCII and cannot be decoded as UTF-8
    */
   public void setBuffer(int start, int end, DrillBuf buffer) {
     // Always track the latest buffer so no stale reference is kept around
     this.buffer = buffer;
     // Test if the range is an ASCII string or not.
     usAscii = isAscii(start, end, buffer);

     if (usAscii) {
       // each byte equals one char
       this.start = start;
       this.end = end;
       return;
     }

     // Present an empty sequence until decoding has succeeded, so that a
     // decoding failure does not leave bounds from a previous buffer in place.
     this.start = 0;
     this.end = 0;

     initCharBuffer();
     // Wrap with java byte buffer
     ByteBuffer byteBuf = buffer.nioBuffer(start, end - start);
     // Remember the initial position: the input is re-read after every growth
     byteBuf.mark();
     while (!decodeUtf8(byteBuf)) {
       // Failed to convert because the char buffer was not large enough
       growCharBuffer();
       byteBuf.reset();
     }
     this.end = charBuffer.position();
     // rewind the char buffer so that indexes are relative to the start of the buffer
     charBuffer.rewind();
   }

   /**
    * Test if the buffer range contains only ASCII bytes.
    *
    * @param start offset of the first byte to test
    * @param end offset just past the last byte to test
    * @param buffer the buffer holding the bytes
    * @return true if no byte in the range has its high bit set
    */
   private boolean isAscii(int start, int end, DrillBuf buffer) {
     for (int i = start; i < end; i++) {
       // A negative (signed) byte has the high bit set and is therefore not ASCII
       if (buffer.getByte(i) < 0) {
         return false;
       }
     }
     return true;
   }

   /**
    * Initialize the char buffer and decoder if they are not yet initialized.
    */
   private void initCharBuffer() {
     if (charBuffer == null) {
       charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF);
     }
     if (decoder == null) {
       decoder = StandardCharsets.UTF_8.newDecoder();
     }
   }

   /**
    * Decode the whole byte buffer as UTF-8 into the char buffer, starting at
    * the beginning of the char buffer.
    *
    * @param byteBuf the bytes to decode
    * @return false if decoding failed because the char buffer was not big enough
    * @throws RuntimeException wrapping the CharacterCodingException if the input cannot be decoded
    */
   private boolean decodeUtf8(ByteBuffer byteBuf) {
     decoder.reset();
     charBuffer.rewind();
     // All of the input is supplied in this single call, hence endOfInput = true
     CoderResult result = decoder.decode(byteBuf, charBuffer, true);
     if (result.isOverflow()) {
       // Not enough space in the charBuffer.
       return false;
     }
     if (result.isError()) {
       // Any other error
       try {
         result.throwException();
       } catch (CharacterCodingException e) {
         throw new RuntimeException("Unable to decode buffer content as UTF-8", e);
       }
     }
     return true;
   }

   /**
    * Replace the char buffer with a larger one, adding 50% to the current
    * capacity and capping at Integer.MAX_VALUE if the addition overflows.
    * The content of the old buffer is not copied.
    *
    * @throws IllegalStateException if the capacity cannot be increased any further
    */
   private void growCharBuffer() {
     int oldCapacity = charBuffer.capacity();
     if (oldCapacity == Integer.MAX_VALUE) {
       // Nothing left to grow into; fail loudly rather than retrying forever
       throw new IllegalStateException("Char buffer cannot grow beyond " + Integer.MAX_VALUE + " chars");
     }
     // overflow-conscious code
     int newCapacity = oldCapacity + (oldCapacity >> 1);
     if (newCapacity < 0) {
       newCapacity = Integer.MAX_VALUE;
     }
     charBuffer = CharBuffer.allocate(newCapacity);
   }

   /**
    * Builds a String holding the chars of this sequence. This allocates a new
    * String on every call, which is what this wrapper otherwise tries to avoid.
    *
    * @return the chars of this sequence as a String
    */
   @Override
   public String toString() {
     int len = length();
     if (len <= 0) {
       return "";
     }
     if (!usAscii) {
       // The decoded chars start at position 0 of the char buffer
       return charBuffer.subSequence(0, len).toString();
     }
     StringBuilder sb = new StringBuilder(len);
     for (int i = 0; i < len; i++) {
       sb.append(charAt(i));
     }
     return sb.toString();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.expr.fn.impl;

	import java.nio.ByteBuffer;
	import java.nio.CharBuffer;
	import java.nio.charset.CharacterCodingException;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CoderResult;
	import java.nio.charset.StandardCharsets;

	import io.netty.buffer.DrillBuf;

	/**
	* A CharSequence is a readable sequence of char values. This interface provides
	* uniform, read-only access to many different kinds of char sequences. A char
	* value represents a character in the Basic Multilingual Plane (BMP) or a
	* surrogate. Refer to Unicode Character Representation for details.<br>
	* Specifically, this implementation of CharSequence adapts a Drill
	* {@link DrillBuf} to the CharSequence interface. The wrapper is meant to be
	* re-used: it is allocated once and then handed each DrillBuf to adapt. This
	* is handy for exploiting APIs that consume a CharSequence, avoiding the need
	* to create string objects.
	*
	*/
	public class CharSequenceWrapper implements CharSequence {

	  // Initial capacity, in chars, of the buffer holding decoded non-ASCII text
	  private static final int INITIAL_CHAR_BUF = 1024;

	  // The most recently supplied drill buffer; charAt() reads it directly only in the US-ASCII case
	  private DrillBuf buffer;
	  // The decoded chars in the non-ASCII case; allocated lazily and re-used between calls
	  private CharBuffer charBuffer;
	  // The decoder used in the non-ASCII case; created lazily and re-used between calls
	  private CharsetDecoder decoder;

	  // US-ASCII: absolute start offset into the drill buffer. Non-ASCII: always 0.
	  private int start;
	  // US-ASCII: absolute end offset (exclusive) into the drill buffer.
	  // Non-ASCII: number of chars decoded into charBuffer.
	  private int end;
	  // Indicates that the current byte range contains only ASCII bytes
	  private boolean usAscii;

	  /**
	   * Creates an empty wrapper. Call {@link #setBuffer(int, int, DrillBuf)}
	   * before reading characters from it.
	   */
	  public CharSequenceWrapper() {
	  }

	  /**
	   * Creates a wrapper adapting the bytes {@code [start, end)} of the given buffer.
	   *
	   * @param start offset of the first byte to adapt
	   * @param end offset just past the last byte to adapt
	   * @param buffer the buffer holding the bytes
	   */
	  public CharSequenceWrapper(int start, int end, DrillBuf buffer) {
	    setBuffer(start, end, buffer);
	  }

	  /**
	   * @return the number of chars in the sequence: the byte count in the
	   *         US-ASCII case, the number of decoded chars otherwise
	   */
	  @Override
	  public int length() {
	    return end - start;
	  }

	  /**
	   * Returns the char at the given index. No bounds check against
	   * {@link #length()} is performed here; the index is passed straight to the
	   * underlying buffer.
	   *
	   * @param index index of the char, relative to the start of the sequence
	   * @return the char at that index
	   */
	  @Override
	  public char charAt(int index) {
	    if (usAscii) {
	      // Each byte is a char, the index is relative to the start of the adapted range
	      return (char) (buffer.getByte(start + index) & 0x00FF);
	    }
	    // The char buffer is a decoded copy positioned at 0, so the index directly corresponds
	    return charBuffer.charAt(index);
	  }

	  /**
	   * Returns the chars {@code [start, end)} of this sequence; both indices are
	   * relative to the start of this sequence, as required by {@link CharSequence}.
	   * <br>
	   * In the US-ASCII case the result is a new CharSequenceWrapper over the same
	   * DrillBuf. Otherwise the result is a String copy of the decoded chars, so
	   * it stays valid when this wrapper is later re-used with another buffer.
	   *
	   * @param start index of the first char, inclusive
	   * @param end index of the last char, exclusive
	   * @return the requested sub-sequence
	   * @throws IndexOutOfBoundsException if start or end is negative, end is
	   *         greater than length(), or start is greater than end
	   */
	  @Override
	  public CharSequence subSequence(int start, int end) {
	    int len = length();
	    if (start < 0 || end > len || start > end) {
	      throw new IndexOutOfBoundsException("start=" + start + ", end=" + end + ", length=" + len);
	    }
	    if (usAscii) {
	      // Translate the sequence-relative indices into absolute buffer offsets
	      return new CharSequenceWrapper(this.start + start, this.start + end, buffer);
	    }
	    if (start == end) {
	      // Also covers a wrapper that never decoded anything (charBuffer still null)
	      return "";
	    }
	    return charBuffer.subSequence(start, end).toString();
	  }

	  /**
	   * Set the DrillBuf to adapt to a CharSequence. This method can be used to
	   * replace any previous DrillBuf, re-using this CharSequenceWrapper object
	   * instead of creating a new one.
	   * <br>
	   * If every byte in the range is ASCII the buffer is read in place; otherwise
	   * the range is decoded as UTF-8 into an internal char buffer.
	   *
	   * @param start offset of the first byte to adapt
	   * @param end offset just past the last byte to adapt
	   * @param buffer the buffer holding the bytes
	   * @throws RuntimeException if the bytes are not ASCII and cannot be decoded as UTF-8
	   */
	  public void setBuffer(int start, int end, DrillBuf buffer) {
	    // Always track the latest buffer so no stale reference is kept around
	    this.buffer = buffer;
	    // Test if the range is an ASCII string or not.
	    usAscii = isAscii(start, end, buffer);

	    if (usAscii) {
	      // each byte equals one char
	      this.start = start;
	      this.end = end;
	      return;
	    }

	    // Present an empty sequence until decoding has succeeded, so that a
	    // decoding failure does not leave bounds from a previous buffer in place.
	    this.start = 0;
	    this.end = 0;

	    initCharBuffer();
	    // Wrap with java byte buffer
	    ByteBuffer byteBuf = buffer.nioBuffer(start, end - start);
	    // Remember the initial position: the input is re-read after every growth
	    byteBuf.mark();
	    while (!decodeUtf8(byteBuf)) {
	      // Failed to convert because the char buffer was not large enough
	      growCharBuffer();
	      byteBuf.reset();
	    }
	    this.end = charBuffer.position();
	    // rewind the char buffer so that indexes are relative to the start of the buffer
	    charBuffer.rewind();
	  }

	  /**
	   * Test if the buffer range contains only ASCII bytes.
	   *
	   * @param start offset of the first byte to test
	   * @param end offset just past the last byte to test
	   * @param buffer the buffer holding the bytes
	   * @return true if no byte in the range has its high bit set
	   */
	  private boolean isAscii(int start, int end, DrillBuf buffer) {
	    for (int i = start; i < end; i++) {
	      // A negative (signed) byte has the high bit set and is therefore not ASCII
	      if (buffer.getByte(i) < 0) {
	        return false;
	      }
	    }
	    return true;
	  }

	  /**
	   * Initialize the char buffer and decoder if they are not yet initialized.
	   */
	  private void initCharBuffer() {
	    if (charBuffer == null) {
	      charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF);
	    }
	    if (decoder == null) {
	      decoder = StandardCharsets.UTF_8.newDecoder();
	    }
	  }

	  /**
	   * Decode the whole byte buffer as UTF-8 into the char buffer, starting at
	   * the beginning of the char buffer.
	   *
	   * @param byteBuf the bytes to decode
	   * @return false if decoding failed because the char buffer was not big enough
	   * @throws RuntimeException wrapping the CharacterCodingException if the input cannot be decoded
	   */
	  private boolean decodeUtf8(ByteBuffer byteBuf) {
	    decoder.reset();
	    charBuffer.rewind();
	    // All of the input is supplied in this single call, hence endOfInput = true
	    CoderResult result = decoder.decode(byteBuf, charBuffer, true);
	    if (result.isOverflow()) {
	      // Not enough space in the charBuffer.
	      return false;
	    }
	    if (result.isError()) {
	      // Any other error
	      try {
	        result.throwException();
	      } catch (CharacterCodingException e) {
	        throw new RuntimeException("Unable to decode buffer content as UTF-8", e);
	      }
	    }
	    return true;
	  }

	  /**
	   * Replace the char buffer with a larger one, adding 50% to the current
	   * capacity and capping at Integer.MAX_VALUE if the addition overflows.
	   * The content of the old buffer is not copied.
	   *
	   * @throws IllegalStateException if the capacity cannot be increased any further
	   */
	  private void growCharBuffer() {
	    int oldCapacity = charBuffer.capacity();
	    if (oldCapacity == Integer.MAX_VALUE) {
	      // Nothing left to grow into; fail loudly rather than retrying forever
	      throw new IllegalStateException("Char buffer cannot grow beyond " + Integer.MAX_VALUE + " chars");
	    }
	    // overflow-conscious code
	    int newCapacity = oldCapacity + (oldCapacity >> 1);
	    if (newCapacity < 0) {
	      newCapacity = Integer.MAX_VALUE;
	    }
	    charBuffer = CharBuffer.allocate(newCapacity);
	  }

	  /**
	   * Builds a String holding the chars of this sequence. This allocates a new
	   * String on every call, which is what this wrapper otherwise tries to avoid.
	   *
	   * @return the chars of this sequence as a String
	   */
	  @Override
	  public String toString() {
	    int len = length();
	    if (len <= 0) {
	      return "";
	    }
	    if (!usAscii) {
	      // The decoded chars start at position 0 of the char buffer
	      return charBuffer.subSequence(0, len).toString();
	    }
	    StringBuilder sb = new StringBuilder(len);
	    for (int i = 0; i < len; i++) {
	      sb.append(charAt(i));
	    }
	    return sb.toString();
	  }
	}