// Source: exec/vector/src/main/java/org/apache/drill/exec/util/Text.java (Apache Drill, Git at Google)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.util;

 import java.io.DataInput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.MalformedInputException;
 import java.nio.charset.StandardCharsets;
 import java.text.CharacterIterator;
 import java.text.StringCharacterIterator;
 import java.util.Arrays;

 import com.fasterxml.jackson.core.JsonGenerationException;
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.SerializerProvider;
 import com.fasterxml.jackson.databind.annotation.JsonSerialize;
 import com.fasterxml.jackson.databind.ser.std.StdSerializer;

 /**
  * A simplified byte wrapper similar to Hadoop's Text class without all the dependencies. Lifted from Hadoop 2.7.1
  */
 @JsonSerialize(using = Text.TextSerializer.class)
 public class Text {

   /**
    * Per-thread UTF-8 encoder, created with REPORT as the action for malformed and unmappable input.
    * {@link #encode(String, boolean)} temporarily reconfigures those actions on the instance it obtains here, so an
    * encoder must never be shared between threads.
    */
   private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
       new ThreadLocal<CharsetEncoder>() {
         @Override
         protected CharsetEncoder initialValue() {
           return StandardCharsets.UTF_8.newEncoder()
               .onMalformedInput(CodingErrorAction.REPORT)
               .onUnmappableCharacter(CodingErrorAction.REPORT);
         }
       };

   /**
    * Per-thread UTF-8 decoder, created with REPORT as the action for malformed and unmappable input. The private
    * {@code decode(ByteBuffer, boolean)} helper temporarily reconfigures those actions on the instance it obtains
    * here, so a decoder must never be shared between threads.
    */
   private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY =
       new ThreadLocal<CharsetDecoder>() {
         @Override
         protected CharsetDecoder initialValue() {
           return StandardCharsets.UTF_8.newDecoder()
               .onMalformedInput(CodingErrorAction.REPORT)
               .onUnmappableCharacter(CodingErrorAction.REPORT);
         }
       };

   /** Shared zero-length array used as the initial backing buffer by the no-argument constructor. */
   private static final byte[] EMPTY_BYTES = new byte[0];

   /** Backing buffer; it may be longer than the data, only the first {@code length} bytes are valid. */
   private byte[] bytes;
   /** Number of valid bytes at the start of {@code bytes}. */
   private int length;

   /** Creates an empty text backed by the shared zero-length array. */
   public Text() {
     this.bytes = EMPTY_BYTES;
   }

   /**
    * Creates a text holding the UTF-8 encoding of the given string.
    */
   public Text(String string) {
     this.set(string);
   }

   /** Creates a text holding a copy of the valid bytes of another text. */
   public Text(Text utf8) {
     this.set(utf8);
   }

   /**
    * Creates a text holding a copy of the given byte array.
    */
   public Text(byte[] utf8) {
     this.set(utf8);
   }

   /**
    * Returns a freshly allocated array containing exactly the {@link #getLength()} valid bytes. Use
    * {@link #getBytes()} when a copy is not needed.
    */
   public byte[] copyBytes() {
     return Arrays.copyOf(bytes, length);
   }

   /**
    * Returns the backing array itself, without copying. Only the first {@link #getLength()} bytes are valid; call
    * {@link #copyBytes()} when an array of exactly the data length is required.
    */
   public byte[] getBytes() {
     return this.bytes;
   }

   /** Returns the number of valid bytes held by this text. */
   public int getLength() {
     return this.length;
   }

   /**
    * Returns the Unicode Scalar Value (32-bit integer value) for the character at <code>position</code>. Note that this
    * method avoids using the converter or doing String instantiation.
    *
    * @param position
    *          byte offset into the text; valid offsets are <code>0 .. getLength() - 1</code>
    * @return the Unicode scalar value at position or -1 if the position is invalid or points to a trailing byte
    */
   public int charAt(int position) {
     // position == length is one past the last valid byte, so it must be rejected too;
     // the previous "position > length" test let it through to the buffer read below.
     if (position < 0 || position >= this.length) {
       return -1;
     }

     ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes).position(position);
     return bytesToCodePoint(bb.slice());
   }

   /**
    * Searches the whole text for <code>what</code>; shorthand for {@link #find(String, int)} with a start of 0.
    */
   public int find(String what) {
     final int fromBeginning = 0;
     return find(what, fromBeginning);
   }

   /**
    * Finds the first occurrence of <code>what</code> in the backing buffer, starting at position <code>start</code>.
    * The starting position is measured in bytes and the return value is in terms of byte position in the buffer. The
    * backing buffer is not converted to a string for this operation.
    *
    * @param what
    *          the string to look for; it is encoded to UTF-8 before the search
    * @param start
    *          byte offset at which the search begins
    * @return byte position of the first occurrence of the search string in the UTF-8 buffer, or -1 if it is not found
    *         or <code>start</code> lies outside <code>0 .. getLength()</code>. An empty search string matches
    *         immediately and yields <code>start</code>, mirroring <code>String.indexOf</code>.
    */
   public int find(String what, int start) {
     if (start < 0 || start > this.length) {
       return -1; // nothing to search from here
     }

     final ByteBuffer tgt;
     try {
       tgt = encode(what);
     } catch (CharacterCodingException e) {
       // encode(String) runs in replace mode, so this is not expected; report it like set()/toString() do
       throw new RuntimeException("Should not have happened ", e);
     }

     final int tgtBase = tgt.position();
     final int tgtLen = tgt.remaining();
     if (tgtLen == 0) {
       return start; // empty pattern: previously this underflowed on the first tgt.get()
     }

     // last byte offset at which the whole pattern still fits inside the valid data
     final int lastCandidate = this.length - tgtLen;
     for (int pos = start; pos <= lastCandidate; pos++) {
       int matched = 0;
       while (matched < tgtLen && bytes[pos + matched] == tgt.get(tgtBase + matched)) {
         matched++;
       }
       if (matched == tgtLen) {
         return pos;
       }
     }
     return -1; // not found
   }

   /**
    * Replaces the contents with the UTF-8 encoding of the given string. The buffer returned by the encoder becomes
    * the new backing array.
    */
   public void set(String string) {
     final ByteBuffer encoded;
     try {
       encoded = encode(string, true);
     } catch (CharacterCodingException e) {
       throw new RuntimeException("Should not have happened ", e);
     }
     this.bytes = encoded.array();
     this.length = encoded.limit();
   }

   /**
    * Replaces the contents with a copy of the whole given UTF-8 byte array.
    */
   public void set(byte[] utf8) {
     final int wholeLength = utf8.length;
     set(utf8, 0, wholeLength);
   }

   /** Replaces the contents with a copy of the valid bytes of another text. */
   public void set(Text other) {
     final byte[] source = other.getBytes();
     final int sourceLength = other.getLength();
     set(source, 0, sourceLength);
   }

   /**
    * Replaces the contents with a copy of a range of bytes.
    *
    * @param utf8
    *          the array to copy from
    * @param start
    *          offset in <code>utf8</code> of the first byte to copy
    * @param len
    *          the number of bytes to copy; this becomes the new length
    */
   public void set(byte[] utf8, int start, int len) {
     // old contents are about to be overwritten, so they need not survive a reallocation
     this.setCapacity(len, false);
     System.arraycopy(utf8, start, this.bytes, 0, len);
     length = len;
   }

   /**
    * Appends a range of bytes after the current contents, growing the backing array when needed.
    *
    * @param utf8
    *          the array to copy from
    * @param start
    *          offset in <code>utf8</code> of the first byte to append
    * @param len
    *          the number of bytes to append
    */
   public void append(byte[] utf8, int start, int len) {
     final int grownLength = length + len;
     setCapacity(grownLength, true); // keep what is already stored
     System.arraycopy(utf8, start, bytes, length, len);
     length = grownLength;
   }

   /**
    * Resets the text to empty by setting its length to zero.
    *
    * <em>Note</em>: the backing array returned by {@link #getBytes()} is kept as is. To release it, call
    * {@link #set(byte[])} with an empty array (for example <code>new byte[0]</code>).
    */
   public void clear() {
     this.length = 0;
   }

   /*
    * Ensures the backing array can hold <em>at least</em> <code>len</code> bytes. An array that is already large
    * enough is left untouched. Otherwise a new array is allocated: when data is kept, the old bytes are copied and
    * the new size is the larger of <code>len</code> and twice the current length; when data is discarded, the new
    * size is exactly <code>len</code>.
    *
    * @param len the number of bytes we need
    *
    * @param keepData should the old data be kept
    */
   private void setCapacity(int len, boolean keepData) {
     if (bytes != null && bytes.length >= len) {
       return; // already big enough
     }
     if (bytes == null || !keepData) {
       bytes = new byte[len];
       return;
     }
     final int grown = Math.max(len, length << 1);
     bytes = Arrays.copyOf(bytes, grown);
   }

   /**
    * Decodes the valid bytes of this text as UTF-8 and returns the resulting string.
    *
    * @see java.lang.Object#toString()
    */
   @Override
   public String toString() {
     final String decoded;
     try {
       decoded = decode(bytes, 0, length);
     } catch (CharacterCodingException e) {
       throw new RuntimeException("Should not have happened ", e);
     }
     return decoded;
   }

   /**
    * Fills this text with exactly <code>len</code> bytes read from <code>in</code>. Intended for streams whose
    * serialization format supplies the length separately.
    */
   public void readWithKnownLength(DataInput in, int len) throws IOException {
     // previous contents are replaced, so they are not preserved when the buffer grows
     this.setCapacity(len, false);
     in.readFully(this.bytes, 0, len);
     this.length = len;
   }

   /**
    * Returns true iff <code>o</code> is a Text with the same contents. Only the first {@link #getLength()} bytes of
    * each backing array take part in the comparison; they are compared in place, without copying either array.
    */
   @Override
   public boolean equals(Object o) {
     if (o == this) {
       return true;
     }
     if (!(o instanceof Text)) {
       return false;
     }

     final Text that = (Text) o;
     final int len = this.getLength();
     if (len != that.getLength()) {
       return false;
     }

     final byte[] thisBytes = this.getBytes();
     final byte[] thatBytes = that.getBytes();
     for (int i = 0; i < len; i++) {
       if (thisBytes[i] != thatBytes[i]) {
         return false;
       }
     }
     return true;
   }

   /**
    * Hashes only the first {@link #getLength()} bytes, so two texts that are equal always produce the same hash even
    * when their backing arrays differ in capacity or in the stale bytes past the valid data. The value equals
    * <code>Arrays.hashCode(copyBytes())</code>.
    */
   @Override
   public int hashCode() {
     final byte[] data = getBytes();
     final int len = getLength();
     int result = 1;
     for (int i = 0; i < len; i++) {
       result = 31 * result + data[i];
     }
     return result;
   }

   // / STATIC UTILITIES FROM HERE DOWN
   /**
    * Decodes a whole byte array as UTF-8. Malformed input is replaced rather than reported.
    */
   public static String decode(byte[] utf8) throws CharacterCodingException {
     final ByteBuffer wrapped = ByteBuffer.wrap(utf8);
     return decode(wrapped, true);
   }

   /**
    * Decodes <code>length</code> bytes of <code>utf8</code>, beginning at <code>start</code>, as UTF-8. Malformed
    * input is replaced rather than reported.
    */
   public static String decode(byte[] utf8, int start, int length)
       throws CharacterCodingException {
     final ByteBuffer range = ByteBuffer.wrap(utf8, start, length);
     return decode(range, true);
   }

   /**
    * Decodes <code>length</code> bytes of <code>utf8</code>, beginning at <code>start</code>, as UTF-8. When
    * <code>replace</code> is true, malformed input is replaced with the substitution character U+FFFD; when it is
    * false, a MalformedInputException is thrown instead.
    */
   public static String decode(byte[] utf8, int start, int length, boolean replace)
       throws CharacterCodingException {
     final ByteBuffer range = ByteBuffer.wrap(utf8, start, length);
     return decode(range, replace);
   }

   /**
    * Decodes the remaining bytes of <code>utf8</code> with this thread's decoder.
    *
    * @param utf8
    *          buffer holding the bytes to decode
    * @param replace
    *          when true the decoder is switched to REPLACE for the duration of the call, otherwise it stays in its
    *          default REPORT mode and bad input raises an exception
    * @return the decoded string
    * @throws CharacterCodingException
    *           if the input is malformed and <code>replace</code> is false
    */
   private static String decode(ByteBuffer utf8, boolean replace)
       throws CharacterCodingException {
     CharsetDecoder decoder = DECODER_FACTORY.get();
     if (replace) {
       decoder.onMalformedInput(CodingErrorAction.REPLACE);
       decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
     }
     try {
       return decoder.decode(utf8).toString();
     } finally {
       // The decoder is reused by later calls on this thread, so it must go back to REPORT
       // even when decoding fails; otherwise a later strict decode would silently replace.
       if (replace) {
         decoder.onMalformedInput(CodingErrorAction.REPORT);
         decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
       }
     }
   }

   /**
    * Encodes a string as UTF-8 in replace mode, so characters that cannot be encoded are substituted instead of
    * reported.
    *
    * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit()
    */
   public static ByteBuffer encode(String string)
       throws CharacterCodingException {
     final boolean replaceInvalid = true;
     return encode(string, replaceInvalid);
   }

   /**
    * Converts the provided String to bytes using the UTF-8 encoding. If <code>replace</code> is true, input that
    * cannot be encoded (for example an unpaired surrogate) is substituted with the encoder's replacement bytes;
    * otherwise a CharacterCodingException such as MalformedInputException is thrown.
    * NOTE(review): for the JDK's UTF-8 encoder the replacement is normally a single <code>'?'</code> byte rather than
    * the encoding of U+FFFD - confirm before relying on either.
    *
    * @param string
    *          the text to encode
    * @param replace
    *          whether invalid input is replaced (true) or reported (false)
    * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit()
    * @throws CharacterCodingException
    *           if the input cannot be encoded and <code>replace</code> is false
    */
   public static ByteBuffer encode(String string, boolean replace)
       throws CharacterCodingException {
     CharsetEncoder encoder = ENCODER_FACTORY.get();
     if (replace) {
       encoder.onMalformedInput(CodingErrorAction.REPLACE);
       encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
     }
     try {
       return encoder.encode(CharBuffer.wrap(string.toCharArray()));
     } finally {
       // The encoder is reused by later calls on this thread; restore REPORT even if encoding fails.
       if (replace) {
         encoder.onMalformedInput(CodingErrorAction.REPORT);
         encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
       }
     }
   }

   /** 1 MiB expressed in bytes. */
   public static final int DEFAULT_MAX_LEN = 1024 * 1024;

   // States of the validateUTF8 state machine.

   /** Expecting the first byte of a character. */
   private static final int LEAD_BYTE = 0;

   /** Expecting the first trail byte, which has extra range restrictions for some lead bytes. */
   private static final int TRAIL_BYTE_1 = 1;

   /** Expecting any later trail byte. */
   private static final int TRAIL_BYTE = 2;

   /**
    * Validates that an entire byte array is well-formed UTF-8.
    *
    * @param utf8
    *          byte array
    * @throws MalformedInputException
    *           if the byte array contains invalid utf-8
    */
   public static void validateUTF8(byte[] utf8) throws MalformedInputException {
     final int wholeLength = utf8.length;
     validateUTF8(utf8, 0, wholeLength);
   }

   /**
    * Check to see if a byte array is valid utf-8. The range is walked with a small state machine: a lead byte
    * determines how many trail bytes must follow, the first trail byte is range-checked against its lead byte, and
    * every trail byte must lie in 0x80..0xBF. A sequence that is cut off by the end of the range is rejected as well.
    *
    * @param utf8
    *          the array of bytes
    * @param start
    *          the offset of the first byte in the array
    * @param len
    *          the length of the byte sequence
    * @throws MalformedInputException
    *           if the byte array contains invalid bytes or ends in the middle of a multi-byte sequence
    */
   public static void validateUTF8(byte[] utf8, int start, int len)
       throws MalformedInputException {
     int count = start;
     int leadByte = 0;
     int length = 0; // trail bytes still expected for the current character
     int state = LEAD_BYTE;
     while (count < start + len) {
       int aByte = utf8[count] & 0xFF;

       switch (state) {
       case LEAD_BYTE:
         leadByte = aByte;
         length = bytesFromUTF8[aByte];

         switch (length) {
         case 0: // check for ASCII
           if (leadByte > 0x7F) {
             throw new MalformedInputException(count);
           }
           break;
         case 1:
           // 0xC0 and 0xC1 are excluded by the lower bound
           if (leadByte < 0xC2 || leadByte > 0xDF) {
             throw new MalformedInputException(count);
           }
           state = TRAIL_BYTE_1;
           break;
         case 2:
           if (leadByte < 0xE0 || leadByte > 0xEF) {
             throw new MalformedInputException(count);
           }
           state = TRAIL_BYTE_1;
           break;
         case 3:
           if (leadByte < 0xF0 || leadByte > 0xF4) {
             throw new MalformedInputException(count);
           }
           state = TRAIL_BYTE_1;
           break;
         default:
           // too long! Longest valid UTF-8 is 4 bytes (lead + three)
           // or if < 0 we got a trail byte in the lead byte position
           throw new MalformedInputException(count);
         } // switch (length)
         break;

       case TRAIL_BYTE_1:
         // lead-byte-specific limits on the first trail byte
         if (leadByte == 0xF0 && aByte < 0x90) {
           throw new MalformedInputException(count);
         }
         if (leadByte == 0xF4 && aByte > 0x8F) {
           throw new MalformedInputException(count);
         }
         if (leadByte == 0xE0 && aByte < 0xA0) {
           throw new MalformedInputException(count);
         }
         if (leadByte == 0xED && aByte > 0x9F) {
           throw new MalformedInputException(count);
         }
         // falls through to regular trail-byte test!!
       case TRAIL_BYTE:
         if (aByte < 0x80 || aByte > 0xBF) {
           throw new MalformedInputException(count);
         }
         if (--length == 0) {
           state = LEAD_BYTE;
         } else {
           state = TRAIL_BYTE;
         }
         break;
       default:
         break;
       } // switch (state)
       count++;
     }

     // Ending anywhere but at a character boundary means the last character is missing trail bytes.
     if (state != LEAD_BYTE) {
       throw new MalformedInputException(count);
     }
   }

   /**
    * Magic numbers for UTF-8, indexed by the unsigned value (0..255) of a byte. Each entry is the number of bytes that
    * <em>follow</em> that byte when it is a lead byte: 0 for single-byte characters, 1 to 3 for the multi-byte lead
    * bytes. Trailing bytes have the value -1. The values 4 and 5 are present in this table, even though valid UTF-8
    * cannot include the five and six byte sequences.
    */
   static final int[] bytesFromUTF8 =
   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0,
       // trail bytes
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

   /**
    * Reads one code point starting at the buffer's current position and advances the position past it. The buffer's
    * mark is overwritten by this method.
    *
    * @return the code point, or -1 (without consuming anything) when the current byte is a trailing byte
    */
   public static int bytesToCodePoint(ByteBuffer bytes) {
     // peek at the lead byte without consuming it
     bytes.mark();
     final int lead = bytes.get() & 0xFF;
     bytes.reset();

     final int trailing = bytesFromUTF8[lead];
     if (trailing < 0) {
       return -1; // trailing byte!
     }

     // Accumulate the lead byte and all but the last trail byte, shifting by 6 bits after each,
     // then add the final byte unshifted. Counts of 4 and 5 are illegal UTF-8 but still consumed.
     int codePoint = 0;
     for (int remaining = trailing; remaining > 0; remaining--) {
       codePoint += (bytes.get() & 0xFF);
       codePoint <<= 6;
     }
     codePoint += (bytes.get() & 0xFF);

     // remove the accumulated marker bits for this sequence length
     return codePoint - offsetsFromUTF8[trailing];
   }

   static final int offsetsFromUTF8[] =
   { 0x00000000, 0x00003080,
       0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

   /**
    * For the given string, returns the number of UTF-8 bytes required to encode the string.
    *
    * <p>Chars below 0x80 count 1 byte, chars below 0x800 count 2, a high surrogate immediately followed by a low
    * surrogate counts 4 for the pair, and every other char (including an unpaired surrogate and U+FFFF) counts 3.
    *
    * @param string
    *          text to encode
    * @return number of UTF-8 bytes required to encode
    */
   public static int utf8Length(String string) {
     // Index-based scan: a CharacterIterator signals end-of-text with DONE (U+FFFF), which made a
     // string that really contains U+FFFF stop being counted at that character.
     final int chars = string.length();
     int size = 0;
     for (int i = 0; i < chars; i++) {
       final char ch = string.charAt(i);
       if (ch < 0x80) {
         size++;
       } else if (ch < 0x800) {
         size += 2;
       } else if (Character.isHighSurrogate(ch)
           && i + 1 < chars
           && Character.isLowSurrogate(string.charAt(i + 1))) {
         // valid pair: one supplementary code point, skip the low surrogate
         size += 4;
         i++;
       } else {
         // rest of the BMP, or an unpaired surrogate
         size += 3;
       }
     }
     return size;
   }

   /** Jackson serializer that writes a {@link Text} as a JSON string holding its {@link Text#toString()} value. */
   public static class TextSerializer extends StdSerializer<Text> {

     public TextSerializer() {
       super(Text.class);
     }

     /** Writes the decoded text as a single JSON string token. */
     @Override
     public void serialize(Text text, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
         throws IOException, JsonGenerationException {
       final String value = text.toString();
       jsonGenerator.writeString(value);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.util;

	import java.io.DataInput;
	import java.io.IOException;
	import java.nio.ByteBuffer;
	import java.nio.CharBuffer;
	import java.nio.charset.CharacterCodingException;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CharsetEncoder;
	import java.nio.charset.CodingErrorAction;
	import java.nio.charset.MalformedInputException;
	import java.nio.charset.StandardCharsets;
	import java.text.CharacterIterator;
	import java.text.StringCharacterIterator;
	import java.util.Arrays;

	import com.fasterxml.jackson.core.JsonGenerationException;
	import com.fasterxml.jackson.core.JsonGenerator;
	import com.fasterxml.jackson.databind.SerializerProvider;
	import com.fasterxml.jackson.databind.annotation.JsonSerialize;
	import com.fasterxml.jackson.databind.ser.std.StdSerializer;

	/**
	* A simplified byte wrapper similar to Hadoop's Text class without all the dependencies. Lifted from Hadoop 2.7.1
	*/
	@JsonSerialize(using = Text.TextSerializer.class)
	public class Text {

	private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
	new ThreadLocal<CharsetEncoder>() {
	@Override
	protected CharsetEncoder initialValue() {
	return StandardCharsets.UTF_8.newEncoder().
	onMalformedInput(CodingErrorAction.REPORT).
	onUnmappableCharacter(CodingErrorAction.REPORT);
	}
	};

	private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
	new ThreadLocal<CharsetDecoder>() {
	@Override
	protected CharsetDecoder initialValue() {
	return StandardCharsets.UTF_8.newDecoder().
	onMalformedInput(CodingErrorAction.REPORT).
	onUnmappableCharacter(CodingErrorAction.REPORT);
	}
	};

	private static final byte[] EMPTY_BYTES = new byte[0];

	private byte[] bytes;
	private int length;

	public Text() {
	bytes = EMPTY_BYTES;
	}

	/**
	* Construct from a string.
	*/
	public Text(String string) {
	set(string);
	}

	/** Construct from another text. */
	public Text(Text utf8) {
	set(utf8);
	}

	/**
	* Construct from a byte array.
	*/
	public Text(byte[] utf8) {
	set(utf8);
	}

	/**
	* Get a copy of the bytes that is exactly the length of the data. See {@link #getBytes()} for faster access to the
	* underlying array.
	*/
	public byte[] copyBytes() {
	byte[] result = new byte[length];
	System.arraycopy(bytes, 0, result, 0, length);
	return result;
	}

	/**
	* Returns the raw bytes; however, only data up to {@link #getLength()} is valid. Please use {@link #copyBytes()} if
	* you need the returned array to be precisely the length of the data.
	*/
	public byte[] getBytes() {
	return bytes;
	}

	/** Returns the number of bytes in the byte array */
	public int getLength() {
	return length;
	}

	/**
	* Returns the Unicode Scalar Value (32-bit integer value) for the character at <code>position</code>. Note that this
	* method avoids using the converter or doing String instantiation
	*
	* @return the Unicode scalar value at position or -1 if the position is invalid or points to a trailing byte
	*/
	public int charAt(int position) {
	  // position == length is one past the last valid byte, so it must be rejected too;
	  // the previous "position > length" test let it through to the buffer read below.
	  if (position < 0 || position >= this.length) {
	    return -1;
	  }

	  ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes).position(position);
	  return bytesToCodePoint(bb.slice());
	}

	public int find(String what) {
	return find(what, 0);
	}

	/**
	* Finds any occurence of <code>what</code> in the backing buffer, starting as position <code>start</code>. The
	* starting position is measured in bytes and the return value is in terms of byte position in the buffer. The backing
	* buffer is not converted to a string for this operation.
	*
	* @return byte position of the first occurence of the search string in the UTF-8 buffer or -1 if not found
	*/
	public int find(String what, int start) {
	  // An out-of-range start yields -1 instead of an exception from ByteBuffer.position.
	  if (start < 0 || start > this.length) {
	    return -1;
	  }

	  final ByteBuffer tgt;
	  try {
	    tgt = encode(what);
	  } catch (CharacterCodingException e) {
	    // encode(String) runs in replace mode, so this is not expected; report it like set()/toString() do
	    throw new RuntimeException("Should not have happened ", e);
	  }

	  final int tgtBase = tgt.position();
	  final int tgtLen = tgt.remaining();
	  if (tgtLen == 0) {
	    return start; // empty pattern matches immediately, like String.indexOf
	  }

	  // last byte offset at which the whole pattern still fits inside the valid data
	  final int lastCandidate = this.length - tgtLen;
	  for (int pos = start; pos <= lastCandidate; pos++) {
	    int matched = 0;
	    while (matched < tgtLen && bytes[pos + matched] == tgt.get(tgtBase + matched)) {
	      matched++;
	    }
	    if (matched == tgtLen) {
	      return pos;
	    }
	  }
	  return -1; // not found
	}

	/**
	* Set to contain the contents of a string.
	*/
	public void set(String string) {
	try {
	ByteBuffer bb = encode(string, true);
	bytes = bb.array();
	length = bb.limit();
	} catch (CharacterCodingException e) {
	throw new RuntimeException("Should not have happened ", e);
	}
	}

	/**
	* Set to a utf8 byte array
	*/
	public void set(byte[] utf8) {
	set(utf8, 0, utf8.length);
	}

	/** copy a text. */
	public void set(Text other) {
	set(other.getBytes(), 0, other.getLength());
	}

	/**
	* Set the Text to range of bytes
	*
	* @param utf8
	* the data to copy from
	* @param start
	* the first position of the new string
	* @param len
	* the number of bytes of the new string
	*/
	public void set(byte[] utf8, int start, int len) {
	setCapacity(len, false);
	System.arraycopy(utf8, start, bytes, 0, len);
	this.length = len;
	}

	/**
	* Append a range of bytes to the end of the given text
	*
	* @param utf8
	* the data to copy from
	* @param start
	* the first position to append from utf8
	* @param len
	* the number of bytes to append
	*/
	public void append(byte[] utf8, int start, int len) {
	setCapacity(length + len, true);
	System.arraycopy(utf8, start, bytes, length, len);
	length += len;
	}

	/**
	* Clear the string to empty.
	*
	* <em>Note</em>: For performance reasons, this call does not clear the underlying byte array that is retrievable via
	* {@link #getBytes()}. In order to free the byte-array memory, call {@link #set(byte[])} with an empty byte array
	* (For example, <code>new byte[0]</code>).
	*/
	public void clear() {
	length = 0;
	}

	/*
	* Sets the capacity of this Text object to <em>at least</em> <code>len</code> bytes. If the current buffer is longer,
	* then the capacity and existing content of the buffer are unchanged. If <code>len</code> is larger than the current
	* capacity, the Text object's capacity is increased to match.
	*
	* @param len the number of bytes we need
	*
	* @param keepData should the old data be kept
	*/
	private void setCapacity(int len, boolean keepData) {
	if (bytes == null \|\| bytes.length < len) {
	if (bytes != null && keepData) {
	bytes = Arrays.copyOf(bytes, Math.max(len, length << 1));
	} else {
	bytes = new byte[len];
	}
	}
	}

	/**
	* Convert text back to string
	*
	* @see java.lang.Object#toString()
	*/
	@Override
	public String toString() {
	try {
	return decode(bytes, 0, length);
	} catch (CharacterCodingException e) {
	throw new RuntimeException("Should not have happened ", e);
	}
	}

	/**
	* Read a Text object whose length is already known. This allows creating Text from a stream which uses a different
	* serialization format.
	*/
	public void readWithKnownLength(DataInput in, int len) throws IOException {
	setCapacity(len, false);
	in.readFully(bytes, 0, len);
	length = len;
	}

	/** Returns true iff <code>o</code> is a Text with the same contents. */
	@Override
	public boolean equals(Object o) {
	  if (o == this) {
	    return true;
	  }
	  if (!(o instanceof Text)) {
	    return false;
	  }

	  final Text that = (Text) o;
	  final int len = this.getLength();
	  if (len != that.getLength()) {
	    return false;
	  }

	  // compare the valid prefixes in place instead of copying both arrays first
	  final byte[] thisBytes = this.getBytes();
	  final byte[] thatBytes = that.getBytes();
	  for (int i = 0; i < len; i++) {
	    if (thisBytes[i] != thatBytes[i]) {
	      return false;
	    }
	  }
	  return true;
	}

	/**
	 * Hashes only the first {@link #getLength()} bytes, so two texts that are equal always produce the same hash even
	 * when their backing arrays differ in capacity or in the stale bytes past the valid data.
	 */
	@Override
	public int hashCode() {
	  final byte[] data = getBytes();
	  final int len = getLength();
	  int result = 1;
	  for (int i = 0; i < len; i++) {
	    result = 31 * result + data[i];
	  }
	  return result;
	}

	// / STATIC UTILITIES FROM HERE DOWN
	/**
	* Converts the provided byte array to a String using the UTF-8 encoding. If the input is malformed, replace by a
	* default value.
	*/
	public static String decode(byte[] utf8) throws CharacterCodingException {
	return decode(ByteBuffer.wrap(utf8), true);
	}

	public static String decode(byte[] utf8, int start, int length)
	throws CharacterCodingException {
	return decode(ByteBuffer.wrap(utf8, start, length), true);
	}

	/**
	* Converts the provided byte array to a String using the UTF-8 encoding. If <code>replace</code> is true, then
	* malformed input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a
	* MalformedInputException.
	*/
	public static String decode(byte[] utf8, int start, int length, boolean replace)
	throws CharacterCodingException {
	return decode(ByteBuffer.wrap(utf8, start, length), replace);
	}

	private static String decode(ByteBuffer utf8, boolean replace)
	throws CharacterCodingException {
	CharsetDecoder decoder = DECODER_FACTORY.get();
	if (replace) {
	decoder.onMalformedInput(
	java.nio.charset.CodingErrorAction.REPLACE);
	decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
	}
	String str = decoder.decode(utf8).toString();
	// set decoder back to its default value: REPORT
	if (replace) {
	decoder.onMalformedInput(CodingErrorAction.REPORT);
	decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
	}
	return str;
	}

	/**
	* Converts the provided String to bytes using the UTF-8 encoding. If the input is malformed, invalid chars are
	* replaced by a default value.
	*
	* @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit()
	*/

	public static ByteBuffer encode(String string)
	throws CharacterCodingException {
	return encode(string, true);
	}

	/**
	* Converts the provided String to bytes using the UTF-8 encoding. If <code>replace</code> is true, then malformed
	* input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a
	* MalformedInputException.
	*
	* @return ByteBuffer: bytes stores at ByteBuffer.array() and length is ByteBuffer.limit()
	*/
	public static ByteBuffer encode(String string, boolean replace)
	throws CharacterCodingException {
	CharsetEncoder encoder = ENCODER_FACTORY.get();
	if (replace) {
	encoder.onMalformedInput(CodingErrorAction.REPLACE);
	encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
	}
	ByteBuffer bytes =
	encoder.encode(CharBuffer.wrap(string.toCharArray()));
	if (replace) {
	encoder.onMalformedInput(CodingErrorAction.REPORT);
	encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
	}
	return bytes;
	}

	static final public int DEFAULT_MAX_LEN = 1024 * 1024;

	// //// states for validateUTF8

	private static final int LEAD_BYTE = 0;

	private static final int TRAIL_BYTE_1 = 1;

	private static final int TRAIL_BYTE = 2;

	/**
	* Check if a byte array contains valid utf-8
	*
	* @param utf8
	* byte array
	* @throws MalformedInputException
	* if the byte array contains invalid utf-8
	*/
	public static void validateUTF8(byte[] utf8) throws MalformedInputException {
	validateUTF8(utf8, 0, utf8.length);
	}

	/**
	* Check to see if a byte array is valid utf-8
	*
	* @param utf8
	* the array of bytes
	* @param start
	* the offset of the first byte in the array
	* @param len
	* the length of the byte sequence
	* @throws MalformedInputException
	* if the byte array contains invalid bytes
	*/
	public static void validateUTF8(byte[] utf8, int start, int len)
	throws MalformedInputException {
	int count = start;
	int leadByte = 0;
	int length = 0;
	int state = LEAD_BYTE;
	while (count < start + len) {
	int aByte = utf8[count] & 0xFF;

	switch (state) {
	case LEAD_BYTE:
	leadByte = aByte;
	length = bytesFromUTF8[aByte];

	switch (length) {
	case 0: // check for ASCII
	if (leadByte > 0x7F) {
	throw new MalformedInputException(count);
	}
	break;
	case 1:
	if (leadByte < 0xC2 \|\| leadByte > 0xDF) {
	throw new MalformedInputException(count);
	}
	state = TRAIL_BYTE_1;
	break;
	case 2:
	if (leadByte < 0xE0 \|\| leadByte > 0xEF) {
	throw new MalformedInputException(count);
	}
	state = TRAIL_BYTE_1;
	break;
	case 3:
	if (leadByte < 0xF0 \|\| leadByte > 0xF4) {
	throw new MalformedInputException(count);
	}
	state = TRAIL_BYTE_1;
	break;
	default:
	// too long! Longest valid UTF-8 is 4 bytes (lead + three)
	// or if < 0 we got a trail byte in the lead byte position
	throw new MalformedInputException(count);
	} // switch (length)
	break;

	case TRAIL_BYTE_1:
	if (leadByte == 0xF0 && aByte < 0x90) {
	throw new MalformedInputException(count);
	}
	if (leadByte == 0xF4 && aByte > 0x8F) {
	throw new MalformedInputException(count);
	}
	if (leadByte == 0xE0 && aByte < 0xA0) {
	throw new MalformedInputException(count);
	}
	if (leadByte == 0xED && aByte > 0x9F) {
	throw new MalformedInputException(count);
	}
	// falls through to regular trail-byte test!!
	case TRAIL_BYTE:
	if (aByte < 0x80 \|\| aByte > 0xBF) {
	throw new MalformedInputException(count);
	}
	if (--length == 0) {
	state = LEAD_BYTE;
	} else {
	state = TRAIL_BYTE;
	}
	break;
	default:
	break;
	} // switch (state)
	count++;
	}
	}

	/**
	 * Lookup table, indexed by the unsigned value of a byte (0..255), giving the number of bytes that
	 * <em>follow</em> that byte when it is a UTF-8 lead byte. Bytes 0x80..0xBF are trail bytes and map to -1.
	 * The values 4 and 5 are present for 0xF8..0xFF even though valid UTF-8 cannot include five and six byte
	 * sequences.
	 */
	static final int[] bytesFromUTF8 = {
	    // 0x00 - 0x7F: single-byte (ASCII) characters
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    // 0x80 - 0xBF: trail bytes
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    // 0xC0 - 0xDF: one byte follows
	    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	    // 0xE0 - 0xEF: two bytes follow
	    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	    // 0xF0 - 0xF7: three bytes follow; 0xF8 - 0xFB: four; 0xFC - 0xFF: five
	    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
	};

	/**
	 * Decodes the code point that starts at the buffer's current position and advances the position past it.
	 * <p>
	 * The first byte is peeked with mark()/reset(), so any mark previously set on this buffer will be changed by
	 * this method! If that first byte is a trail byte, -1 is returned and the position is left where it was.
	 * The bytes following the lead byte are not validated by this method.
	 *
	 * @param bytes
	 *          buffer positioned at the first byte of a character
	 * @return the decoded value, or -1 if the byte at the current position is a trail byte
	 */
	public static int bytesToCodePoint(ByteBuffer bytes) {
	  // Peek at the lead byte without consuming it.
	  bytes.mark();
	  final int lead = bytes.get() & 0xFF;
	  bytes.reset();

	  final int extraBytesToRead = bytesFromUTF8[lead];
	  if (extraBytesToRead < 0) {
	    return -1; // trailing byte!
	  }

	  // Fold in the lead byte and every trail byte except the last, making room for 6 more bits each time.
	  // Counts of 4 and 5 correspond to illegal UTF-8 but are still consumed.
	  int codePoint = 0;
	  for (int consumed = 0; consumed < extraBytesToRead; consumed++) {
	    codePoint += (bytes.get() & 0xFF);
	    codePoint <<= 6;
	  }
	  // The final byte of the sequence is added without a further shift.
	  codePoint += (bytes.get() & 0xFF);

	  // Subtract the accumulated UTF-8 marker bits for a sequence of this length.
	  return codePoint - offsetsFromUTF8[extraBytesToRead];
	}

	static final int offsetsFromUTF8[] =
	{ 0x00000000, 0x00003080,
	0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

	/**
	 * For the given string, returns the number of UTF-8 bytes required to encode the string.
	 * <p>
	 * Counting rules: a char below 0x80 counts 1 byte and a char below 0x800 counts 2. A high surrogate
	 * immediately followed by a low surrogate counts 4 for the pair. Every other char, including an unpaired
	 * surrogate and U+FFFF, counts 3.
	 *
	 * @param string
	 *          text to encode
	 * @return number of UTF-8 bytes required to encode
	 */
	public static int utf8Length(String string) {
	  CharacterIterator iter = new StringCharacterIterator(string);
	  final int end = iter.getEndIndex();
	  int size = 0;
	  // Terminate on the iterator index rather than on CharacterIterator.DONE: DONE has the char value
	  // 0xFFFF, which may legitimately occur in the text, so comparing against it would stop counting at the
	  // first U+FFFF in the string.
	  for (char ch = iter.first(); iter.getIndex() < end; ch = iter.next()) {
	    if ((ch >= 0xD800) && (ch < 0xDC00)) {
	      // surrogate pair?
	      char trail = iter.next();
	      if ((trail > 0xDBFF) && (trail < 0xE000)) {
	        // valid pair
	        size += 4;
	      } else {
	        // invalid pair: count the lone high surrogate and re-examine the following char on its own
	        size += 3;
	        iter.previous(); // rewind one
	      }
	    } else if (ch < 0x80) {
	      size++;
	    } else if (ch < 0x800) {
	      size += 2;
	    } else {
	      // ch < 0x10000, that is, the largest char value
	      size += 3;
	    }
	  }
	  return size;
	}

	/**
	 * Jackson serializer that writes a {@link Text} value as a JSON string, using the result of its
	 * <code>toString()</code> method.
	 */
	public static class TextSerializer extends StdSerializer<Text> {

	  /** Creates a serializer that handles the {@link Text} type. */
	  public TextSerializer() {
	    super(Text.class);
	  }

	  /**
	   * Writes the string form of <code>text</code> to the generator as a single JSON string value.
	   *
	   * @param text
	   *          the value to serialize
	   * @param jsonGenerator
	   *          the generator that receives the string
	   * @param serializerProvider
	   *          not used by this serializer
	   */
	  @Override
	  public void serialize(Text text, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
	      throws IOException, JsonGenerationException {
	    final String rendered = text.toString();
	    jsonGenerator.writeString(rendered);
	  }
	}
	}