blob: 866ebb01eb45bafb6843761574ce68020b83f448 [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.data.std.primitive;
import edu.uci.ics.hyracks.api.dataflow.value.ITypeTraits;
import edu.uci.ics.hyracks.data.std.api.AbstractPointable;
import edu.uci.ics.hyracks.data.std.api.IComparable;
import edu.uci.ics.hyracks.data.std.api.IHashable;
import edu.uci.ics.hyracks.data.std.api.IPointable;
import edu.uci.ics.hyracks.data.std.api.IPointableFactory;
public final class UTF8StringPointable extends AbstractPointable implements IHashable, IComparable {
public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
private static final long serialVersionUID = 1L;
@Override
public boolean isFixedLength() {
return false;
}
@Override
public int getFixedLength() {
return 0;
}
};
public static final IPointableFactory FACTORY = new IPointableFactory() {
private static final long serialVersionUID = 1L;
@Override
public IPointable createPointable() {
return new UTF8StringPointable();
}
@Override
public ITypeTraits getTypeTraits() {
return TYPE_TRAITS;
}
};
/**
* Returns the character at the given byte offset. The caller is responsible for making sure that
* the provided offset is within bounds and points to the beginning of a valid UTF8 character.
*
* @param offset
* - Byte offset
* @return Character at the given offset.
*/
public char charAt(int offset) {
return charAt(bytes, start + offset);
}
public static char charAt(byte[] b, int s) {
int c = b[s] & 0xff;
switch (c >> 4) {
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
return (char) c;
case 12:
case 13:
return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
case 14:
return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
default:
throw new IllegalArgumentException();
}
}
public int charSize(int offset) {
return charSize(bytes, start + offset);
}
public static int charSize(byte[] b, int s) {
int c = b[s] & 0xff;
switch (c >> 4) {
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
return 1;
case 12:
case 13:
return 2;
case 14:
return 3;
}
throw new IllegalStateException();
}
public static int getModifiedUTF8Len(char c) {
if (c >= 0x0000 && c <= 0x007F) {
return 1;
} else if (c <= 0x07FF) {
return 2;
} else {
return 3;
}
}
/**
* Gets the length of the string in characters.
*
* @return length of string in characters
*/
public int getStringLength() {
return getStringLength(bytes, start);
}
public static int getStringLength(byte[] b, int s) {
int pos = s + 2;
int end = pos + getUTFLength(b, s);
int charCount = 0;
while (pos < end) {
charCount++;
pos += charSize(b, pos);
}
return charCount;
}
/**
* Gets the length of the UTF-8 encoded string in bytes.
*
* @return length of UTF-8 encoded string in bytes
*/
public int getUTFLength() {
return getUTFLength(bytes, start);
}
public static int getUTFLength(byte[] b, int s) {
return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
}
@Override
public int compareTo(IPointable pointer) {
return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
}
@Override
public int compareTo(byte[] bytes, int start, int length) {
int utflen1 = getUTFLength(this.bytes, this.start);
int utflen2 = getUTFLength(bytes, start);
int c1 = 0;
int c2 = 0;
int s1Start = this.start + 2;
int s2Start = start + 2;
while (c1 < utflen1 && c2 < utflen2) {
char ch1 = charAt(this.bytes, s1Start + c1);
char ch2 = charAt(bytes, s2Start + c2);
if (ch1 != ch2) {
return ch1 - ch2;
}
c1 += charSize(this.bytes, s1Start + c1);
c2 += charSize(bytes, s2Start + c2);
}
return utflen1 - utflen2;
}
@Override
public int hash() {
int h = 0;
int utflen = getUTFLength(bytes, start);
int sStart = start + 2;
int c = 0;
while (c < utflen) {
char ch = charAt(bytes, sStart + c);
h = 31 * h + ch;
c += charSize(bytes, sStart + c);
}
return h;
}
public static void toString(StringBuilder buffer, byte[] bytes, int start) {
int utfLen = getUTFLength(bytes, start);
int offset = 2;
while (utfLen > 0) {
char c = charAt(bytes, start + offset);
buffer.append(c);
int cLen = UTF8StringPointable.getModifiedUTF8Len(c);
offset += cLen;
utfLen -= cLen;
}
}
public void toString(StringBuilder buffer) {
toString(buffer, bytes, start);
}
}