blob: 574fc3342000ee0ef64d712a08bec3b61a312f09 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.table.dataformat;
import org.apache.flink.api.common.typeinfo.TypeInfo;
import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.core.memory.MemorySegmentFactory;
import org.apache.flink.table.dataformat.util.BinaryRowUtil;
import org.apache.flink.table.dataformat.util.MultiSegUtil;
import org.apache.flink.table.runtime.util.StringUtf8Utils;
import org.apache.flink.table.typeutils.BinaryStringTypeFactory;
import org.apache.flink.table.util.hash.Murmur32;
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.KryoSerializable;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import org.apache.commons.codec.binary.Hex;
import java.math.BigDecimal;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.flink.util.Preconditions.checkArgument;
/**
* A utf8 string which is backed by {@link MemorySegment} instead of String. Its data may span
* multiple {@link MemorySegment}s.
*
* <p>Used for internal table-level implementation. The built-in operator will use it for comparison,
* search, and so on.
*
* <p>{@code BinaryString} are influenced by Apache Spark UTF8String.
*/
@TypeInfo(BinaryStringTypeFactory.class)
public final class BinaryString implements Comparable<BinaryString>, Cloneable, KryoSerializable {
// TODO remove it for thread safe.
public static final BinaryString EMPTY_UTF8 = BinaryString.fromString("");
static {
EMPTY_UTF8.ensureEncoded();
}
static final BinaryString[] EMPTY_STRING_ARRAY = new BinaryString[0];
private MemorySegment[] segments;
private int offset;
private int numBytes;
/** Cache the java string for the binary string to avoid redundant decode. */
private String javaString;
public BinaryString() {
pointTo((MemorySegment[]) null, -1, -1, null);
}
private BinaryString(String str) {
pointToString(str);
}
private BinaryString(MemorySegment[] segments, int offset, int numBytes) {
pointTo(segments, offset, numBytes);
}
private BinaryString(MemorySegment[] segments, int offset, int numBytes, String javaString) {
pointTo(segments, offset, numBytes, javaString);
}
public void pointTo(byte[] bytes, int offset, int numBytes) {
pointTo(bytes, offset, numBytes, null);
}
public void pointTo(byte[] bytes, int offset, int numBytes, String javaString) {
MemorySegment[] segments = this.segments;
if (segments != null && segments.length == 1) {
segments[0].pointTo(bytes);
} else {
segments = new MemorySegment[] {MemorySegmentFactory.wrap(bytes)};
}
pointTo(segments, offset, numBytes, javaString);
}
public void pointTo(MemorySegment[] segments, int offset, int numBytes) {
pointTo(segments, offset, numBytes, null);
}
private void pointToString(String javaString) {
pointTo((MemorySegment[]) null, -1, -1, javaString);
}
private void pointTo(MemorySegment[] segments, int offset, int numBytes, String javaString) {
this.segments = segments;
this.offset = offset;
this.numBytes = numBytes;
this.javaString = javaString;
}
/**
* Creates an BinaryString from given address (base and offset) and length.
*/
public static BinaryString fromAddress(
MemorySegment[] segments, int offset, int numBytes) {
return new BinaryString(segments, offset, numBytes);
}
public static BinaryString fromString(String str) {
if (str == null) {
return null;
} else {
return fromNonNullString(str);
}
}
private static BinaryString fromNonNullString(String str) {
return new BinaryString(str);
}
public static BinaryString fromString(BinaryString str) {
return str;
}
public static BinaryString fromString(Object obj) {
if (obj == null) {
return null;
} else if (obj instanceof String) {
return fromNonNullString((String) obj);
} else if (obj instanceof BinaryString) {
return (BinaryString) obj;
} else {
return fromNonNullString(obj.toString());
}
}
public static BinaryString fromBytes(byte[] bytes) {
if (bytes != null) {
return fromBytes(bytes, 0, bytes.length);
} else {
return null;
}
}
public static BinaryString fromBytes(byte[] bytes, int offset, int numBytes) {
return fromBytes(bytes, offset, numBytes, null);
}
public static BinaryString fromBytes(byte[] bytes, int offset, int numBytes, String javaString) {
return new BinaryString(
new MemorySegment[]{MemorySegmentFactory.wrap(bytes)}, offset, numBytes, javaString);
}
/**
* Creates an BinaryString that contains `length` spaces.
*/
public static BinaryString blankString(int length) {
byte[] spaces = new byte[length];
Arrays.fill(spaces, (byte) ' ');
return fromBytes(spaces);
}
/**
* Returns the number of bytes for a code point with the first byte as `b`.
* @param b The first byte of a code point
*/
private static int numBytesForFirstByte(final byte b) {
if (b >= 0) {
// 1 byte, 7 bits: 0xxxxxxx
return 1;
} else if ((b >> 5) == -2 && (b & 0x1e) != 0) {
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx
return 2;
} else if ((b >> 4) == -2) {
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
return 3;
} else if ((b >> 3) == -2) {
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
return 4;
} else {
// throw new IllegalArgumentException();
// Skip the first byte disallowed in UTF-8
return 1;
}
}
public boolean isSpaceString() {
if (javaString != null) {
return javaString.equals(" ");
} else {
return getByte(0) == ' ';
}
}
public void ensureEncoded() {
if (!isEncoded()) {
encodeToBytes();
}
}
private void encodeToBytes() {
if (javaString != null) {
byte[] bytes = StringUtf8Utils.encodeUTF8(javaString);
pointTo(bytes, 0, bytes.length, javaString);
}
}
public int getOffset() {
ensureEncoded();
return offset;
}
public MemorySegment[] getSegments() {
ensureEncoded();
return segments;
}
/**
* Returns the number of bytes.
*/
public int numBytes() {
ensureEncoded();
return numBytes;
}
/**
* Returns the number of code points in it.
*/
public int numChars() {
ensureEncoded();
if (inOneSeg()) {
int len = 0;
for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByteOneSeg(i))) {
len++;
}
return len;
} else {
return numCharsSlow();
}
}
private int numCharsSlow() {
int len = 0;
int segSize = segments[0].size();
SegmentAndOffset index = firstSegmentAndOffset(segSize);
int i = 0;
while (i < numBytes) {
int charBytes = numBytesForFirstByte(index.value());
i += charBytes;
len++;
index.skipBytes(charBytes, segSize);
}
return len;
}
public byte getByte(int i) {
ensureEncoded();
int globalOffset = offset + i;
int size = segments[0].size();
if (globalOffset < size) {
return segments[0].get(globalOffset);
} else {
return segments[globalOffset / size].get(globalOffset % size);
}
}
private byte getByteOneSeg(int i) {
return segments[0].get(offset + i);
}
@Override
public boolean equals(final Object o) {
if (o != null && o instanceof BinaryString) {
BinaryString other = (BinaryString) o;
if (javaString != null && other.javaString != null) {
return javaString.equals(other.javaString);
}
ensureEncoded();
other.ensureEncoded();
return numBytes == other.numBytes &&
BinaryRowUtil.equals(segments, offset, other.segments, other.offset, numBytes);
} else {
return false;
}
}
@Override
public int compareTo(BinaryString other) {
if (javaString != null && other.javaString != null) {
return javaString.compareTo(other.javaString);
}
ensureEncoded();
other.ensureEncoded();
if (segments.length == 1 && other.segments.length == 1) {
int len = Math.min(numBytes, other.numBytes);
MemorySegment seg1 = segments[0];
MemorySegment seg2 = other.segments[0];
for (int i = 0; i < len; i++) {
// We can use MemorySegment.compare.
// But need careful about inline.
int res = (seg1.get(offset + i) & 0xFF) - (seg2.get(other.offset + i) & 0xFF);
if (res != 0) {
return res;
}
}
return numBytes - other.numBytes;
}
// if there are multi segments.
return compareComplex(other);
}
/**
* Find the boundaries of segments, and then compare MemorySegment.
*/
private int compareComplex(BinaryString other) {
if (numBytes == 0 || other.numBytes == 0) {
return numBytes - other.numBytes;
}
int len = Math.min(numBytes, other.numBytes);
MemorySegment seg1 = segments[0];
MemorySegment seg2 = other.segments[0];
int segmentSize = segments[0].size();
int otherSegmentSize = other.segments[0].size();
int sizeOfFirst1 = segmentSize - offset;
int sizeOfFirst2 = otherSegmentSize - other.offset;
int varSegIndex1 = 1;
int varSegIndex2 = 1;
// find the first segment of this string.
while (sizeOfFirst1 <= 0) {
sizeOfFirst1 += segmentSize;
seg1 = segments[varSegIndex1++];
}
while (sizeOfFirst2 <= 0) {
sizeOfFirst2 += otherSegmentSize;
seg2 = other.segments[varSegIndex2++];
}
int offset1 = segmentSize - sizeOfFirst1;
int offset2 = otherSegmentSize - sizeOfFirst2;
int needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);
while (needCompare > 0) {
// compare in one segment.
for (int i = 0; i < needCompare; i++) {
int res = (seg1.get(offset1 + i) & 0xFF) - (seg2.get(offset2 + i) & 0xFF);
if (res != 0) {
return res;
}
}
if (needCompare == len) {
break;
}
len -= needCompare;
// next segment
if (sizeOfFirst1 < sizeOfFirst2) { //I am smaller
seg1 = segments[varSegIndex1++];
offset1 = 0;
offset2 += needCompare;
sizeOfFirst1 = segmentSize;
sizeOfFirst2 -= needCompare;
} else if (sizeOfFirst1 > sizeOfFirst2) { //other is smaller
seg2 = other.segments[varSegIndex2++];
offset2 = 0;
offset1 += needCompare;
sizeOfFirst2 = otherSegmentSize;
sizeOfFirst1 -= needCompare;
} else { // same, should go ahead both.
seg1 = segments[varSegIndex1++];
seg2 = other.segments[varSegIndex2++];
offset1 = 0;
offset2 = 0;
sizeOfFirst1 = segmentSize;
sizeOfFirst2 = otherSegmentSize;
}
needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);
}
checkArgument(needCompare == len);
return numBytes - other.numBytes;
}
@Override
public String toString() {
if (javaString != null) {
return javaString;
}
String str;
if (segments.length == 1) {
str = StringUtf8Utils.decodeUTF8(segments[0], offset, numBytes);
} else {
byte[] bytes = StringUtf8Utils.allocateBytes(numBytes);
copyTo(bytes);
str = StringUtf8Utils.decodeUTF8(bytes, 0, numBytes);
}
this.javaString = str;
return str;
}
/**
* Maybe not copied, if want copy, please use copyTo.
*/
public byte[] getBytes() {
ensureEncoded();
return MultiSegUtil.getBytes(segments, offset, numBytes);
}
@Override
public int hashCode() {
ensureEncoded();
if (segments.length == 1) {
return Murmur32.hashBytes(segments[0], offset, numBytes, 42);
} else {
return hashSlow();
}
}
private int hashSlow() {
return Murmur32.hashBytes(MemorySegmentFactory.wrap(getBytes()), 0, numBytes, 42);
}
public long hash64() {
ensureEncoded();
if (segments.length == 1) {
return Murmur32.hash64(segments[0], offset, numBytes, 42);
} else {
return hash64Slow();
}
}
private long hash64Slow() {
return Murmur32.hash64(MemorySegmentFactory.wrap(getBytes()), 0, numBytes, 42);
}
public BinaryString copy() {
if (segments == null) {
return new BinaryString(javaString);
} else {
byte[] copy = BinaryRowUtil.copy(segments, offset, numBytes);
return BinaryString.fromBytes(copy, 0, copy.length, javaString);
}
}
public BinaryString copy(BinaryString reuse) {
if (segments == null) {
reuse.pointToString(javaString);
} else {
byte[] copy = BinaryRowUtil.copy(segments, offset, numBytes);
reuse.pointTo(copy, 0, copy.length, javaString);
}
return reuse;
}
public BinaryString cloneReference() {
if (segments == null) {
return new BinaryString(javaString);
} else {
MemorySegment[] cloneSegs = new MemorySegment[segments.length];
for (int i = 0; i < segments.length; i++) {
cloneSegs[i] = segments[i].cloneReference();
}
return new BinaryString(cloneSegs, offset, numBytes, javaString);
}
}
public boolean isEncoded() {
return segments != null;
}
public void copyTo(byte[] bytes) {
ensureEncoded();
BinaryRowUtil.copy(segments, offset, bytes, 0, numBytes);
}
/**
* Range partition use Kryo to emit local sample data.
*/
@Override
public void write(Kryo kryo, Output output) {
ensureEncoded();
byte[] copy = BinaryRowUtil.copy(segments, offset, numBytes);
output.writeInt(numBytes);
output.writeBytes(copy);
}
@Override
public void read(Kryo kryo, Input input) {
int numBytes = input.readInt();
byte[] bytes = input.readBytes(numBytes);
pointTo(bytes, 0, numBytes);
}
public static String safeToString(BinaryString str) {
if (str == null) {
return null;
} else {
return str.toString();
}
}
private boolean inOneSeg() {
return numBytes + offset <= segments[0].size();
}
public BinaryString substringSQL(int pos) {
return substringSQL(pos, Integer.MAX_VALUE);
}
public BinaryString substringSQL(int pos, int length) {
if (length < 0) {
return null;
}
ensureEncoded();
if (equals(EMPTY_UTF8)) {
return EMPTY_UTF8;
}
int start;
int end;
int numChars = numChars();
if (pos > 0) {
start = pos - 1;
if (start >= numChars) {
return EMPTY_UTF8;
}
} else if (pos < 0) {
start = numChars + pos;
if (start < 0) {
return EMPTY_UTF8;
}
} else {
start = 0;
}
if ((numChars - start) < length) {
end = numChars;
} else {
end = start + length;
}
return substring(start, end);
}
/**
* Returns a substring of this.
* @param start the position of first code point
* @param until the position after last code point, exclusive.
*/
public BinaryString substring(final int start, final int until) {
ensureEncoded();
if (until <= start || start >= numBytes()) {
return EMPTY_UTF8;
}
if (inOneSeg()) {
MemorySegment segment = segments[0];
int i = 0;
int c = 0;
while (i < numBytes && c < start) {
i += numBytesForFirstByte(segment.get(i + offset));
c += 1;
}
int j = i;
while (i < numBytes && c < until) {
i += numBytesForFirstByte(segment.get(i + offset));
c += 1;
}
if (i > j) {
byte[] bytes = new byte[i - j];
segment.get(offset + j, bytes, 0, i - j);
return fromBytes(bytes);
} else {
return EMPTY_UTF8;
}
} else {
return substringSlow(start, until);
}
}
private BinaryString substringSlow(final int start, final int until) {
int segSize = segments[0].size();
SegmentAndOffset index = firstSegmentAndOffset(segSize);
int i = 0;
int c = 0;
while (i < numBytes && c < start) {
int charSize = numBytesForFirstByte(index.value());
i += charSize;
index.skipBytes(charSize, segSize);
c += 1;
}
int j = i;
while (i < numBytes && c < until) {
int charSize = numBytesForFirstByte(index.value());
i += charSize;
index.skipBytes(charSize, segSize);
c += 1;
}
if (i > j) {
return fromBytes(BinaryRowUtil.copy(segments, offset + j, i - j));
} else {
return EMPTY_UTF8;
}
}
/**
* Concatenates input strings together into a single string.
*/
public static BinaryString concat(BinaryString... inputs) {
return concat(Arrays.asList(inputs));
}
/**
* Concatenates input strings together into a single string.
*/
public static BinaryString concat(Iterable<BinaryString> inputs) {
// Compute the total length of the result.
int totalLength = 0;
for (BinaryString input : inputs) {
if (input != null) {
input.ensureEncoded();
totalLength += input.numBytes();
}
}
// Allocate a new byte array, and copy the inputs one by one into it.
final byte[] result = new byte[totalLength];
int offset = 0;
for (BinaryString input : inputs) {
if (input != null) {
int len = input.numBytes;
BinaryRowUtil.copy(input.segments, input.offset, result, offset, len);
offset += len;
}
}
return fromBytes(result);
}
/**
* Concatenates input strings together into a single string using the separator.
* A null input is skipped. For example, concat(",", "a", null, "c") would yield "a,c".
*/
public static BinaryString concatWs(BinaryString separator, BinaryString... inputs) {
return concatWs(separator, Arrays.asList(inputs));
}
/**
* Concatenates input strings together into a single string using the separator.
* A null input is skipped. For example, concat(",", "a", null, "c") would yield "a,c".
*/
public static BinaryString concatWs(BinaryString separator, Iterable<BinaryString> inputs) {
if (null == separator || EMPTY_UTF8.equals(separator)) {
return concat(inputs);
}
separator.ensureEncoded();
int numInputBytes = 0; // total number of bytes from the inputs
int numInputs = 0; // number of non-null inputs
for (BinaryString input : inputs) {
if (input != null) {
input.ensureEncoded();
numInputBytes += input.numBytes;
numInputs++;
}
}
if (numInputs == 0) {
// Return an empty string if there is no input, or all the inputs are null.
return EMPTY_UTF8;
}
// Allocate a new byte array, and copy the inputs one by one into it.
// The size of the new array is the size of all inputs, plus the separators.
final byte[] result = new byte[numInputBytes + (numInputs - 1) * separator.numBytes];
int offset = 0;
int j = 0;
for (BinaryString input : inputs) {
if (input != null) {
int len = input.numBytes;
BinaryRowUtil.copy(input.segments, input.offset, result, offset, len);
offset += len;
j++;
// Add separator if this is not the last input.
if (j < numInputs) {
BinaryRowUtil.copy(separator.segments, separator.offset, result, offset, separator.numBytes);
offset += separator.numBytes;
}
}
}
return fromBytes(result);
}
/**
* Returns whether this contains `substring` or not.
* Same to like '%substring%'.
*/
public boolean contains(final BinaryString substring) {
ensureEncoded();
substring.ensureEncoded();
if (substring.numBytes == 0) {
return true;
}
int find = BinaryRowUtil.find(
segments, offset, numBytes,
substring.segments, substring.offset, substring.numBytes);
return find != -1;
}
private boolean matchAt(final BinaryString s, int pos) {
return (inOneSeg() && s.inOneSeg()) ? matchAtOneSeg(s, pos) : matchAtVarSeg(s, pos);
}
private boolean matchAtOneSeg(final BinaryString s, int pos) {
return s.numBytes + pos <= numBytes && pos >= 0 &&
segments[0].equalTo(s.segments[0], offset + pos, s.offset, s.numBytes);
}
private boolean matchAtVarSeg(final BinaryString s, int pos) {
return s.numBytes + pos <= numBytes && pos >= 0 &&
BinaryRowUtil.equalsSlow(segments, offset + pos, s.segments, s.offset, s.numBytes);
}
/**
* Same to like 'prefix%'.
*/
public boolean startsWith(final BinaryString prefix) {
ensureEncoded();
prefix.ensureEncoded();
return matchAt(prefix, 0);
}
/**
* Same to like '%suffix'.
*/
public boolean endsWith(final BinaryString suffix) {
ensureEncoded();
suffix.ensureEncoded();
return matchAt(suffix, numBytes - suffix.numBytes);
}
private BinaryString copyBinaryStringInOneSeg(int start, int end) {
int len = end - start + 1;
byte[] newBytes = new byte[len];
segments[0].get(offset + start, newBytes, 0, len);
return fromBytes(newBytes);
}
private BinaryString copyBinaryString(int start, int end) {
int len = end - start + 1;
byte[] newBytes = new byte[len];
BinaryRowUtil.copy(segments, offset + start, newBytes, 0, len);
return fromBytes(newBytes);
}
public BinaryString trim() {
ensureEncoded();
if (inOneSeg()) {
int s = 0;
int e = this.numBytes - 1;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByteOneSeg(s) == 0x20) {
s++;
}
// skip all of the space (0x20) in the right side
while (e >= s && getByteOneSeg(e) == 0x20) {
e--;
}
if (s > e) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryStringInOneSeg(s, e);
}
} else {
return trimSlow();
}
}
private BinaryString trimSlow() {
int s = 0;
int e = this.numBytes - 1;
int segSize = segments[0].size();
SegmentAndOffset front = firstSegmentAndOffset(segSize);
// skip all of the space (0x20) in the left side
while (s < this.numBytes && front.value() == 0x20) {
s++;
front.nextByte(segSize);
}
SegmentAndOffset behind = lastSegmentAndOffset(segSize);
// skip all of the space (0x20) in the right side
while (e >= s && behind.value() == 0x20) {
e--;
behind.previousByte(segSize);
}
if (s > e) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryString(s, e);
}
}
/**
* Walk each character of current string from both ends, remove the character if it
* is in trim string.
* Return the new substring which both ends trim characters have been removed.
*
* @param trimStr the trim string
* @return A subString which both ends trim characters have been removed.
*/
public BinaryString trim(BinaryString trimStr) {
if (trimStr == null) {
return null;
}
return trimLeft(trimStr).trimRight(trimStr);
}
public BinaryString trimLeft() {
ensureEncoded();
if (inOneSeg()) {
int s = 0;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByteOneSeg(s) == 0x20) {
s++;
}
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryStringInOneSeg(s, this.numBytes - 1);
}
} else {
return trimLeftSlow();
}
}
private BinaryString trimLeftSlow() {
int s = 0;
int segSize = segments[0].size();
SegmentAndOffset front = firstSegmentAndOffset(segSize);
// skip all of the space (0x20) in the left side
while (s < this.numBytes && front.value() == 0x20) {
s++;
front.nextByte(segSize);
}
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryString(s, this.numBytes - 1);
}
}
/**
* Walk each character of current string from left end, remove the character if it
* is in trim string. Stops at the first character which is not in trim string.
* Return the new substring.
*
* @param trimStr the trim string
* @return A subString which removes all of the character from the left side that is in
* trim string.
*/
public BinaryString trimLeft(BinaryString trimStr) {
ensureEncoded();
if (trimStr == null) {
return null;
}
trimStr.ensureEncoded();
if (trimStr.isSpaceString()) {
return trimLeft();
}
if (inOneSeg()) {
int searchIdx = 0;
while (searchIdx < this.numBytes) {
int charBytes = numBytesForFirstByte(getByteOneSeg(searchIdx));
BinaryString currentChar = copyBinaryStringInOneSeg(searchIdx,
searchIdx + charBytes - 1);
// try to find the matching for the character in the trimString characters.
if (trimStr.contains(currentChar)) {
searchIdx += charBytes;
} else {
break;
}
}
// empty string
if (searchIdx >= numBytes) {
return EMPTY_UTF8;
} else {
return copyBinaryStringInOneSeg(searchIdx, numBytes - 1);
}
} else {
return trimLeftSlow(trimStr);
}
}
private BinaryString trimLeftSlow(BinaryString trimStr) {
int searchIdx = 0;
int segSize = segments[0].size();
SegmentAndOffset front = firstSegmentAndOffset(segSize);
while (searchIdx < this.numBytes) {
int charBytes = numBytesForFirstByte(front.value());
BinaryString currentChar = copyBinaryString(searchIdx, searchIdx + charBytes - 1);
if (trimStr.contains(currentChar)) {
searchIdx += charBytes;
front.skipBytes(charBytes, segSize);
} else {
break;
}
}
if (searchIdx == this.numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryString(searchIdx, this.numBytes - 1);
}
}
public BinaryString trimRight() {
ensureEncoded();
if (inOneSeg()) {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByteOneSeg(e) == 0x20) {
e--;
}
if (e < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryStringInOneSeg(0, e);
}
} else {
return trimRightSlow();
}
}
private BinaryString trimRightSlow() {
int e = numBytes - 1;
int segSize = segments[0].size();
SegmentAndOffset behind = lastSegmentAndOffset(segSize);
// skip all of the space (0x20) in the right side
while (e >= 0 && behind.value() == 0x20) {
e--;
behind.previousByte(segSize);
}
if (e < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryString(0, e);
}
}
/**
* Walk each character of current string from right end, remove the character if it
* is in trim string. Stops at the first character which is not in trim string.
* Return the new substring.
*
* @param trimStr the trim string
* @return A subString which removes all of the character from the right side that is in
* trim string.
*/
public BinaryString trimRight(BinaryString trimStr) {
ensureEncoded();
if (trimStr == null) {
return null;
}
trimStr.ensureEncoded();
if (trimStr.isSpaceString()) {
return trimRight();
}
if (inOneSeg()) {
int charIdx = 0;
int byteIdx = 0;
// each element in charLens is length of character in the source string
int[] charLens = new int[numBytes];
// each element in charStartPos is start position of first byte in the source string
int[] charStartPos = new int[numBytes];
while (byteIdx < numBytes) {
charStartPos[charIdx] = byteIdx;
charLens[charIdx] = numBytesForFirstByte(getByteOneSeg(byteIdx));
byteIdx += charLens[charIdx];
charIdx++;
}
// searchIdx points to the first character which is not in trim string from the right
// end.
int searchIdx = numBytes - 1;
charIdx -= 1;
while (charIdx >= 0) {
BinaryString currentChar = copyBinaryStringInOneSeg(
charStartPos[charIdx],
charStartPos[charIdx] + charLens[charIdx] - 1);
if (trimStr.contains(currentChar)) {
searchIdx -= charLens[charIdx];
} else {
break;
}
charIdx--;
}
if (searchIdx < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryStringInOneSeg(0, searchIdx);
}
} else {
return trimRightSlow(trimStr);
}
}
private BinaryString trimRightSlow(BinaryString trimStr) {
int charIdx = 0;
int byteIdx = 0;
int segSize = segments[0].size();
SegmentAndOffset index = firstSegmentAndOffset(segSize);
// each element in charLens is length of character in the source string
int[] charLens = new int[numBytes];
// each element in charStartPos is start position of first byte in the source string
int[] charStartPos = new int[numBytes];
while (byteIdx < numBytes) {
charStartPos[charIdx] = byteIdx;
int charBytes = numBytesForFirstByte(index.value());
charLens[charIdx] = charBytes;
byteIdx += charBytes;
charIdx++;
index.skipBytes(charBytes, segSize);
}
// searchIdx points to the first character which is not in trim string from the right
// end.
int searchIdx = numBytes - 1;
charIdx -= 1;
while (charIdx >= 0) {
BinaryString currentChar = copyBinaryString(
charStartPos[charIdx],
charStartPos[charIdx] + charLens[charIdx] - 1);
if (trimStr.contains(currentChar)) {
searchIdx -= charLens[charIdx];
} else {
break;
}
charIdx--;
}
if (searchIdx < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyBinaryString(0, searchIdx);
}
}
public BinaryString trim(boolean leading, boolean trailing, BinaryString seek) {
ensureEncoded();
if (seek == null) {
return null;
}
if (leading && trailing) {
return trim(seek);
} else if (leading) {
return trimLeft(seek);
} else if (trailing) {
return trimRight(seek);
} else {
return this;
}
}
/**
* Parse target string as key-value string and
* return the value matches key name.
* If accept any null arguments, return null.
* example:
* keyvalue('k1=v1;k2=v2', ';', '=', 'k2') = 'v2'
* keyvalue('k1:v1,k2:v2', ',', ':', 'k3') = NULL
*
* @param split1 separator between key-value tuple.
* @param split2 separator between key and value.
* @param keyName name of the key whose value you want return.
*
* @return target value.
*/
public BinaryString keyValue(byte split1, byte split2, BinaryString keyName) {
ensureEncoded();
if (keyName == null || keyName.numBytes() == 0) {
return null;
}
if (inOneSeg() && keyName.inOneSeg()) {
// position in byte
int byteIdx = 0;
// position of last split1
int lastSplit1Idx = -1;
while (byteIdx < numBytes) {
// If find next split1 in str, process current kv
if (segments[0].get(offset + byteIdx) == split1) {
int currentKeyIdx = lastSplit1Idx + 1;
// If key of current kv is keyName, return the value directly
BinaryString value = findValueOfKey(split2, keyName, currentKeyIdx, byteIdx);
if (value != null) {
return value;
}
lastSplit1Idx = byteIdx;
}
byteIdx++;
}
// process the string which is not ends with split1
int currentKeyIdx = lastSplit1Idx + 1;
BinaryString value = findValueOfKey(split2, keyName, currentKeyIdx, numBytes);
return value;
} else {
return keyValueSlow(split1, split2, keyName);
}
}
private BinaryString findValueOfKey(
byte split,
BinaryString keyName,
int start,
int end) {
int keyNameLen = keyName.numBytes;
for (int idx = start; idx < end; idx++) {
if (segments[0].get(offset + idx) == split) {
if (idx == start + keyNameLen &&
segments[0].equalTo(keyName.segments[0], offset + start,
keyName.offset, keyNameLen)) {
int valueIdx = idx + 1;
int valueLen = end - valueIdx;
byte[] bytes = new byte[valueLen];
segments[0].get(offset + valueIdx, bytes, 0, valueLen);
return fromBytes(bytes, 0, valueLen);
} else {
return null;
}
}
}
return null;
}
private BinaryString keyValueSlow(
byte split1,
byte split2,
BinaryString keyName) {
// position in byte
int byteIdx = 0;
// position of last split1
int lastSplit1Idx = -1;
while (byteIdx < numBytes) {
// If find next split1 in str, process current kv
if (getByte(byteIdx) == split1) {
int currentKeyIdx = lastSplit1Idx + 1;
BinaryString value = findValueOfKeySlow(split2, keyName, currentKeyIdx, byteIdx);
if (value != null) {
return value;
}
lastSplit1Idx = byteIdx;
}
byteIdx++;
}
int currentKeyIdx = lastSplit1Idx + 1;
BinaryString value = findValueOfKeySlow(split2, keyName, currentKeyIdx, numBytes);
return value;
}
private BinaryString findValueOfKeySlow(
byte split,
BinaryString keyName,
int start,
int end) {
int keyNameLen = keyName.numBytes;
for (int idx = start; idx < end; idx++) {
if (getByte(idx) == split) {
if (idx == start + keyNameLen &&
BinaryRowUtil.equals(segments, offset + start, keyName.segments,
keyName.offset, keyNameLen)) {
int valueIdx = idx + 1;
byte[] bytes = BinaryRowUtil.copy(segments, offset + valueIdx, end - valueIdx);
return fromBytes(bytes);
} else {
return null;
}
}
}
return null;
}
/**
* Returns the position of the first occurence of substr in current string starting from given
* position.
*
* @param subStr subStr to be searched
* @param start start position
* @return the position of the first occurence of substring. Return -1 if not found.
*/
public int indexOf(BinaryString subStr, int start) {
ensureEncoded();
subStr.ensureEncoded();
if (subStr.numBytes == 0) {
return 0;
}
if (inOneSeg()) {
// position in byte
int byteIdx = 0;
// position is char
int charIdx = 0;
while (byteIdx < numBytes && charIdx < start) {
byteIdx += numBytesForFirstByte(getByteOneSeg(byteIdx));
charIdx++;
}
do {
if (byteIdx + subStr.numBytes > numBytes) {
return -1;
}
if (BinaryRowUtil.equals(segments, offset + byteIdx,
subStr.segments, subStr.offset, subStr.numBytes)) {
return charIdx;
}
byteIdx += numBytesForFirstByte(getByteOneSeg(byteIdx));
charIdx++;
} while (byteIdx < numBytes);
return -1;
} else {
return indexOfSlow(subStr, start);
}
}
private int indexOfSlow(BinaryString subStr, int start) {
// position in byte
int byteIdx = 0;
// position is char
int charIdx = 0;
int segSize = segments[0].size();
SegmentAndOffset index = firstSegmentAndOffset(segSize);
while (byteIdx < numBytes && charIdx < start) {
int charBytes = numBytesForFirstByte(index.value());
byteIdx += charBytes;
charIdx++;
index.skipBytes(charBytes, segSize);
}
do {
if (byteIdx + subStr.numBytes > numBytes) {
return -1;
}
if (BinaryRowUtil.equals(segments, offset + byteIdx,
subStr.segments, subStr.offset, subStr.numBytes)) {
return charIdx;
}
int charBytes = numBytesForFirstByte(index.segment.get(index.offset));
byteIdx += charBytes;
charIdx++;
index.skipBytes(charBytes, segSize);
} while (byteIdx < numBytes);
return -1;
}
/**
* Reverse each character in current string.
*
* @return a new string which character order is reverse to current string.
*/
public BinaryString reverse() {
ensureEncoded();
if (inOneSeg()) {
byte[] result = new byte[this.numBytes];
// position in byte
int byteIdx = 0;
while (byteIdx < numBytes) {
int charBytes = numBytesForFirstByte(getByteOneSeg(byteIdx));
segments[0].get(
offset + byteIdx,
result,
result.length - byteIdx - charBytes,
charBytes);
byteIdx += charBytes;
}
return BinaryString.fromBytes(result);
} else {
return reverseSlow();
}
}
private BinaryString reverseSlow() {
byte[] result = new byte[this.numBytes];
// position in byte
int byteIdx = 0;
int segSize = segments[0].size();
SegmentAndOffset index = firstSegmentAndOffset(segSize);
while (byteIdx < numBytes) {
int charBytes = numBytesForFirstByte(index.value());
BinaryRowUtil.copySlow(
segments,
offset + byteIdx,
result,
result.length - byteIdx - charBytes,
charBytes);
byteIdx += charBytes;
index.skipBytes(charBytes, segSize);
}
return BinaryString.fromBytes(result);
}
// TODO repeat find rfind rpad lpad split
// TODO upper/lower is slow?..
private SegmentAndOffset firstSegmentAndOffset(int segSize) {
int segIndex = offset / segSize;
return new SegmentAndOffset(segIndex, offset % segSize);
}
private SegmentAndOffset lastSegmentAndOffset(int segSize) {
int lastOffset = offset + numBytes - 1;
int segIndex = lastOffset / segSize;
return new SegmentAndOffset(segIndex, lastOffset % segSize);
}
private SegmentAndOffset startSegmentAndOffset(int segSize) {
if (inOneSeg()) {
return new SegmentAndOffset(0, offset);
}
else {
return firstSegmentAndOffset(segSize);
}
}
/**
* CurrentSegment and positionInSegment.
*/
private class SegmentAndOffset {
int segIndex;
MemorySegment segment;
int offset;
private SegmentAndOffset(int segIndex, int offset) {
this.segIndex = segIndex;
this.segment = segments[segIndex];
this.offset = offset;
}
private void assignSegment() {
if (segIndex >= 0 && segIndex < segments.length) {
segment = segments[segIndex];
} else {
segment = null;
}
}
private void previousByte(int segSize) {
offset--;
if (offset == -1) {
segIndex--;
assignSegment();
offset = segSize - 1;
}
}
private void nextByte(int segSize) {
offset++;
checkAdvance(segSize);
}
private void checkAdvance(int segSize) {
if (offset == segSize) {
advance();
}
}
private void advance() {
segIndex++;
assignSegment();
offset = 0;
}
private void skipBytes(int n, int segSize) {
int remaining = segSize - this.offset;
if (remaining > n) {
this.offset += n;
} else {
while (true) {
int toSkip = Math.min(remaining, n);
n -= toSkip;
if (n <= 0) {
this.offset += toSkip;
checkAdvance(segSize);
return;
}
advance();
remaining = segSize - this.offset;
}
}
}
private byte value() {
return this.segment.get(this.offset);
}
}
/**
* Parses this BinaryString to Long.
*
* <p>Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
* is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
* Long.MIN_VALUE is '-9223372036854775808'.
*
* <p>This code is mostly copied from LazyLong.parseLong in Hive.
* @return Long value if the parsing was successful else null.
*/
public Long toLong() {
ensureEncoded();
if (numBytes == 0) {
return null;
}
int size = segments[0].size();
SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
int totalOffset = 0;
byte b = segmentAndOffset.value();
final boolean negative = b == '-';
if (negative || b == '+') {
segmentAndOffset.nextByte(size);
totalOffset++;
if (numBytes == 1) {
return null;
}
}
long result = 0;
final byte separator = '.';
final int radix = 10;
final long stopValue = Long.MIN_VALUE / radix;
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
totalOffset++;
segmentAndOffset.nextByte(size);
if (b == separator) {
// We allow decimals and will return a truncated integral in that case.
// Therefore we won't throw an exception here (checking the fractional
// part happens below.)
break;
}
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return null;
}
// We are going to process the new digit and accumulate the result. However, before
// doing this, if the result is already smaller than the
// stopValue(Long.MIN_VALUE / radix), then result * 10 will definitely be smaller
// than minValue, and we can stop.
if (result < stopValue) {
return null;
}
result = result * radix - digit;
// Since the previous result is less than or equal to
// stopValue(Long.MIN_VALUE / radix), we can just use `result > 0` to check overflow.
// If result overflows, we should stop.
if (result > 0) {
return null;
}
}
// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (totalOffset < numBytes) {
byte currentByte = segmentAndOffset.value();
if (currentByte < '0' || currentByte > '9') {
return null;
}
totalOffset++;
segmentAndOffset.nextByte(size);
}
if (!negative) {
result = -result;
if (result < 0) {
return null;
}
}
return result;
}
/**
* Parses this BinaryString to Int.
*
* <p>Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
* is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
* Integer.MIN_VALUE is '-2147483648'.
*
* <p>This code is mostly copied from LazyInt.parseInt in Hive.
*
* <p>Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
* reasons, like Hive does.
* @return Integer value if the parsing was successful else null.
*/
public Integer toInt() {
ensureEncoded();
if (numBytes == 0) {
return null;
}
int size = segments[0].size();
SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
int totalOffset = 0;
byte b = segmentAndOffset.value();
final boolean negative = b == '-';
if (negative || b == '+') {
segmentAndOffset.nextByte(size);
totalOffset++;
if (numBytes == 1) {
return null;
}
}
int result = 0;
final byte separator = '.';
final int radix = 10;
final long stopValue = Integer.MIN_VALUE / radix;
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
totalOffset++;
segmentAndOffset.nextByte(size);
if (b == separator) {
// We allow decimals and will return a truncated integral in that case.
// Therefore we won't throw an exception here (checking the fractional
// part happens below.)
break;
}
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return null;
}
// We are going to process the new digit and accumulate the result. However, before
// doing this, if the result is already smaller than the
// stopValue(Long.MIN_VALUE / radix), then result * 10 will definitely be smaller
// than minValue, and we can stop.
if (result < stopValue) {
return null;
}
result = result * radix - digit;
// Since the previous result is less than or equal to
// stopValue(Long.MIN_VALUE / radix), we can just use `result > 0` to check overflow.
// If result overflows, we should stop.
if (result > 0) {
return null;
}
}
// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (totalOffset < numBytes) {
byte currentByte = segmentAndOffset.value();
if (currentByte < '0' || currentByte > '9') {
return null;
}
totalOffset++;
segmentAndOffset.nextByte(size);
}
if (!negative) {
result = -result;
if (result < 0) {
return null;
}
}
return result;
}
public Short toShort() {
Integer intValue = toInt();
if (intValue != null) {
short result = intValue.shortValue();
if (result == intValue) {
return result;
}
}
return null;
}
public Byte toByte() {
Integer intValue = toInt();
if (intValue != null) {
byte result = intValue.byteValue();
if (result == intValue) {
return result;
}
}
return null;
}
public Double toDouble() {
try {
return Double.valueOf(toString());
} catch (NumberFormatException e) {
return null;
}
}
public Float toFloat() {
try {
return Float.valueOf(toString());
} catch (NumberFormatException e) {
return null;
}
}
/**
* Parses this BinaryString to Decimal.
*
* @return Decimal value if the parsing was successful, or null if overflow
* @throws NumberFormatException if the parsing failed.
*/
public Decimal toDecimal(int precision, int scale) {
ensureEncoded();
if (precision > Decimal.MAX_LONG_DIGITS || this.numBytes > Decimal.MAX_LONG_DIGITS) {
return toDecimalSlow(precision, scale);
}
// Data in Decimal is stored by one long value if `precision` <= Decimal.MAX_LONG_DIGITS.
// In this case we can directly extract the value from memory segment.
int size = getSegments()[0].size();
SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
int totalOffset = 0;
// Remove white spaces at the beginning
byte b = 0;
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
if (b != ' ' && b != '\n' && b != '\t') {
break;
}
totalOffset++;
segmentAndOffset.nextByte(size);
}
if (totalOffset == this.numBytes) {
// all whitespaces
return null;
}
// ======= Significand part begin =======
final boolean negative = b == '-';
if (negative || b == '+') {
segmentAndOffset.nextByte(size);
totalOffset++;
if (totalOffset == this.numBytes) {
// only contains prefix plus/minus
return null;
}
}
long significand = 0;
int exp = 0;
int significandLen = 0, pointPos = -1;
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
totalOffset++;
segmentAndOffset.nextByte(size);
if (b >= '0' && b <= '9') {
// No need to worry about overflow, because this.numBytes <= Decimal.MAX_LONG_DIGITS
significand = significand * 10 + (b - '0');
significandLen++;
} else if (b == '.') {
if (pointPos >= 0) {
// More than one decimal point
return null;
}
pointPos = significandLen;
} else {
break;
}
}
if (pointPos < 0) {
pointPos = significandLen;
}
if (negative) {
significand = -significand;
}
// ======= Significand part end =======
// ======= Exponential part begin =======
if ((b == 'e' || b == 'E') && totalOffset < this.numBytes) {
b = segmentAndOffset.value();
final boolean expNegative = b == '-';
if (expNegative || b == '+') {
segmentAndOffset.nextByte(size);
totalOffset++;
if (totalOffset == this.numBytes) {
return null;
}
}
int expDigits = 0;
// As `precision` <= 18, value absolute range is limited to 10^-18 ~ 10^18.
// The worst case is <18-digits>E-36
final int expStopValue = 40;
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
totalOffset++;
segmentAndOffset.nextByte(size);
if (b >= '0' && b <= '9') {
// No need to worry about larger exponents,
// because they will produce overflow or underflow
if (expDigits < expStopValue) {
expDigits = expDigits * 10 + (b - '0');
}
} else {
break;
}
}
if (expNegative) {
expDigits = -expDigits;
}
exp += expDigits;
}
exp -= significandLen - pointPos;
// ======= Exponential part end =======
// Check for invalid character at the end
while (totalOffset < this.numBytes) {
b = segmentAndOffset.value();
totalOffset++;
segmentAndOffset.nextByte(size);
// White spaces are allowed at the end
if (b != ' ' && b != '\n' && b != '\t') {
return null;
}
}
// Round exp to scale
int change = exp + scale;
if (significandLen + change > precision) {
// Overflow
return null;
}
if (change >= 0) {
significand *= Decimal.POW10[change];
} else {
int k = negative ? -5 : 5;
significand = (significand + k * Decimal.POW10[-change - 1]) / Decimal.POW10[-change];
}
return Decimal.fromLong(significand, precision, scale);
}
private Decimal toDecimalSlow(int precision, int scale) {
// As data in Decimal is currently stored by BigDecimal if `precision` > Decimal.MAX_LONG_DIGITS,
// and BigDecimal only supports String or char[] for its constructor,
// we can't directly extract the value from BinaryString.
//
// As BigDecimal(char[], int, int) is faster than BigDecimal(String, int, int),
// we extract char[] from the memory segment and pass it to the constructor of BigDecimal.
char[] chars = StringUtf8Utils.allocateChars(numBytes);
int len;
if (segments.length == 1) {
len = StringUtf8Utils.decodeUTF8Strict(segments[0], offset, numBytes, chars);
} else {
byte[] bytes = StringUtf8Utils.allocateBytes(numBytes);
copyTo(bytes);
len = StringUtf8Utils.decodeUTF8Strict(bytes, 0, numBytes, chars);
}
if (len < 0) {
return null;
} else {
// Trim white spaces
int start = 0, end = len;
for (int i = 0; i < len; i++) {
if (chars[i] != ' ' && chars[i] != '\n' && chars[i] != '\t') {
start = i;
break;
}
}
for (int i = len - 1; i >= 0; i--) {
if (chars[i] != ' ' && chars[i] != '\n' && chars[i] != '\t') {
end = i + 1;
break;
}
}
try {
BigDecimal bd = new BigDecimal(chars, start, end - start);
return Decimal.fromBigDecimal(bd, precision, scale);
} catch (NumberFormatException nfe) {
return null;
}
}
}
/**
* Returns the upper case of this string.
*/
public BinaryString toUpperCase() {
if (javaString != null) {
return toUpperCaseSlow();
}
if (numBytes == 0) {
return EMPTY_UTF8;
}
int size = segments[0].size();
SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
byte[] bytes = new byte[numBytes];
bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value());
for (int i = 0; i < numBytes; i++) {
byte b = segmentAndOffset.value();
if (numBytesForFirstByte(b) != 1) {
// fallback
return toUpperCaseSlow();
}
int upper = Character.toUpperCase((int) b);
if (upper > 127) {
// fallback
return toUpperCaseSlow();
}
bytes[i] = (byte) upper;
segmentAndOffset.nextByte(size);
}
return fromBytes(bytes);
}
private BinaryString toUpperCaseSlow() {
return fromString(toString().toUpperCase());
}
/**
* Returns the lower case of this string.
*/
public BinaryString toLowerCase() {
if (javaString != null) {
return toLowerCaseSlow();
}
if (numBytes == 0) {
return EMPTY_UTF8;
}
int size = segments[0].size();
SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
byte[] bytes = new byte[numBytes];
bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value());
for (int i = 0; i < numBytes; i++) {
byte b = segmentAndOffset.value();
if (numBytesForFirstByte(b) != 1) {
// fallback
return toLowerCaseSlow();
}
int lower = Character.toLowerCase((int) b);
if (lower > 127) {
// fallback
return toLowerCaseSlow();
}
bytes[i] = (byte) lower;
segmentAndOffset.nextByte(size);
}
return fromBytes(bytes);
}
private BinaryString toLowerCaseSlow() {
return fromString(toString().toLowerCase());
}
/**
* <p>Splits the provided text into an array, separator string specified. </p>
*
* <p>The separator is not included in the returned String array.
* Adjacent separators are treated as separators for empty tokens.</p>
*
* <p>A {@code null} separator splits on whitespace.</p>
*
* <pre>
* "".splitByWholeSeparatorPreserveAllTokens(*) = []
* "ab de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "de", "fg"]
* "ab de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "", "", "de", "fg"]
* "ab:cd:ef".splitByWholeSeparatorPreserveAllTokens(":") = ["ab", "cd", "ef"]
* "ab-!-cd-!-ef".splitByWholeSeparatorPreserveAllTokens("-!-") = ["ab", "cd", "ef"]
* </pre>
*
* <p>Note: return BinaryStrings is reuse MemorySegments from this.</p>
*
* @param separator String containing the String to be used as a delimiter,
* {@code null} splits on whitespace
* @return an array of parsed Strings, {@code null} if null String was input
* @since 2.4
*/
public BinaryString[] splitByWholeSeparatorPreserveAllTokens(BinaryString separator) {
ensureEncoded();
final int len = numBytes;
if (len == 0) {
return EMPTY_STRING_ARRAY;
}
if (separator == null || EMPTY_UTF8.equals(separator)) {
// Split on whitespace.
return splitByWholeSeparatorPreserveAllTokens(fromString(" "));
}
separator.ensureEncoded();
final int separatorLength = separator.numBytes;
final ArrayList<BinaryString> substrings = new ArrayList<>();
int beg = 0;
int end = 0;
while (end < len) {
end = BinaryRowUtil.find(
segments, offset + beg, numBytes - beg,
separator.segments, separator.offset, separator.numBytes) - offset;
if (end > -1) {
if (end > beg) {
// The following is OK, because String.substring( beg, end ) excludes
// the character at the position 'end'.
substrings.add(BinaryString.fromAddress(segments, offset + beg, end - beg));
// Set the starting point for the next search.
// The following is equivalent to beg = end + (separatorLength - 1) + 1,
// which is the right calculation:
beg = end + separatorLength;
} else {
// We found a consecutive occurrence of the separator.
substrings.add(EMPTY_UTF8);
beg = end + separatorLength;
}
} else {
// String.substring( beg ) goes from 'beg' to the end of the String.
substrings.add(BinaryString.fromAddress(segments, offset + beg, numBytes - beg));
end = len;
}
}
return substrings.toArray(new BinaryString[substrings.size()]);
}
/**
* Calculate the hash value of a given string use {@link MessageDigest}.
*/
public BinaryString hash(MessageDigest md) {
return fromString(Hex.encodeHexString(md.digest(getBytes())));
}
public BinaryString hash(String algorithm) throws NoSuchAlgorithmException {
return hash(MessageDigest.getInstance(algorithm));
}
private static final List<BinaryString> TRUE_STRINGS =
Stream
.of("t", "true", "y", "yes", "1")
.map(BinaryString::fromString)
.peek(BinaryString::ensureEncoded)
.collect(Collectors.toList());
private static final List<BinaryString> FALSE_STRINGS =
Stream
.of("f", "false", "n", "no", "0")
.map(BinaryString::fromString)
.peek(BinaryString::ensureEncoded)
.collect(Collectors.toList());
/**
* Decide boolean representation of a string.
*/
public Boolean toBooleanSQL() {
if (TRUE_STRINGS.contains(toLowerCase())) {
return true;
} else if (FALSE_STRINGS.contains(toLowerCase())) {
return false;
} else {
return null;
}
}
}