blob: 130b0e08ff66ead730bf9d2f1d0b5ac2d034cc3c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.io.DataInputStream;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;
/**
* Methods for manipulating strings.
*
* @lucene.internal
*/
public abstract class StringHelper {
/**
* Compares two {@link BytesRef}, element by element, and returns the
* number of elements common to both arrays (from the start of each).
* This method assumes currentTerm comes after priorTerm.
*
* @param priorTerm The first {@link BytesRef} to compare
* @param currentTerm The second {@link BytesRef} to compare
* @return The number of common elements (from the start of each).
*/
public static int bytesDifference(BytesRef priorTerm, BytesRef currentTerm) {
int mismatch = FutureArrays.mismatch(priorTerm.bytes, priorTerm.offset, priorTerm.offset + priorTerm.length,
currentTerm.bytes, currentTerm.offset, currentTerm.offset + currentTerm.length);
if (mismatch < 0) {
throw new IllegalArgumentException("terms out of order: priorTerm=" + priorTerm + ",currentTerm=" + currentTerm);
}
return mismatch;
}
/**
* Returns the length of {@code currentTerm} needed for use as a sort key.
* so that {@link BytesRef#compareTo(BytesRef)} still returns the same result.
* This method assumes currentTerm comes after priorTerm.
*/
public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) {
return bytesDifference(priorTerm, currentTerm) + 1;
}
private StringHelper() {
}
/**
* Returns <code>true</code> iff the ref starts with the given prefix.
* Otherwise <code>false</code>.
*
* @param ref
* the {@code byte[]} to test
* @param prefix
* the expected prefix
* @return Returns <code>true</code> iff the ref starts with the given prefix.
* Otherwise <code>false</code>.
*/
public static boolean startsWith(byte[] ref, BytesRef prefix) {
// not long enough to start with the prefix
if (ref.length < prefix.length) {
return false;
}
return FutureArrays.equals(ref, 0, prefix.length,
prefix.bytes, prefix.offset, prefix.offset + prefix.length);
}
/**
* Returns <code>true</code> iff the ref starts with the given prefix.
* Otherwise <code>false</code>.
*
* @param ref
* the {@link BytesRef} to test
* @param prefix
* the expected prefix
* @return Returns <code>true</code> iff the ref starts with the given prefix.
* Otherwise <code>false</code>.
*/
public static boolean startsWith(BytesRef ref, BytesRef prefix) {
// not long enough to start with the prefix
if (ref.length < prefix.length) {
return false;
}
return FutureArrays.equals(ref.bytes, ref.offset, ref.offset + prefix.length,
prefix.bytes, prefix.offset, prefix.offset + prefix.length);
}
/**
* Returns <code>true</code> iff the ref ends with the given suffix. Otherwise
* <code>false</code>.
*
* @param ref
* the {@link BytesRef} to test
* @param suffix
* the expected suffix
* @return Returns <code>true</code> iff the ref ends with the given suffix.
* Otherwise <code>false</code>.
*/
public static boolean endsWith(BytesRef ref, BytesRef suffix) {
int startAt = ref.length - suffix.length;
// not long enough to start with the suffix
if (startAt < 0) {
return false;
}
return FutureArrays.equals(ref.bytes, ref.offset + startAt, ref.offset + startAt + suffix.length,
suffix.bytes, suffix.offset, suffix.offset + suffix.length);
}
/** Pass this as the seed to {@link #murmurhash3_x86_32}. */
// Poached from Guava: set a different salt/seed
// for each JVM instance, to frustrate hash key collision
// denial of service attacks, and to catch any places that
// somehow rely on hash function/order across JVM
// instances:
public static final int GOOD_FAST_HASH_SEED;
static {
String prop = System.getProperty("tests.seed");
if (prop != null) {
// So if there is a test failure that relied on hash
// order, we remain reproducible based on the test seed:
GOOD_FAST_HASH_SEED = prop.hashCode();
} else {
GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
}
}
/** Returns the MurmurHash3_x86_32 hash.
* Original source/tests at https://github.com/yonik/java_util/
*/
@SuppressWarnings("fallthrough")
public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593;
int h1 = seed;
int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
for (int i=offset; i<roundedEnd; i+=4) {
// little endian load order
int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
k1 *= c1;
k1 = Integer.rotateLeft(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = Integer.rotateLeft(h1, 13);
h1 = h1*5+0xe6546b64;
}
// tail
int k1 = 0;
switch(len & 0x03) {
case 3:
k1 = (data[roundedEnd + 2] & 0xff) << 16;
// fallthrough
case 2:
k1 |= (data[roundedEnd + 1] & 0xff) << 8;
// fallthrough
case 1:
k1 |= (data[roundedEnd] & 0xff);
k1 *= c1;
k1 = Integer.rotateLeft(k1, 15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= len;
// fmix(h1);
h1 ^= h1 >>> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >>> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >>> 16;
return h1;
}
public static int murmurhash3_x86_32(BytesRef bytes, int seed) {
return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
}
// Holds 128 bit unsigned value:
private static BigInteger nextId;
private static final BigInteger mask128;
private static final Object idLock = new Object();
static {
// 128 bit unsigned mask
byte[] maskBytes128 = new byte[16];
Arrays.fill(maskBytes128, (byte) 0xff);
mask128 = new BigInteger(1, maskBytes128);
String prop = System.getProperty("tests.seed");
// State for xorshift128:
long x0;
long x1;
if (prop != null) {
// So if there is a test failure that somehow relied on this id,
// we remain reproducible based on the test seed:
if (prop.length() > 8) {
prop = prop.substring(prop.length()-8);
}
x0 = Long.parseLong(prop, 16);
x1 = x0;
} else {
// seed from /dev/urandom, if its available
try (DataInputStream is = new DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) {
x0 = is.readLong();
x1 = is.readLong();
} catch (Exception unavailable) {
// may not be available on this platform
// fall back to lower quality randomness from 3 different sources:
x0 = System.nanoTime();
x1 = StringHelper.class.hashCode() << 32;
StringBuilder sb = new StringBuilder();
// Properties can vary across JVM instances:
try {
Properties p = System.getProperties();
for (String s: p.stringPropertyNames()) {
sb.append(s);
sb.append(p.getProperty(s));
}
x1 |= sb.toString().hashCode();
} catch (SecurityException notallowed) {
// getting Properties requires wildcard read-write: may not be allowed
x1 |= StringBuffer.class.hashCode();
}
}
}
// Use a few iterations of xorshift128 to scatter the seed
// in case multiple Lucene instances starting up "near" the same
// nanoTime, since we use ++ (mod 2^128) for full period cycle:
for(int i=0;i<10;i++) {
long s1 = x0;
long s0 = x1;
x0 = s0;
s1 ^= s1 << 23; // a
x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
}
// 64-bit unsigned mask
byte[] maskBytes64 = new byte[8];
Arrays.fill(maskBytes64, (byte) 0xff);
BigInteger mask64 = new BigInteger(1, maskBytes64);
// First make unsigned versions of x0, x1:
BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);
// Concatentate bits of x0 and x1, as unsigned 128 bit integer:
nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
}
/** length in bytes of an ID */
public static final int ID_LENGTH = 16;
/** Generates a non-cryptographic globally unique id. */
public static byte[] randomId() {
// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
//
// * It's overkill for our usage: it tries to be cryptographically
// secure, whereas for this use we don't care if someone can
// guess the IDs.
//
// * It uses SecureRandom, which on Linux can easily take a long time
// (I saw ~ 10 seconds just running a Lucene test) when entropy
// harvesting is falling behind.
//
// * It loses a few (6) bits to version and variant and it's not clear
// what impact that has on the period, whereas the simple ++ (mod 2^128)
// we use here is guaranteed to have the full period.
byte bits[];
synchronized(idLock) {
bits = nextId.toByteArray();
nextId = nextId.add(BigInteger.ONE).and(mask128);
}
// toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
if (bits.length > ID_LENGTH) {
assert bits.length == ID_LENGTH + 1;
assert bits[0] == 0;
return ArrayUtil.copyOfSubArray(bits, 1, bits.length);
} else {
byte[] result = new byte[ID_LENGTH];
System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
return result;
}
}
/**
* Helper method to render an ID as a string, for debugging
* <p>
* Returns the string {@code (null)} if the id is null.
* Otherwise, returns a string representation for debugging.
* Never throws an exception. The returned string may
* indicate if the id is definitely invalid.
*/
public static String idToString(byte id[]) {
if (id == null) {
return "(null)";
} else {
StringBuilder sb = new StringBuilder();
sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
if (id.length != ID_LENGTH) {
sb.append(" (INVALID FORMAT)");
}
return sb.toString();
}
}
/** Just converts each int in the incoming {@link IntsRef} to each byte
* in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
* if any int value is out of bounds for a byte. */
public static BytesRef intsRefToBytesRef(IntsRef ints) {
byte[] bytes = new byte[ints.length];
for(int i=0;i<ints.length;i++) {
int x = ints.ints[ints.offset+i];
if (x < 0 || x > 255) {
throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
}
bytes[i] = (byte) x;
}
return new BytesRef(bytes);
}
}