lucene/core/src/java/org/apache/lucene/util/StringHelper.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util;


 import java.io.DataInputStream;
 import java.math.BigInteger;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Properties;

 /**
  * Methods for manipulating strings.
  *
  * @lucene.internal
  */
 public abstract class StringHelper {

   /**
    * Compares two {@link BytesRef}, element by element, and returns the
    * number of elements common to both arrays (from the start of each).
    * This method assumes currentTerm comes after priorTerm.
    *
    * @param priorTerm The first {@link BytesRef} to compare
    * @param currentTerm The second {@link BytesRef} to compare
    * @return The number of common elements (from the start of each).
    */
   public static int bytesDifference(BytesRef priorTerm, BytesRef currentTerm) {
     int mismatch = FutureArrays.mismatch(priorTerm.bytes, priorTerm.offset, priorTerm.offset + priorTerm.length,
                                          currentTerm.bytes, currentTerm.offset, currentTerm.offset + currentTerm.length);
     if (mismatch < 0) {
       throw new IllegalArgumentException("terms out of order: priorTerm=" + priorTerm + ",currentTerm=" + currentTerm);
     }
     return mismatch;
   }

   /**
    * Returns the length of {@code currentTerm} needed for use as a sort key.
    * so that {@link BytesRef#compareTo(BytesRef)} still returns the same result.
    * This method assumes currentTerm comes after priorTerm.
    */
   public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) {
     return bytesDifference(priorTerm, currentTerm) + 1;
   }

   private StringHelper() {
   }

   /**
    * Returns <code>true</code> iff the ref starts with the given prefix.
    * Otherwise <code>false</code>.
    *
    * @param ref
    *         the {@code byte[]} to test
    * @param prefix
    *         the expected prefix
    * @return Returns <code>true</code> iff the ref starts with the given prefix.
    *         Otherwise <code>false</code>.
    */
   public static boolean startsWith(byte[] ref, BytesRef prefix) {
     // not long enough to start with the prefix
     if (ref.length < prefix.length) {
       return false;
     }
     return FutureArrays.equals(ref, 0, prefix.length,
                                prefix.bytes, prefix.offset, prefix.offset + prefix.length);
   }

   /**
    * Returns <code>true</code> iff the ref starts with the given prefix.
    * Otherwise <code>false</code>.
    *
    * @param ref
    *          the {@link BytesRef} to test
    * @param prefix
    *          the expected prefix
    * @return Returns <code>true</code> iff the ref starts with the given prefix.
    *         Otherwise <code>false</code>.
    */
   public static boolean startsWith(BytesRef ref, BytesRef prefix) {
     // not long enough to start with the prefix
     if (ref.length < prefix.length) {
       return false;
     }
     return FutureArrays.equals(ref.bytes, ref.offset, ref.offset + prefix.length,
                                prefix.bytes, prefix.offset, prefix.offset + prefix.length);
   }

   /**
    * Returns <code>true</code> iff the ref ends with the given suffix. Otherwise
    * <code>false</code>.
    *
    * @param ref
    *          the {@link BytesRef} to test
    * @param suffix
    *          the expected suffix
    * @return Returns <code>true</code> iff the ref ends with the given suffix.
    *         Otherwise <code>false</code>.
    */
   public static boolean endsWith(BytesRef ref, BytesRef suffix) {
     int startAt = ref.length - suffix.length;
     // not long enough to start with the suffix
     if (startAt < 0) {
       return false;
     }
     return FutureArrays.equals(ref.bytes, ref.offset + startAt, ref.offset + startAt + suffix.length,
                                suffix.bytes, suffix.offset, suffix.offset + suffix.length);
   }

   /** Pass this as the seed to {@link #murmurhash3_x86_32}. */

   // Poached from Guava: set a different salt/seed
   // for each JVM instance, to frustrate hash key collision
   // denial of service attacks, and to catch any places that
   // somehow rely on hash function/order across JVM
   // instances:
   public static final int GOOD_FAST_HASH_SEED;

   static {
     String prop = System.getProperty("tests.seed");
     if (prop != null) {
       // So if there is a test failure that relied on hash
       // order, we remain reproducible based on the test seed:
       GOOD_FAST_HASH_SEED = prop.hashCode();
     } else {
       GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
     }
   }

   /** Returns the MurmurHash3_x86_32 hash.
    * Original source/tests at https://github.com/yonik/java_util/
    */
   @SuppressWarnings("fallthrough")
   public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {

     final int c1 = 0xcc9e2d51;
     final int c2 = 0x1b873593;

     int h1 = seed;
     int roundedEnd = offset + (len & 0xfffffffc);  // round down to 4 byte block

     for (int i=offset; i<roundedEnd; i+=4) {
       // little endian load order
       int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
       k1 *= c1;
       k1 = Integer.rotateLeft(k1, 15);
       k1 *= c2;

       h1 ^= k1;
       h1 = Integer.rotateLeft(h1, 13);
       h1 = h1*5+0xe6546b64;
     }

     // tail
     int k1 = 0;

     switch(len & 0x03) {
       case 3:
         k1 = (data[roundedEnd + 2] & 0xff) << 16;
         // fallthrough
       case 2:
         k1 |= (data[roundedEnd + 1] & 0xff) << 8;
         // fallthrough
       case 1:
         k1 |= (data[roundedEnd] & 0xff);
         k1 *= c1;
         k1 = Integer.rotateLeft(k1, 15);
         k1 *= c2;
         h1 ^= k1;
     }

     // finalization
     h1 ^= len;

     // fmix(h1);
     h1 ^= h1 >>> 16;
     h1 *= 0x85ebca6b;
     h1 ^= h1 >>> 13;
     h1 *= 0xc2b2ae35;
     h1 ^= h1 >>> 16;

     return h1;
   }

   public static int murmurhash3_x86_32(BytesRef bytes, int seed) {
     return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
   }

   // Holds 128 bit unsigned value:
   private static BigInteger nextId;
   private static final BigInteger mask128;
   private static final Object idLock = new Object();

   static {
     // 128 bit unsigned mask
     byte[] maskBytes128 = new byte[16];
     Arrays.fill(maskBytes128, (byte) 0xff);
     mask128 = new BigInteger(1, maskBytes128);

     String prop = System.getProperty("tests.seed");

     // State for xorshift128:
     long x0;
     long x1;

     if (prop != null) {
       // So if there is a test failure that somehow relied on this id,
       // we remain reproducible based on the test seed:
       if (prop.length() > 8) {
         prop = prop.substring(prop.length()-8);
       }
       x0 = Long.parseLong(prop, 16);
       x1 = x0;
     } else {
       // seed from /dev/urandom, if its available
       try (DataInputStream is = new DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) {
         x0 = is.readLong();
         x1 = is.readLong();
       } catch (Exception unavailable) {
         // may not be available on this platform
         // fall back to lower quality randomness from 3 different sources:
         x0 = System.nanoTime();
         x1 = StringHelper.class.hashCode() << 32;

         StringBuilder sb = new StringBuilder();
         // Properties can vary across JVM instances:
         try {
           Properties p = System.getProperties();
           for (String s: p.stringPropertyNames()) {
             sb.append(s);
             sb.append(p.getProperty(s));
           }
           x1 |= sb.toString().hashCode();
         } catch (SecurityException notallowed) {
           // getting Properties requires wildcard read-write: may not be allowed
           x1 |= StringBuffer.class.hashCode();
         }
       }
     }

     // Use a few iterations of xorshift128 to scatter the seed
     // in case multiple Lucene instances starting up "near" the same
     // nanoTime, since we use ++ (mod 2^128) for full period cycle:
     for(int i=0;i<10;i++) {
       long s1 = x0;
       long s0 = x1;
       x0 = s0;
       s1 ^= s1 << 23; // a
       x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
     }

     // 64-bit unsigned mask
     byte[] maskBytes64 = new byte[8];
     Arrays.fill(maskBytes64, (byte) 0xff);
     BigInteger mask64 = new BigInteger(1, maskBytes64);

     // First make unsigned versions of x0, x1:
     BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
     BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);

     // Concatentate bits of x0 and x1, as unsigned 128 bit integer:
     nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
   }

   /** length in bytes of an ID */
   public static final int ID_LENGTH = 16;

   /** Generates a non-cryptographic globally unique id. */
   public static byte[] randomId() {

     // NOTE: we don't use Java's UUID.randomUUID() implementation here because:
     //
     //   * It's overkill for our usage: it tries to be cryptographically
     //     secure, whereas for this use we don't care if someone can
     //     guess the IDs.
     //
     //   * It uses SecureRandom, which on Linux can easily take a long time
     //     (I saw ~ 10 seconds just running a Lucene test) when entropy
     //     harvesting is falling behind.
     //
     //   * It loses a few (6) bits to version and variant and it's not clear
     //     what impact that has on the period, whereas the simple ++ (mod 2^128)
     //     we use here is guaranteed to have the full period.

     byte bits[];
     synchronized(idLock) {
       bits = nextId.toByteArray();
       nextId = nextId.add(BigInteger.ONE).and(mask128);
     }

     // toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
     if (bits.length > ID_LENGTH) {
       assert bits.length == ID_LENGTH + 1;
       assert bits[0] == 0;
       return ArrayUtil.copyOfSubArray(bits, 1, bits.length);
     } else {
       byte[] result = new byte[ID_LENGTH];
       System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
       return result;
     }
   }

   /**
    * Helper method to render an ID as a string, for debugging
    * <p>
    * Returns the string {@code (null)} if the id is null.
    * Otherwise, returns a string representation for debugging.
    * Never throws an exception. The returned string may
    * indicate if the id is definitely invalid.
    */
   public static String idToString(byte id[]) {
     if (id == null) {
       return "(null)";
     } else {
       StringBuilder sb = new StringBuilder();
       sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
       if (id.length != ID_LENGTH) {
         sb.append(" (INVALID FORMAT)");
       }
       return sb.toString();
     }
   }

   /** Just converts each int in the incoming {@link IntsRef} to each byte
    *  in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
    *  if any int value is out of bounds for a byte. */
   public static BytesRef intsRefToBytesRef(IntsRef ints) {
     byte[] bytes = new byte[ints.length];
     for(int i=0;i<ints.length;i++) {
       int x = ints.ints[ints.offset+i];
       if (x < 0 || x > 255) {
         throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
       }
       bytes[i] = (byte) x;
     }

     return new BytesRef(bytes);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util;


	import java.io.DataInputStream;
	import java.math.BigInteger;
	import java.nio.file.Files;
	import java.nio.file.Paths;
	import java.util.Arrays;
	import java.util.Properties;

	/**
	* Methods for manipulating strings.
	*
	* @lucene.internal
	*/
	public abstract class StringHelper {

	/**
	* Compares two {@link BytesRef}, element by element, and returns the
	* number of elements common to both arrays (from the start of each).
	* This method assumes currentTerm comes after priorTerm.
	*
	* @param priorTerm The first {@link BytesRef} to compare
	* @param currentTerm The second {@link BytesRef} to compare
	* @return The number of common elements (from the start of each).
	*/
	public static int bytesDifference(BytesRef priorTerm, BytesRef currentTerm) {
	int mismatch = FutureArrays.mismatch(priorTerm.bytes, priorTerm.offset, priorTerm.offset + priorTerm.length,
	currentTerm.bytes, currentTerm.offset, currentTerm.offset + currentTerm.length);
	if (mismatch < 0) {
	throw new IllegalArgumentException("terms out of order: priorTerm=" + priorTerm + ",currentTerm=" + currentTerm);
	}
	return mismatch;
	}

	/**
	* Returns the length of {@code currentTerm} needed for use as a sort key.
	* so that {@link BytesRef#compareTo(BytesRef)} still returns the same result.
	* This method assumes currentTerm comes after priorTerm.
	*/
	public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) {
	return bytesDifference(priorTerm, currentTerm) + 1;
	}

	private StringHelper() {
	}

	/**
	* Returns <code>true</code> iff the ref starts with the given prefix.
	* Otherwise <code>false</code>.
	*
	* @param ref
	* the {@code byte[]} to test
	* @param prefix
	* the expected prefix
	* @return Returns <code>true</code> iff the ref starts with the given prefix.
	* Otherwise <code>false</code>.
	*/
	public static boolean startsWith(byte[] ref, BytesRef prefix) {
	// not long enough to start with the prefix
	if (ref.length < prefix.length) {
	return false;
	}
	return FutureArrays.equals(ref, 0, prefix.length,
	prefix.bytes, prefix.offset, prefix.offset + prefix.length);
	}

	/**
	* Returns <code>true</code> iff the ref starts with the given prefix.
	* Otherwise <code>false</code>.
	*
	* @param ref
	* the {@link BytesRef} to test
	* @param prefix
	* the expected prefix
	* @return Returns <code>true</code> iff the ref starts with the given prefix.
	* Otherwise <code>false</code>.
	*/
	public static boolean startsWith(BytesRef ref, BytesRef prefix) {
	// not long enough to start with the prefix
	if (ref.length < prefix.length) {
	return false;
	}
	return FutureArrays.equals(ref.bytes, ref.offset, ref.offset + prefix.length,
	prefix.bytes, prefix.offset, prefix.offset + prefix.length);
	}

	/**
	* Returns <code>true</code> iff the ref ends with the given suffix. Otherwise
	* <code>false</code>.
	*
	* @param ref
	* the {@link BytesRef} to test
	* @param suffix
	* the expected suffix
	* @return Returns <code>true</code> iff the ref ends with the given suffix.
	* Otherwise <code>false</code>.
	*/
	public static boolean endsWith(BytesRef ref, BytesRef suffix) {
	int startAt = ref.length - suffix.length;
	// not long enough to start with the suffix
	if (startAt < 0) {
	return false;
	}
	return FutureArrays.equals(ref.bytes, ref.offset + startAt, ref.offset + startAt + suffix.length,
	suffix.bytes, suffix.offset, suffix.offset + suffix.length);
	}

	/** Pass this as the seed to {@link #murmurhash3_x86_32}. */

	// Poached from Guava: set a different salt/seed
	// for each JVM instance, to frustrate hash key collision
	// denial of service attacks, and to catch any places that
	// somehow rely on hash function/order across JVM
	// instances:
	public static final int GOOD_FAST_HASH_SEED;

	static {
	String prop = System.getProperty("tests.seed");
	if (prop != null) {
	// So if there is a test failure that relied on hash
	// order, we remain reproducible based on the test seed:
	GOOD_FAST_HASH_SEED = prop.hashCode();
	} else {
	GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
	}
	}

	/** Returns the MurmurHash3_x86_32 hash.
	* Original source/tests at https://github.com/yonik/java_util/
	*/
	@SuppressWarnings("fallthrough")
	public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {

	final int c1 = 0xcc9e2d51;
	final int c2 = 0x1b873593;

	int h1 = seed;
	int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block

	for (int i=offset; i<roundedEnd; i+=4) {
	// little endian load order
	int k1 = (data[i] & 0xff) \| ((data[i+1] & 0xff) << 8) \| ((data[i+2] & 0xff) << 16) \| (data[i+3] << 24);
	k1 *= c1;
	k1 = Integer.rotateLeft(k1, 15);
	k1 *= c2;

	h1 ^= k1;
	h1 = Integer.rotateLeft(h1, 13);
	h1 = h1*5+0xe6546b64;
	}

	// tail
	int k1 = 0;

	switch(len & 0x03) {
	case 3:
	k1 = (data[roundedEnd + 2] & 0xff) << 16;
	// fallthrough
	case 2:
	k1 \|= (data[roundedEnd + 1] & 0xff) << 8;
	// fallthrough
	case 1:
	k1 \|= (data[roundedEnd] & 0xff);
	k1 *= c1;
	k1 = Integer.rotateLeft(k1, 15);
	k1 *= c2;
	h1 ^= k1;
	}

	// finalization
	h1 ^= len;

	// fmix(h1);
	h1 ^= h1 >>> 16;
	h1 *= 0x85ebca6b;
	h1 ^= h1 >>> 13;
	h1 *= 0xc2b2ae35;
	h1 ^= h1 >>> 16;

	return h1;
	}

	public static int murmurhash3_x86_32(BytesRef bytes, int seed) {
	return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
	}

	// Holds 128 bit unsigned value:
	private static BigInteger nextId;
	private static final BigInteger mask128;
	private static final Object idLock = new Object();

	static {
	// 128 bit unsigned mask
	byte[] maskBytes128 = new byte[16];
	Arrays.fill(maskBytes128, (byte) 0xff);
	mask128 = new BigInteger(1, maskBytes128);

	String prop = System.getProperty("tests.seed");

	// State for xorshift128:
	long x0;
	long x1;

	if (prop != null) {
	// So if there is a test failure that somehow relied on this id,
	// we remain reproducible based on the test seed:
	if (prop.length() > 8) {
	prop = prop.substring(prop.length()-8);
	}
	x0 = Long.parseLong(prop, 16);
	x1 = x0;
	} else {
	// seed from /dev/urandom, if its available
	try (DataInputStream is = new DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) {
	x0 = is.readLong();
	x1 = is.readLong();
	} catch (Exception unavailable) {
	// may not be available on this platform
	// fall back to lower quality randomness from 3 different sources:
	x0 = System.nanoTime();
	x1 = StringHelper.class.hashCode() << 32;

	StringBuilder sb = new StringBuilder();
	// Properties can vary across JVM instances:
	try {
	Properties p = System.getProperties();
	for (String s: p.stringPropertyNames()) {
	sb.append(s);
	sb.append(p.getProperty(s));
	}
	x1 \|= sb.toString().hashCode();
	} catch (SecurityException notallowed) {
	// getting Properties requires wildcard read-write: may not be allowed
	x1 \|= StringBuffer.class.hashCode();
	}
	}
	}

	// Use a few iterations of xorshift128 to scatter the seed
	// in case multiple Lucene instances starting up "near" the same
	// nanoTime, since we use ++ (mod 2^128) for full period cycle:
	for(int i=0;i<10;i++) {
	long s1 = x0;
	long s0 = x1;
	x0 = s0;
	s1 ^= s1 << 23; // a
	x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
	}

	// 64-bit unsigned mask
	byte[] maskBytes64 = new byte[8];
	Arrays.fill(maskBytes64, (byte) 0xff);
	BigInteger mask64 = new BigInteger(1, maskBytes64);

	// First make unsigned versions of x0, x1:
	BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
	BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);

	// Concatentate bits of x0 and x1, as unsigned 128 bit integer:
	nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
	}

	/** length in bytes of an ID */
	public static final int ID_LENGTH = 16;

	/** Generates a non-cryptographic globally unique id. */
	public static byte[] randomId() {

	// NOTE: we don't use Java's UUID.randomUUID() implementation here because:
	//
	// * It's overkill for our usage: it tries to be cryptographically
	// secure, whereas for this use we don't care if someone can
	// guess the IDs.
	//
	// * It uses SecureRandom, which on Linux can easily take a long time
	// (I saw ~ 10 seconds just running a Lucene test) when entropy
	// harvesting is falling behind.
	//
	// * It loses a few (6) bits to version and variant and it's not clear
	// what impact that has on the period, whereas the simple ++ (mod 2^128)
	// we use here is guaranteed to have the full period.

	byte bits[];
	synchronized(idLock) {
	bits = nextId.toByteArray();
	nextId = nextId.add(BigInteger.ONE).and(mask128);
	}

	// toByteArray() always returns a sign bit, so it may require an extra byte (always zero)
	if (bits.length > ID_LENGTH) {
	assert bits.length == ID_LENGTH + 1;
	assert bits[0] == 0;
	return ArrayUtil.copyOfSubArray(bits, 1, bits.length);
	} else {
	byte[] result = new byte[ID_LENGTH];
	System.arraycopy(bits, 0, result, result.length - bits.length, bits.length);
	return result;
	}
	}

	/**
	* Helper method to render an ID as a string, for debugging
	* <p>
	* Returns the string {@code (null)} if the id is null.
	* Otherwise, returns a string representation for debugging.
	* Never throws an exception. The returned string may
	* indicate if the id is definitely invalid.
	*/
	public static String idToString(byte id[]) {
	if (id == null) {
	return "(null)";
	} else {
	StringBuilder sb = new StringBuilder();
	sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX));
	if (id.length != ID_LENGTH) {
	sb.append(" (INVALID FORMAT)");
	}
	return sb.toString();
	}
	}

	/** Just converts each int in the incoming {@link IntsRef} to each byte
	* in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
	* if any int value is out of bounds for a byte. */
	public static BytesRef intsRefToBytesRef(IntsRef ints) {
	byte[] bytes = new byte[ints.length];
	for(int i=0;i<ints.length;i++) {
	int x = ints.ints[ints.offset+i];
	if (x < 0 \|\| x > 255) {
	throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
	}
	bytes[i] = (byte) x;
	}

	return new BytesRef(bytes);
	}
	}