solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.util.hll;

 /**
  * Static functions for computing constants and parameters used in the HLL
  * algorithm.
  */
 final class HLLUtil {
     /**
      * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
      * Calculated with this formula:
      * <pre>
      *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
      *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
      *     return ~((1L << (maxRegisterValue - 1)) - 1);
      * </pre>
      *
      * @see #pwMaxMask(int)
      */
     private static final long[] PW_MASK = {
             ~((1L << (((1 << 0) - 1) - 1)) - 1),
             ~((1L << (((1 << 1) - 1) - 1)) - 1),
             ~((1L << (((1 << 2) - 1) - 1)) - 1),
             ~((1L << (((1 << 3) - 1) - 1)) - 1),
             ~((1L << (((1 << 4) - 1) - 1)) - 1),
             ~((1L << (((1 << 5) - 1) - 1)) - 1),
             ~((1L << (((1 << 6) - 1) - 1)) - 1),
             ~((1L << (((1 << 7) - 1) - 1)) - 1),
             ~((1L << (((1 << 8) - 1) - 1)) - 1)
     };

     /**
      * Precomputed <code>twoToL</code> values indexed by a linear combination of
      * <code>regWidth</code> and <code>log2m</code>.
      *
      * The array is one-dimensional and can be accessed by using index
      * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
      * for <code>regWidth</code> and <code>log2m</code> between the specified
      * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
      *
      * @see #largeEstimator(int, int, double)
      * @see #largeEstimatorCutoff(int, int)
      * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
      */
     private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

     /**
      * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
      */
     private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;

     static {
         for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
             for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
                 int maxRegisterValue = (1 << regWidth) - 1;

                 // Since 1 is added to p(w) in the insertion algorithm, only
                 // (maxRegisterValue - 1) bits are inspected hence the hash
                 // space is one power of two smaller.
                 final int pwBits = (maxRegisterValue - 1);
                 final int totalBits = (pwBits + log2m);
                 final double twoToL = Math.pow(2, totalBits);
                 TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
             }
         }
     }

     // ************************************************************************
     /**
      * Computes the bit-width of HLL registers necessary to estimate a set of
      * the specified cardinality.
      *
      * @param  expectedUniqueElements an upper bound on the number of unique
      *         elements that are expected.  This must be greater than zero.
      * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
      */
     public static int registerBitSize(final long expectedUniqueElements) {
         return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
                         (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
     }

     // ========================================================================
     /**
      * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
      *
      * @param  m this must be a power of two, cannot be less than
      *         16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
      * @return gamma times <code>registerCount</code> squared where gamma is
      *         based on the value of <code>registerCount</code>.
      * @throws IllegalArgumentException if <code>registerCount</code> is less
      *         than 16.
      */
     public static double alphaMSquared(final int m) {
         switch(m) {
             case 1/*2^0*/:
             case 2/*2^1*/:
             case 4/*2^2*/:
             case 8/*2^3*/:
                 throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");

             case 16/*2^4*/:
                 return 0.673 * m * m;

             case 32/*2^5*/:
                 return 0.697 * m * m;

             case 64/*2^6*/:
                 return 0.709 * m * m;

             default/*>2^6*/:
                 return (0.7213 / (1.0 + 1.079 / m)) * m * m;
         }
     }

     // ========================================================================
     /**
      * Computes a mask that prevents overflow of HyperLogLog registers.
      *
      * @param  registerSizeInBits the size of the HLL registers, in bits.
      * @return mask a <code>long</code> mask to prevent overflow of the registers
      * @see #registerBitSize(long)
      */
     public static long pwMaxMask(final int registerSizeInBits) {
         return PW_MASK[registerSizeInBits];
     }

     // ========================================================================
     /**
      * The cutoff for using the "small range correction" formula, in the
      * HyperLogLog algorithm.
      *
      * @param  m the number of registers in the HLL. <em>m<em> in the paper.
      * @return the cutoff for the small range correction.
      * @see #smallEstimator(int, int)
      */
     public static double smallEstimatorCutoff(final int m) {
         return ((double)m * 5) / 2;
     }

     /**
      * The "small range correction" formula from the HyperLogLog algorithm. Only
      * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
      * there are still registers that have the zero value.
      *
      * @param  m the number of registers in the HLL. <em>m<em> in the paper.
      * @param  numberOfZeroes the number of registers with value zero. <em>V</em>
      *         in the paper.
      * @return a corrected cardinality estimate.
      */
     public static double smallEstimator(final int m, final int numberOfZeroes) {
         return m * Math.log((double)m / numberOfZeroes);
     }

     /**
      * The cutoff for using the "large range correction" formula, from the
      * HyperLogLog algorithm, adapted for 64 bit hashes.
      *
      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
      * @param  registerSizeInBits the size of the HLL registers, in bits.
      * @return the cutoff for the large range correction.
      * @see #largeEstimator(int, int, double)
      * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
      */
     public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
         return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
     }

     /**
      * The "large range correction" formula from the HyperLogLog algorithm, adapted
      * for 64 bit hashes. Only appropriate for estimators whose value exceeds
      * the return of {@link #largeEstimatorCutoff(int, int)}.
      *
      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
      * @param  registerSizeInBits the size of the HLL registers, in bits.
      * @param  estimator the original estimator ("E" in the paper).
      * @return a corrected cardinality estimate.
      * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
      */
     public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
         final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
         return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.util.hll;

	/**
	* Static functions for computing constants and parameters used in the HLL
	* algorithm.
	*/
	final class HLLUtil {
	/**
	* Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
	* Calculated with this formula:
	* <pre>
	* int maxRegisterValue = (1 << registerSizeInBits) - 1;
	* // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
	* return ~((1L << (maxRegisterValue - 1)) - 1);
	* </pre>
	*
	* @see #pwMaxMask(int)
	*/
	private static final long[] PW_MASK = {
	~((1L << (((1 << 0) - 1) - 1)) - 1),
	~((1L << (((1 << 1) - 1) - 1)) - 1),
	~((1L << (((1 << 2) - 1) - 1)) - 1),
	~((1L << (((1 << 3) - 1) - 1)) - 1),
	~((1L << (((1 << 4) - 1) - 1)) - 1),
	~((1L << (((1 << 5) - 1) - 1)) - 1),
	~((1L << (((1 << 6) - 1) - 1)) - 1),
	~((1L << (((1 << 7) - 1) - 1)) - 1),
	~((1L << (((1 << 8) - 1) - 1)) - 1)
	};

	/**
	* Precomputed <code>twoToL</code> values indexed by a linear combination of
	* <code>regWidth</code> and <code>log2m</code>.
	*
	* The array is one-dimensional and can be accessed by using index
	* <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
	* for <code>regWidth</code> and <code>log2m</code> between the specified
	* <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
	*
	* @see #largeEstimator(int, int, double)
	* @see #largeEstimatorCutoff(int, int)
	* @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
	*/
	private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

	/**
	* Spacing constant used to compute offsets into {@link #TWO_TO_L}.
	*/
	private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;

	static {
	for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
	for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
	int maxRegisterValue = (1 << regWidth) - 1;

	// Since 1 is added to p(w) in the insertion algorithm, only
	// (maxRegisterValue - 1) bits are inspected hence the hash
	// space is one power of two smaller.
	final int pwBits = (maxRegisterValue - 1);
	final int totalBits = (pwBits + log2m);
	final double twoToL = Math.pow(2, totalBits);
	TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
	}
	}
	}

	// ************************************************************************
	/**
	* Computes the bit-width of HLL registers necessary to estimate a set of
	* the specified cardinality.
	*
	* @param expectedUniqueElements an upper bound on the number of unique
	* elements that are expected. This must be greater than zero.
	* @return a register size in bits (i.e. <code>log2(log2(n))</code>)
	*/
	public static int registerBitSize(final long expectedUniqueElements) {
	return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
	(int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
	}

	// ========================================================================
	/**
	* Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
	*
	* @param m this must be a power of two, cannot be less than
	* 16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
	* @return gamma times <code>registerCount</code> squared where gamma is
	* based on the value of <code>registerCount</code>.
	* @throws IllegalArgumentException if <code>registerCount</code> is less
	* than 16.
	*/
	public static double alphaMSquared(final int m) {
	switch(m) {
	case 1/2^0/:
	case 2/2^1/:
	case 4/2^2/:
	case 8/2^3/:
	throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");

	case 16/2^4/:
	return 0.673 * m * m;

	case 32/2^5/:
	return 0.697 * m * m;

	case 64/2^6/:
	return 0.709 * m * m;

	default/>2^6/:
	return (0.7213 / (1.0 + 1.079 / m)) * m * m;
	}
	}

	// ========================================================================
	/**
	* Computes a mask that prevents overflow of HyperLogLog registers.
	*
	* @param registerSizeInBits the size of the HLL registers, in bits.
	* @return mask a <code>long</code> mask to prevent overflow of the registers
	* @see #registerBitSize(long)
	*/
	public static long pwMaxMask(final int registerSizeInBits) {
	return PW_MASK[registerSizeInBits];
	}

	// ========================================================================
	/**
	* The cutoff for using the "small range correction" formula, in the
	* HyperLogLog algorithm.
	*
	* @param m the number of registers in the HLL. <em>m<em> in the paper.
	* @return the cutoff for the small range correction.
	* @see #smallEstimator(int, int)
	*/
	public static double smallEstimatorCutoff(final int m) {
	return ((double)m * 5) / 2;
	}

	/**
	* The "small range correction" formula from the HyperLogLog algorithm. Only
	* appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
	* there are still registers that have the zero value.
	*
	* @param m the number of registers in the HLL. <em>m<em> in the paper.
	* @param numberOfZeroes the number of registers with value zero. <em>V</em>
	* in the paper.
	* @return a corrected cardinality estimate.
	*/
	public static double smallEstimator(final int m, final int numberOfZeroes) {
	return m * Math.log((double)m / numberOfZeroes);
	}

	/**
	* The cutoff for using the "large range correction" formula, from the
	* HyperLogLog algorithm, adapted for 64 bit hashes.
	*
	* @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
	* @param registerSizeInBits the size of the HLL registers, in bits.
	* @return the cutoff for the large range correction.
	* @see #largeEstimator(int, int, double)
	* @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
	*/
	public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
	return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
	}

	/**
	* The "large range correction" formula from the HyperLogLog algorithm, adapted
	* for 64 bit hashes. Only appropriate for estimators whose value exceeds
	* the return of {@link #largeEstimatorCutoff(int, int)}.
	*
	* @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
	* @param registerSizeInBits the size of the HLL registers, in bits.
	* @param estimator the original estimator ("E" in the paper).
	* @return a corrected cardinality estimate.
	* @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
	*/
	public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
	final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
	return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
	}
	}