src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.runtime.compress.estim.sample;

 import org.apache.sysds.runtime.compress.utils.Bitmap;

 public class SmoothedJackknifeEstimator {

 	/**
 	 * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. Sampling-Based Estimation of the Number of
 	 * Distinct Values of an Attribute. VLDB'95, Section 4.3.
 	 *
 	 * @param ubm        The Uncompressed Bitmap containing the data from the sample
 	 * @param nRows      The original number of rows in the entire input
 	 * @param sampleSize The number of rows in the sample
 	 * @return Estimate of the number of distinct values
 	 */
 	public static int get(Bitmap ubm, int nRows, int sampleSize) {
 		int numVals = ubm.getNumValues();
 		int[] freqCounts = FrequencyCount.get(ubm);
 		// all values in the sample are zeros
 		if(freqCounts.length == 0)
 			return 0;
 		// nRows is N and sampleSize is n

 		int d = numVals;
 		double f1 = freqCounts[0];
 		int Nn = nRows * sampleSize;
 		double D0 = (d - f1 / sampleSize) / (1 - (nRows - sampleSize + 1) * f1 / Nn);
 		double NTilde = nRows / D0;
 		/*-
 		 *
 		 * h (as defined in eq. 5 in the paper) can be implemented as:
 		 *
 		 * double h = Gamma(nRows - NTilde + 1) x Gamma.gamma(nRows -sampleSize + 1)
 		 * 		     ----------------------------------------------------------------
 		 *  		Gamma.gamma(nRows - sampleSize - NTilde + 1) x Gamma.gamma(nRows + 1)
 		 *
 		 *
 		 * However, for large values of nRows, Gamma.gamma returns NAN
 		 * (factorial of a very large number).
 		 *
 		 * The following implementation solves this problem by levaraging the
 		 * cancelations that show up when expanding the factorials in the
 		 * numerator and the denominator.
 		 *
 		 *
 		 * 		min(A,D-1) x [min(A,D-1) -1] x .... x B
 		 * h = -------------------------------------------
 		 * 		C x [C-1] x .... x max(A+1,D)
 		 *
 		 * where A = N-\tilde{N}
 		 *       B = N-\tilde{N} - n + a
 		 *       C = N
 		 *       D = N-n+1
 		 *
 		 *
 		 *
 		 */
 		double A = nRows - NTilde;
 		double B = A - sampleSize + 1;
 		double C = nRows;
 		double D = nRows - sampleSize + 1;
 		A = Math.min(A, D - 1);
 		D = Math.max(A + 1, D);
 		double h = 1;

 		for(; A >= B || C >= D; A--, C--) {
 			if(A >= B)
 				h *= A;
 			if(C >= D)
 				h /= C;
 		}
 		// end of h computation

 		double g = 0, gamma = 0;
 		// k here corresponds to k+1 in the paper (the +1 comes from replacing n
 		// with n-1)
 		for(int k = 2; k <= sampleSize + 1; k++) {
 			g += 1.0 / (nRows - NTilde - sampleSize + k);
 		}
 		for(int i = 1; i <= freqCounts.length; i++) {
 			gamma += i * (i - 1) * freqCounts[i - 1];
 		}
 		gamma *= (nRows - 1) * D0 / Nn / (sampleSize - 1);
 		gamma += D0 / nRows - 1;

 		double estimate = (d + nRows * h * g * gamma) / (1 - (nRows - NTilde - sampleSize + 1) * f1 / Nn);
 		return estimate < 1 ? 1 : (int) Math.round(estimate);
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.sysds.runtime.compress.estim.sample;

	import org.apache.sysds.runtime.compress.utils.Bitmap;

	public class SmoothedJackknifeEstimator {

	/**
	* Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. Sampling-Based Estimation of the Number of
	* Distinct Values of an Attribute. VLDB'95, Section 4.3.
	*
	* @param ubm The Uncompressed Bitmap containing the data from the sample
	* @param nRows The original number of rows in the entire input
	* @param sampleSize The number of rows in the sample
	* @return Estimate of the number of distinct values
	*/
	public static int get(Bitmap ubm, int nRows, int sampleSize) {
	int numVals = ubm.getNumValues();
	int[] freqCounts = FrequencyCount.get(ubm);
	// all values in the sample are zeros
	if(freqCounts.length == 0)
	return 0;
	// nRows is N and sampleSize is n

	int d = numVals;
	double f1 = freqCounts[0];
	int Nn = nRows * sampleSize;
	double D0 = (d - f1 / sampleSize) / (1 - (nRows - sampleSize + 1) * f1 / Nn);
	double NTilde = nRows / D0;
	/*-
	*
	* h (as defined in eq. 5 in the paper) can be implemented as:
	*
	* double h = Gamma(nRows - NTilde + 1) x Gamma.gamma(nRows -sampleSize + 1)
	* ----------------------------------------------------------------
	* Gamma.gamma(nRows - sampleSize - NTilde + 1) x Gamma.gamma(nRows + 1)
	*
	*
	* However, for large values of nRows, Gamma.gamma returns NAN
	* (factorial of a very large number).
	*
	* The following implementation solves this problem by levaraging the
	* cancelations that show up when expanding the factorials in the
	* numerator and the denominator.
	*
	*
	* min(A,D-1) x [min(A,D-1) -1] x .... x B
	* h = -------------------------------------------
	* C x [C-1] x .... x max(A+1,D)
	*
	* where A = N-\tilde{N}
	* B = N-\tilde{N} - n + a
	* C = N
	* D = N-n+1
	*
	*
	*
	*/
	double A = nRows - NTilde;
	double B = A - sampleSize + 1;
	double C = nRows;
	double D = nRows - sampleSize + 1;
	A = Math.min(A, D - 1);
	D = Math.max(A + 1, D);
	double h = 1;

	for(; A >= B \|\| C >= D; A--, C--) {
	if(A >= B)
	h *= A;
	if(C >= D)
	h /= C;
	}
	// end of h computation

	double g = 0, gamma = 0;
	// k here corresponds to k+1 in the paper (the +1 comes from replacing n
	// with n-1)
	for(int k = 2; k <= sampleSize + 1; k++) {
	g += 1.0 / (nRows - NTilde - sampleSize + k);
	}
	for(int i = 1; i <= freqCounts.length; i++) {
	gamma += i * (i - 1) * freqCounts[i - 1];
	}
	gamma = (nRows - 1) D0 / Nn / (sampleSize - 1);
	gamma += D0 / nRows - 1;

	double estimate = (d + nRows * h * g * gamma) / (1 - (nRows - NTilde - sampleSize + 1) * f1 / Nn);
	return estimate < 1 ? 1 : (int) Math.round(estimate);
	}
	}