blob: b5536e2b66abe97905a37892cd17939fead3527a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.estim.sample;
import org.apache.sysds.runtime.compress.utils.Bitmap;
public class SmoothedJackknifeEstimator {
/**
* Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. Sampling-Based Estimation of the Number of
* Distinct Values of an Attribute. VLDB'95, Section 4.3.
*
* @param ubm The Uncompressed Bitmap containing the data from the sample
* @param nRows The original number of rows in the entire input
* @param sampleSize The number of rows in the sample
* @return Estimate of the number of distinct values
*/
public static int get(Bitmap ubm, int nRows, int sampleSize) {
int numVals = ubm.getNumValues();
int[] freqCounts = FrequencyCount.get(ubm);
// all values in the sample are zeros
if(freqCounts.length == 0)
return 0;
// nRows is N and sampleSize is n
int d = numVals;
double f1 = freqCounts[0];
int Nn = nRows * sampleSize;
double D0 = (d - f1 / sampleSize) / (1 - (nRows - sampleSize + 1) * f1 / Nn);
double NTilde = nRows / D0;
/*-
*
* h (as defined in eq. 5 in the paper) can be implemented as:
*
* double h = Gamma(nRows - NTilde + 1) x Gamma.gamma(nRows -sampleSize + 1)
* ----------------------------------------------------------------
* Gamma.gamma(nRows - sampleSize - NTilde + 1) x Gamma.gamma(nRows + 1)
*
*
* However, for large values of nRows, Gamma.gamma returns NAN
* (factorial of a very large number).
*
* The following implementation solves this problem by levaraging the
* cancelations that show up when expanding the factorials in the
* numerator and the denominator.
*
*
* min(A,D-1) x [min(A,D-1) -1] x .... x B
* h = -------------------------------------------
* C x [C-1] x .... x max(A+1,D)
*
* where A = N-\tilde{N}
* B = N-\tilde{N} - n + a
* C = N
* D = N-n+1
*
*
*
*/
double A = nRows - NTilde;
double B = A - sampleSize + 1;
double C = nRows;
double D = nRows - sampleSize + 1;
A = Math.min(A, D - 1);
D = Math.max(A + 1, D);
double h = 1;
for(; A >= B || C >= D; A--, C--) {
if(A >= B)
h *= A;
if(C >= D)
h /= C;
}
// end of h computation
double g = 0, gamma = 0;
// k here corresponds to k+1 in the paper (the +1 comes from replacing n
// with n-1)
for(int k = 2; k <= sampleSize + 1; k++) {
g += 1.0 / (nRows - NTilde - sampleSize + k);
}
for(int i = 1; i <= freqCounts.length; i++) {
gamma += i * (i - 1) * freqCounts[i - 1];
}
gamma *= (nRows - 1) * D0 / Nn / (sampleSize - 1);
gamma += D0 / nRows - 1;
double estimate = (d + nRows * h * g * gamma) / (1 - (nRows - NTilde - sampleSize + 1) * f1 / Nn);
return estimate < 1 ? 1 : (int) Math.round(estimate);
}
}