blob: b745d4ef9c1b8a63fa5a38aabadff9f965ae8b01 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.estim.sample;
import java.util.HashMap;
import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
import org.apache.sysds.runtime.compress.utils.ABitmap;
/**
 * Utility class implementing the Haas and Stokes hybrid estimator of the number of
 * distinct values (classes) in a population, computed from a row sample.
 *
 * All members are static; the class is not meant to be instantiated.
 */
public class HassAndStokes {
	public static final double HAAS_AND_STOKES_ALPHA1 = 0.9; // 0.9 recommended in paper
	public static final double HAAS_AND_STOKES_ALPHA2 = 30; // 30 recommended in paper
	public static final int HAAS_AND_STOKES_UJ2A_C = 50; // 50 recommend in paper
	public static final boolean HAAS_AND_STOKES_UJ2A_CUT2 = true; // cut frequency in half
	public static final boolean HAAS_AND_STOKES_UJ2A_SOLVE = true; // true recommended
	public static final int MAX_SOLVE_CACHE_SIZE = 64 * 1024; // global 2MB cache

	private HassAndStokes() {
		// noninstantiable utility class
	}

	/**
	 * Haas, Peter J., and Lynne Stokes. "Estimating the number of classes in a finite population." Journal of the
	 * American Statistical Association 93.444 (1998): 1475-1487.
	 *
	 * The hybrid estimator given by Eq. 33 in Section 6: depending on the squared coefficient of
	 * variation (gamma), one of three sub-estimators (Duj2, Duj2a, Sh3) is selected.
	 *
	 * @param ubm        The Uncompressed Bit map
	 * @param nRows      The number of rows originally in the input
	 * @param sampleSize The number of rows used in the sample
	 * @param solveCache A Hashmap containing information for getDuj2aEstimate
	 * @return An estimation of distinct elements in the population (at least 1).
	 */
	public static int haasAndStokes(ABitmap ubm, int nRows, int sampleSize, HashMap<Integer, Double> solveCache) {
		// obtain value and frequency histograms
		int numVals = ubm.getNumValues();
		int[] freqCounts = FrequencyCount.get(ubm);

		// all values in the sample are zeros.
		if(numVals == 0)
			return 1;

		// q is the sampling fraction n/N
		double q = ((double) sampleSize) / nRows;
		// f1 is the number of values that occur exactly once in the sample
		double f1 = freqCounts[0];

		// compute basic Duj1 estimate
		double duj1 = getDuj1Estimate(q, f1, sampleSize, numVals);

		// compute gamma based on Duj1
		double gamma = getGammaSquared(duj1, freqCounts, sampleSize, nRows);
		double d = -1;

		// core hybrid estimator based on gamma
		if(gamma < HAAS_AND_STOKES_ALPHA1)
			d = getDuj2Estimate(q, f1, sampleSize, numVals, gamma);
		else if(gamma < HAAS_AND_STOKES_ALPHA2)
			d = getDuj2aEstimate(q, freqCounts, sampleSize, numVals, gamma, nRows, solveCache);
		else
			d = getSh3Estimate(q, freqCounts, numVals);

		// round and ensure min value 1
		return Math.max(1, (int) Math.round(d));
	}

	/**
	 * Computes the "un-smoothed first-order jackknife estimator" (Eq 11).
	 *
	 * @param q  Sampling fraction (sample size / population size)
	 * @param f1 Number of values occurring exactly once in the sample
	 * @param n  Sample size
	 * @param dn Number of distinct values observed in the sample
	 * @return First-order jackknife estimate of the number of distinct values
	 */
	private static double getDuj1Estimate(double q, double f1, int n, int dn) {
		return dn / (1 - ((1 - q) * f1) / n);
	}

	/**
	 * Computes the "un-smoothed second-order jackknife estimator" (Eq 18b).
	 *
	 * @param q         Sampling fraction (sample size / population size)
	 * @param f1        Number of values occurring exactly once in the sample
	 * @param n         Sample size
	 * @param dn        Number of distinct values observed in the sample
	 * @param gammaDuj1 Squared coefficient of variation based on the Duj1 estimate
	 * @return Second-order jackknife estimate of the number of distinct values
	 */
	private static double getDuj2Estimate(double q, double f1, int n, int dn, double gammaDuj1) {
		return (dn - (1 - q) * f1 * Math.log(1 - q) * gammaDuj1 / q) / (1 - ((1 - q) * f1) / n);
	}

	/**
	 * Computes the "un-smoothed second-order jackknife estimator" with additional stabilization procedure, which
	 * removes the classes whose frequency exceed c, computes Duj2 over the reduced sample, and finally adds the removed
	 * frequencies.
	 *
	 * NOTE: this method modifies {@code f} in place (bins at index c-1 and above are zeroed); callers
	 * must not rely on the frequency histogram afterwards.
	 *
	 * @param q          Sampling fraction (sample size / population size)
	 * @param f          Frequency histogram: f[i-1] = number of values occurring exactly i times (modified in place)
	 * @param n          Sample size
	 * @param dn         Number of distinct values observed in the sample
	 * @param gammaDuj1  Squared coefficient of variation based on the Duj1 estimate
	 * @param N          Population size (number of rows in the input)
	 * @param solveCache Cache of method-of-moments solutions, keyed by observed frequency
	 * @return Stabilized second-order jackknife estimate of the number of distinct values
	 */
	private static double getDuj2aEstimate(double q, int[] f, int n, int dn, double gammaDuj1, int N,
		HashMap<Integer, Double> solveCache) {
		int c = HAAS_AND_STOKES_UJ2A_CUT2 ? f.length / 2 + 1 : HAAS_AND_STOKES_UJ2A_C + 1;

		// compute adjusted sample size after removing classes that
		// exceed a fixed frequency c
		int nB = 0, cardB = 0;
		for(int i = c; i <= f.length; i++)
			if(f[i - 1] != 0) {
				nB += f[i - 1] * i; // numVals times frequency
				cardB += f[i - 1];
			}

		// fallback to Duj2 over full sample if only high frequency columns
		if(n - nB == 0)
			return getDuj2Estimate(q, f[0], n, dn, gammaDuj1);

		// compute reduced population size via numeric solve
		int updatedN = N;
		for(int i = c; i <= f.length; i++)
			if(f[i - 1] != 0)
				updatedN -= f[i - 1] *
					(!HAAS_AND_STOKES_UJ2A_SOLVE ? i / q : getMethodOfMomentsEstimate(i, q, 1, N, solveCache));

		// remove classes that exceed a fixed frequency c
		for(int i = c; i <= f.length; i++)
			f[i - 1] = 0;

		// compute duj2a over reduced sample
		double updatedDuj1 = getDuj1Estimate(q, f[0], n - nB, dn - cardB);
		double updatedGammaDuj1 = getGammaSquared(updatedDuj1, f, n - nB, updatedN);
		double duj2 = getDuj2Estimate(q, f[0], n - nB, dn - cardB, updatedGammaDuj1);
		return duj2 + cardB;
	}

	/**
	 * Computes the "squared coefficient of variation" based on a given initial estimate D (Eq 16).
	 *
	 * @param D Initial estimate of the number of distinct values
	 * @param f Frequency histogram: f[i-1] = number of values occurring exactly i times
	 * @param n Sample size
	 * @param N Population size (number of rows in the input)
	 * @return Squared coefficient of variation, clamped to be non-negative
	 */
	private static double getGammaSquared(double D, int[] f, int n, int N) {
		double gamma = 0;
		for(int i = 1; i <= f.length; i++)
			if(f[i - 1] != 0)
				gamma += i * (i - 1) * f[i - 1];
		gamma *= D / n / n;
		gamma += D / N - 1;
		return Math.max(0, gamma);
	}

	/**
	 * Computed the "shlosser third-order estimator". (Eq 30b)
	 *
	 * Note that this estimator can show anomalies with NaN as the results due to terms such as Math.pow(1+q, i) which
	 * exceed Double.MAX_VALUE even for moderately large i, e.g., q=0.05 at around 14K.
	 *
	 * @param q  Sampling fraction (sample size / population size)
	 * @param f  Frequency histogram: f[i-1] = number of values occurring exactly i times
	 * @param dn Number of distinct values observed in the sample
	 * @return Shlosser third-order estimate of the number of distinct values
	 */
	private static double getSh3Estimate(double q, int[] f, double dn) {
		double fraq11 = 0, fraq12 = 0, fraq21 = 0, fraq22 = 0;
		for(int i = 1; i <= f.length; i++)
			if(f[i - 1] != 0) {
				fraq11 += i * q * q * Math.pow(1 - q * q, i - 1) * f[i - 1];
				// NOTE: numerically unstable due to Math.pow(1+q, i) overflows
				// fraq12 += Math.pow(1 - q, i) * (Math.pow(1+q, i)-1) * f[i-1];
				fraq12 += (Math.pow(1 - q * q, i) - Math.pow(1 - q, i)) * f[i - 1];
				fraq21 += Math.pow(1 - q, i) * f[i - 1];
				fraq22 += i * q * Math.pow(1 - q, i - 1) * f[i - 1];
			}
		return dn + f[0] * fraq11 / fraq12 * Math.pow(fraq21 / fraq22, 2);
	}

	/**
	 * Solves the method-of-moments estimate numerically. We use a cache on the same observed instances in the sample as
	 * q is constant and min/max are chosen conservatively.
	 *
	 * @param nj         Observed frequency of a class in the sample (cache key)
	 * @param q          Sampling fraction (sample size / population size)
	 * @param min        Lower bound of the solver's search interval
	 * @param max        Upper bound of the solver's search interval
	 * @param solveCache Cache of previously solved estimates, keyed by nj; bounded by MAX_SOLVE_CACHE_SIZE
	 * @return Numeric solution of the method-of-moments equation for nj
	 */
	private static double getMethodOfMomentsEstimate(int nj, double q, double min, double max,
		HashMap<Integer, Double> solveCache) {
		// single lookup instead of containsKey+get; only non-null values are ever cached
		Double cached = solveCache.get(nj);
		if(cached != null)
			return cached;

		double est = UnivariateSolverUtils.solve(new MethodOfMomentsFunction(nj, q), min, max, 1e-9);

		// bounded cache to limit memory footprint
		if(solveCache.size() < MAX_SOLVE_CACHE_SIZE)
			solveCache.put(nj, est);

		return est;
	}

	/**
	 * Univariate function f(x) = q*x / (1 - (1-q)^x) - nj, whose root is the
	 * method-of-moments estimate for a class observed nj times under sampling fraction q.
	 */
	private static class MethodOfMomentsFunction implements UnivariateFunction {
		private final int _nj;
		private final double _q;

		public MethodOfMomentsFunction(int nj, double q) {
			_nj = nj;
			_q = q;
		}

		@Override
		public double value(double x) {
			return _q * x / (1 - Math.pow(1 - _q, x)) - _nj;
		}
	}
}