| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.compress.estim.sample; |
| |
| import java.util.HashMap; |
| |
| import org.apache.commons.math3.analysis.UnivariateFunction; |
| import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils; |
| import org.apache.sysds.runtime.compress.utils.ABitmap; |
| |
| public class HassAndStokes { |
| |
| public static final double HAAS_AND_STOKES_ALPHA1 = 0.9; // 0.9 recommended in paper |
| public static final double HAAS_AND_STOKES_ALPHA2 = 30; // 30 recommended in paper |
| public static final int HAAS_AND_STOKES_UJ2A_C = 50; // 50 recommend in paper |
| public static final boolean HAAS_AND_STOKES_UJ2A_CUT2 = true; // cut frequency in half |
| public static final boolean HAAS_AND_STOKES_UJ2A_SOLVE = true; // true recommended |
| public static final int MAX_SOLVE_CACHE_SIZE = 64 * 1024; // global 2MB cache |
| |
| /** |
| * Haas, Peter J., and Lynne Stokes. "Estimating the number of classes in a finite population." Journal of the |
| * American Statistical Association 93.444 (1998): 1475-1487. |
| * |
| * The hybrid estimator given by Eq. 33 in Section 6 |
| * |
| * @param ubm The Uncompressed Bit map |
| * @param nRows The number of rows originally in the input |
| * @param sampleSize The number of rows used in the sample |
| * @param solveCache A Hashmap containing information for getDuj2aEstimate |
| * @return An estimation of distinct elements in the population. |
| */ |
| public static int haasAndStokes(ABitmap ubm, int nRows, int sampleSize, |
| HashMap<Integer, Double> solveCache) { |
| // obtain value and frequency histograms |
| int numVals = ubm.getNumValues(); |
| int[] freqCounts = FrequencyCount.get(ubm); |
| |
| // all values in the sample are zeros. |
| if(numVals == 0) |
| return 1; |
| |
| double q = ((double) sampleSize) / nRows; |
| double f1 = freqCounts[0]; |
| |
| // compute basic Duj1 estimate |
| double duj1 = getDuj1Estimate(q, f1, sampleSize, numVals); |
| |
| // compute gamma based on Duj1 |
| double gamma = getGammaSquared(duj1, freqCounts, sampleSize, nRows); |
| double d = -1; |
| |
| // core hybrid estimator based on gamma |
| if(gamma < HAAS_AND_STOKES_ALPHA1) |
| d = getDuj2Estimate(q, f1, sampleSize, numVals, gamma); |
| else if(gamma < HAAS_AND_STOKES_ALPHA2) |
| d = getDuj2aEstimate(q, freqCounts, sampleSize, numVals, gamma, nRows, solveCache); |
| else |
| d = getSh3Estimate(q, freqCounts, numVals); |
| |
| // round and ensure min value 1 |
| return Math.max(1, (int) Math.round(d)); |
| } |
| |
| /** |
| * Computes the "un-smoothed first-order jackknife estimator" (Eq 11). |
| * |
| * @param q ?? |
| * @param f1 ?? |
| * @param n ?? |
| * @param dn ?? |
| * @return ?? |
| */ |
| private static double getDuj1Estimate(double q, double f1, int n, int dn) { |
| return dn / (1 - ((1 - q) * f1) / n); |
| } |
| |
| /** |
| * Computes the "un-smoothed second-order jackknife estimator" (Eq 18b). |
| * |
| * @param q ?? |
| * @param f1 ?? |
| * @param n ?? |
| * @param dn ?? |
| * @param gammaDuj1 ?? |
| * @return ?? |
| */ |
| private static double getDuj2Estimate(double q, double f1, int n, int dn, double gammaDuj1) { |
| return (dn - (1 - q) * f1 * Math.log(1 - q) * gammaDuj1 / q) / (1 - ((1 - q) * f1) / n); |
| } |
| |
| /** |
| * Computes the "un-smoothed second-order jackknife estimator" with additional stabilization procedure, which |
| * removes the classes whose frequency exceed c, computes Duj2 over the reduced sample, and finally adds the removed |
| * frequencies. |
| * |
| * @param q ?? |
| * @param f ?? |
| * @param n ?? |
| * @param dn ?? |
| * @param gammaDuj1 ?? |
| * @param N ?? |
| * @param solveCache ?? |
| * @return ?? |
| */ |
| private static double getDuj2aEstimate(double q, int f[], int n, int dn, double gammaDuj1, int N, |
| HashMap<Integer, Double> solveCache) { |
| int c = HAAS_AND_STOKES_UJ2A_CUT2 ? f.length / 2 + 1 : HAAS_AND_STOKES_UJ2A_C + 1; |
| |
| // compute adjusted sample size after removing classes that |
| // exceed a fixed frequency c |
| int nB = 0, cardB = 0; |
| for(int i = c; i <= f.length; i++) |
| if(f[i - 1] != 0) { |
| nB += f[i - 1] * i; // numVals times frequency |
| cardB += f[i - 1]; |
| } |
| |
| // fallback to Duj2 over full sample if only high frequency columns |
| if(n - nB == 0) |
| return getDuj2Estimate(q, f[0], n, dn, gammaDuj1); |
| |
| // compute reduced population size via numeric solve |
| int updatedN = N; |
| for(int i = c; i <= f.length; i++) |
| if(f[i - 1] != 0) |
| updatedN -= f[i - 1] * |
| (!HAAS_AND_STOKES_UJ2A_SOLVE ? i / q : getMethodOfMomentsEstimate(i, q, 1, N, solveCache)); |
| |
| // remove classes that exceed a fixed frequency c |
| for(int i = c; i <= f.length; i++) |
| f[i - 1] = 0; |
| |
| // compute duj2a over reduced sample |
| double updatedDuj1 = getDuj1Estimate(q, f[0], n - nB, dn - cardB); |
| double updatedGammaDuj1 = getGammaSquared(updatedDuj1, f, n - nB, updatedN); |
| double duj2 = getDuj2Estimate(q, f[0], n - nB, dn - cardB, updatedGammaDuj1); |
| return duj2 + cardB; |
| } |
| |
| /** |
| * Computes the "squared coefficient of variation" based on a given initial estimate D (Eq 16). |
| * |
| * @param D ?? |
| * @param f ?? |
| * @param n ?? |
| * @param N ?? |
| * @return ?? |
| */ |
| private static double getGammaSquared(double D, int[] f, int n, int N) { |
| double gamma = 0; |
| for(int i = 1; i <= f.length; i++) |
| if(f[i - 1] != 0) |
| gamma += i * (i - 1) * f[i - 1]; |
| gamma *= D / n / n; |
| gamma += D / N - 1; |
| return Math.max(0, gamma); |
| } |
| |
| /** |
| * Computed the "shlosser third-order estimator". (Eq 30b) |
| * |
| * Note that this estimator can show anomalies with NaN as the results due to terms such as Math.pow(1+q, i) which |
| * exceed Double.MAX_VALUE even for moderately large i, e.g., q=0.05 at around 14K. |
| * |
| * @param q ?? |
| * @param f ?? |
| * @param dn ?? |
| * @return ?? |
| */ |
| private static double getSh3Estimate(double q, int[] f, double dn) { |
| double fraq11 = 0, fraq12 = 0, fraq21 = 0, fraq22 = 0; |
| for(int i = 1; i <= f.length; i++) |
| if(f[i - 1] != 0) { |
| fraq11 += i * q * q * Math.pow(1 - q * q, i - 1) * f[i - 1]; |
| // NOTE: numerically unstable due to Math.pow(1+q, i) overflows |
| // fraq12 += Math.pow(1 - q, i) * (Math.pow(1+q, i)-1) * f[i-1]; |
| fraq12 += (Math.pow(1 - q * q, i) - Math.pow(1 - q, i)) * f[i - 1]; |
| fraq21 += Math.pow(1 - q, i) * f[i - 1]; |
| fraq22 += i * q * Math.pow(1 - q, i - 1) * f[i - 1]; |
| } |
| return dn + f[0] * fraq11 / fraq12 * Math.pow(fraq21 / fraq22, 2); |
| } |
| |
| /** |
| * Solves the method-of-moments estimate numerically. We use a cache on the same observed instances in the sample as |
| * q is constant and min/max are chosen conservatively. |
| * |
| * @param nj ?? |
| * @param q ?? |
| * @param min ?? |
| * @param max ?? |
| * @param solveCache ?? |
| * @return ?? |
| */ |
| private static double getMethodOfMomentsEstimate(int nj, double q, double min, double max, |
| HashMap<Integer, Double> solveCache) { |
| if(solveCache.containsKey(nj)) |
| return solveCache.get(nj); |
| |
| double est = UnivariateSolverUtils.solve(new MethodOfMomentsFunction(nj, q), min, max, 1e-9); |
| |
| if(solveCache.size() < MAX_SOLVE_CACHE_SIZE) |
| solveCache.put(nj, est); |
| |
| return est; |
| } |
| |
| private static class MethodOfMomentsFunction implements UnivariateFunction { |
| private final int _nj; |
| private final double _q; |
| |
| public MethodOfMomentsFunction(int nj, double q) { |
| _nj = nj; |
| _q = q; |
| } |
| |
| @Override |
| public double value(double x) { |
| return _q * x / (1 - Math.pow(1 - _q, x)) - _nj; |
| } |
| } |
| } |