/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.sysds.runtime.compress.estim;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.CompressionSettings;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;

/**
 * Factory for creating compression estimators (AComEst) for a given matrix and compression settings.
 */
public interface ComEstFactory {

	static final Log LOG = LogFactory.getLog(ComEstFactory.class.getName());
	/**
	 * Create an estimator for the input data with the given settings and parallelization degree.
	 *
	 * @param data The matrix to extract compression information from.
	 * @param cs   The settings for the compression
	 * @param k    The parallelization degree
	 * @return A new AComEst used to extract compression information of column groups
	 */
	public static AComEst createEstimator(MatrixBlock data, CompressionSettings cs, int k) {
		if(data instanceof CompressedMatrixBlock)
			return createCompressedEstimator((CompressedMatrixBlock) data, cs);

		final int nRows = cs.transposed ? data.getNumColumns() : data.getNumRows();
		final int nCols = cs.transposed ? data.getNumRows() : data.getNumColumns();
		final double sparsity = data.getSparsity();
		final int sampleSize = getSampleSize(cs, nRows, nCols, sparsity);

		if(data.isEmpty())
			return createExactEstimator(data, cs);
		return createEstimator(data, cs, sampleSize, k, nRows);
	}
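	// Illustrative usage sketch (not part of this factory): CompressionSettingsBuilder below refers to
	// the builder in org.apache.sysds.runtime.compress, but the exact call sequence is an assumption
	// for illustration only.
	//
	//   MatrixBlock mb = ...;                                         // uncompressed input matrix
	//   CompressionSettings cs = new CompressionSettingsBuilder().create();
	//   AComEst estimator = ComEstFactory.createEstimator(mb, cs, 4); // k = 4 threads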
	/**
	 * Create an estimator for the input data with the given settings, sample size and parallelization degree.
	 *
	 * @param data       The matrix to extract compression information from.
	 * @param cs         The settings for the compression
	 * @param sampleSize The number of rows to extract from the input data to extract information from.
	 * @param k          The parallelization degree
	 * @return A new AComEst used to extract compression information of column groups
	 */
	public static AComEst createEstimator(MatrixBlock data, CompressionSettings cs, int sampleSize, int k) {
		final int nRows = cs.transposed ? data.getNumColumns() : data.getNumRows();
		return createEstimator(data, cs, sampleSize, k, nRows);
	}
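	// Illustrative call of the explicit sample size overload (assumed values, for illustration only):
	//
	//   AComEst estimator = ComEstFactory.createEstimator(mb, cs, 10000, 4); // sample 10000 rows with k = 4
	//
	// Note that the requested sample size is still subject to the 80% rule below: if it covers most of
	// the rows, the exact estimator is returned instead.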
	private static AComEst createEstimator(MatrixBlock data, CompressionSettings cs, int sampleSize, int k, int nRows) {
		if(sampleSize >= (double) nRows * 0.8) // if the sample covers 80% or more of the rows, use the entire input as the sample.
			return createExactEstimator(data, cs);
		else
			return createSampleEstimator(data, cs, sampleSize, k);
	}
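	// Worked example of the 80% rule (illustrative numbers): with nRows = 1000 the threshold is
	// 0.8 * 1000 = 800, so a requested sampleSize of 850 yields the exact estimator, while a
	// requested sampleSize of 500 yields the sample-based estimator.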
	private static ComEstExact createExactEstimator(MatrixBlock data, CompressionSettings cs) {
		LOG.debug("Using full sample");
		return new ComEstExact(data, cs);
	}

	private static ComEstCompressed createCompressedEstimator(CompressedMatrixBlock data, CompressionSettings cs) {
		LOG.debug("Using Compressed Estimator");
		return new ComEstCompressed(data, cs);
	}

	private static ComEstSample createSampleEstimator(MatrixBlock data, CompressionSettings cs, int sampleSize, int k) {
		LOG.debug("Using sample size: " + sampleSize);
		return new ComEstSample(data, cs, sampleSize, k);
	}
	/**
	 * Get the sample size based on the compression settings.
	 *
	 * @param cs       The compression settings
	 * @param nRows    Number of rows in the input
	 * @param nCols    Number of columns in the input
	 * @param sparsity The sparsity of the input
	 * @return a sample size
	 */
	private static int getSampleSize(CompressionSettings cs, int nRows, int nCols, double sparsity) {
		final int maxSize = Math.min(cs.maxSampleSize, nRows);
		return getSampleSize(cs.samplePower, nRows, nCols, sparsity, cs.minimumSampleSize, maxSize);
	}
	/**
	 * This function returns the sample size to use.
	 *
	 * The sample size is bounded by the minimum and maximum sample sizes.
	 *
	 * The sample size is calculated from a power of the number of rows, scaled based on the sparsity of the input.
	 *
	 * @param samplePower   The sample power
	 * @param nRows         The number of rows
	 * @param nCols         The number of columns
	 * @param sparsity      The sparsity of the input
	 * @param minSampleSize The minimum sample size
	 * @param maxSampleSize The maximum sample size
	 * @return The sample size to use.
	 */
	private static int getSampleSize(double samplePower, int nRows, int nCols, double sparsity, int minSampleSize,
		int maxSampleSize) {
		// Start the sample size at the minimum sample size as the base sample.
		int sampleSize = minSampleSize;

		// Ensure samplePower is in the valid range [0, 1].
		samplePower = Math.max(0, Math.min(1, samplePower));

		// Scale the sample size with the number of rows in the input, sub-linearly, since the number of rows needed to
		// characterize the values contained in a population does not scale linearly.
		sampleSize += (int) Math.ceil(Math.pow(nRows, samplePower));

		// Scale the sample size based on the overall sparsity, so that a very sparse input increases the sample size.
		sampleSize = (int) (sampleSize * (1.0 / Math.min(sparsity + 0.2, 1.0)));

		// Adhere to the minimum and maximum sample sizes.
		sampleSize = (int) Math.max(minSampleSize, Math.min(sampleSize, maxSampleSize));

		return sampleSize;
	}
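	// Worked example of the sample size computation (illustrative values, not defaults): with
	// minSampleSize = 2000, maxSampleSize = 1000000, samplePower = 0.65, nRows = 1000000 and
	// sparsity = 0.3, the row term is ceil(1000000 ^ 0.65) = 7944, giving 2000 + 7944 = 9944;
	// the sparsity scaling multiplies by 1 / min(0.3 + 0.2, 1.0) = 2, giving 19888, which lies
	// within [2000, 1000000] and is therefore returned unchanged.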
}