blob: 023d2017f2d9215d80b3c13619d6f319738a341e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.test.component.compress;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.DoubleSummaryStatistics;
import java.util.List;
import java.util.Random;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.util.DataConverter;
/**
* WARNING, this compressible input generator generates transposed inputs, (rows and cols are switched) this is because
* then the test does not need to transpose the input for the colGroups that expect transposed inputs.
*
*/
public class CompressibleInputGenerator {
protected static final Log LOG = LogFactory.getLog(CompressibleInputGenerator.class.getName());
public static MatrixBlock getInput(int rows, int cols, CompressionType ct, int nrUnique, double sparsity,
int seed) {
double[][] output = getInputDoubleMatrix(rows, cols, ct, nrUnique, 1000000, -1000000, sparsity, seed, false);
return DataConverter.convertToMatrixBlock(output);
}
public static MatrixBlock getInput(int rows, int cols, CompressionType ct, int nrUnique, int max, int min,
double sparsity, int seed) {
double[][] output = getInputDoubleMatrix(rows, cols, ct, nrUnique, max, min, sparsity, seed, false);
return DataConverter.convertToMatrixBlock(output);
}
public static double[][] getInputDoubleMatrix(int rows, int cols, CompressionType ct, int nrUnique, int max,
int min, double sparsity, int seed, boolean transpose) {
double[][] output;
switch(ct) {
case RLE:
output = rle(rows, cols, nrUnique, max, min, sparsity, seed, transpose);
break;
case OLE:
output = ole(rows, cols, nrUnique, max, min, sparsity, seed, transpose);
break;
default:
throw new NotImplementedException("Not implemented generator.");
}
for(double[] x : output) {
DoubleSummaryStatistics stat = Arrays.stream(x).summaryStatistics();
double maxV = stat.getMax();
double minV = stat.getMin();
// LOG.debug("MAX: " + maxV + " - MIN:" + minV);
assertTrue(maxV <= max);
assertTrue(minV >= min);
}
return output;
}
private static double[][] rle(int rows, int cols, int nrUnique, int max, int min, double sparsity, int seed,
boolean transpose) {
Random r = new Random(seed);
List<Double> values = getNRandomValues(nrUnique, r, max, min);
double[][] matrix = transpose ? new double[rows][cols] : new double[cols][rows];
for(int colNr = 0; colNr < cols; colNr++) {
Collections.shuffle(values, r);
// Generate a Dirichlet distribution, to distribute the values
int[] occurences = makeDirichletDistribution(nrUnique, rows, r);
int pointer = 0;
int valuePointer = 0;
for(int nr : occurences) {
int zeros = (int) (Math.floor(nr * (1.0 - sparsity)));
int before = (zeros > 0) ? r.nextInt(zeros) : 0;
int after = zeros - before;
pointer += before;
for(int i = before; i < nr - after; i++) {
if(transpose) {
matrix[pointer][colNr] = values.get(valuePointer);
}
else {
matrix[colNr][pointer] = values.get(valuePointer);
}
pointer++;
}
pointer += after;
valuePointer++;
if(valuePointer == values.size() && after == 0) {
while(pointer < rows) {
if(transpose) {
matrix[pointer][colNr] = values.get(nrUnique - 1);
}
else {
matrix[colNr][pointer] = values.get(nrUnique - 1);
}
pointer++;
}
}
}
}
return matrix;
}
/**
* Note ole compress the best if there are multiple correlated columns. Therefore the multiple columns are needed
* for good compressions. Also Nr Unique is only associated to a specific column in this compression, so the number
* of uniques are only in a single column, making actual the nrUnique (cols * nrUnique) Does not guaranty that all
* the nr uniques are in use, since the values are randomly selected.
*
* @param rows Number of rows in generated output
* @param cols Number of cols in generated output
* @param nrUnique Number of unique values in generated output, Note this means base unique in this case. and this
* number will grow according to sparsity as well.
* @param max The Maximum Value contained
* @param min The Minimum value contained
* @param sparsity The sparsity of the generated matrix (only applicable to the first column)
* @param seed The seed of the generated matrix
* @param transpose If the output should be a transposed matrix or not
* @return Generated nicely compressible OLE col Group.
*/
private static double[][] ole(int rows, int cols, int nrUnique, int max, int min, double sparsity, int seed,
boolean transpose) {
// chose some random values
Random r = new Random(seed);
List<Double> values = getNRandomValues(nrUnique, r, max, min);
double[][] matrix = transpose ? new double[rows][cols] : new double[cols][rows];
// Generate the first column.
for(int x = 0; x < rows; x++) {
if(transpose) {
matrix[x][0] = values.get(r.nextInt(nrUnique));
// LOG.debug(matrix[x][0]);
}
else {
matrix[0][x] = values.get(r.nextInt(nrUnique));
}
}
for(int y = 1; y < cols; y++) {
for(int x = 0; x < rows; x++) {
if(r.nextDouble() < sparsity) {
if(transpose) {
double off = (double) y;
int v = (int) (matrix[x][0] * off);
matrix[x][y] = Math.abs((v) % ((int) (max - min))) + min;
}
else {
double off = (double) y;
int v = (int) (matrix[0][x] * off);
matrix[y][x] = Math.abs((v) % ((int) (max - min))) + min;
// matrix[y][x] = ((int) (matrix[0][x] * y + y)) % ((int) (max - min)) + min;
}
}
}
}
for(int x = 0; x < rows; x++) {
if(r.nextDouble() > sparsity) {
if(transpose) {
matrix[x][0] = 0.0;
// LOG.debug(matrix[x][0]);
}
else {
matrix[0][x] = 0.0;
}
}
}
return matrix;
}
private static int[] makeDirichletDistribution(int nrUnique, int rows, Random r) {
double[] distribution = new double[nrUnique];
double sum = 0;
for(int i = 0; i < nrUnique; i++) {
distribution[i] = r.nextDouble();
sum += distribution[i];
}
int[] occurences = new int[nrUnique];
for(int i = 0; i < nrUnique; i++) {
occurences[i] = (int) (((double) distribution[i] / (double) sum) * (double) rows);
}
return occurences;
}
private static List<Double> getNRandomValues(int nrUnique, Random r, int max, int min) {
List<Double> values = new ArrayList<>();
for(int i = 0; i < nrUnique; i++) {
values.add((r.nextDouble() * (max - min)) + min);
// values.add(Math.floor(v));
}
// LOG.debug(values);
return values;
}
}