src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.runtime.compress;

 import java.util.EnumSet;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.insertionsort.InsertionSorterFactory.SORT_TYPE;
 import org.apache.sysds.runtime.compress.cost.CostEstimatorFactory.CostType;
 import org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;

 /**
  * Compression Settings class, used as a bundle of parameters inside the Compression framework. See
  * CompressionSettingsBuilder for default non static parameters.
  */
 public class CompressionSettings {
 	private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName());

 	/** Parallelization threshold for DDC compression */
 	public static int PAR_DDC_THRESHOLD = 10000;

 	/**
 	 * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not
 	 * Character max value + 1 because it breaks the offsets in cases with fully dense values.
 	 */
 	public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE;

 	/**
 	 * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease for
 	 * parallel incl multi-threaded, hence not applied for distributed operations (also because compression time +
 	 * garbage collection increases)
 	 */
 	public final boolean sortTuplesByFrequency;

 	/**
 	 * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the
 	 * number of elements is below 1000.
 	 *
 	 * DEPRECATED
 	 */
 	public final double samplingRatio;

 	/**
 	 * The sampling ratio power to use when choosing sample size. This is used in accordance to the function:
 	 *
 	 * sampleSize += nRows^samplePower;
 	 *
 	 * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1.
 	 */
 	public final double samplePower;

 	/** Share DDC Dictionaries between ColGroups. */
 	public final boolean allowSharedDictionary;

 	/** Boolean specifying which transpose setting is used, can be auto, true or false */
 	public final String transposeInput;

 	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
 	public final int seed;

 	/** True if lossy compression is enabled */
 	public final boolean lossy;

 	/** The selected method for column partitioning used in CoCoding compressed columns */
 	public final PartitionerType columnPartitioner;

 	/** The cost computation type for the compression */
 	public final CostType costComputationType;

 	/** The maximum number of columns CoCoded allowed */
 	public final int maxColGroupCoCode;

 	/**
 	 * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects
 	 * aggressively likely coCoding is used.
 	 */
 	public final double coCodePercentage;

 	/**
 	 * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression
 	 * Default is to always allow for Uncompromisable ColGroup.
 	 */
 	public final EnumSet<CompressionType> validCompressions;

 	/** The minimum size of the sample extracted. */
 	public final int minimumSampleSize;

 	/** The maximum size of the sample extracted. */
 	public final int maxSampleSize;

 	/** The sample type used for sampling */
 	public final EstimationType estimationType;

 	/**
 	 * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script
 	 * based on the transposeInput setting.
 	 *
 	 * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3.
 	 */
 	public boolean transposed = false;

 	/** The minimum compression ratio to achieve. */
 	public final double minimumCompressionRatio;

 	/** Is a spark instruction */
 	public final boolean isInSparkInstruction;

 	/** The sorting type used in sorting/joining offsets to create SDC groups */
 	public final SORT_TYPE sdcSortType;

 	protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
 		String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions,
 		boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
 		int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
 		double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType) {
 		this.samplingRatio = samplingRatio;
 		this.samplePower = samplePower;
 		this.allowSharedDictionary = allowSharedDictionary;
 		this.transposeInput = transposeInput;
 		this.seed = seed == -1 ? (int) System.nanoTime() : seed;
 		this.validCompressions = validCompressions;
 		this.lossy = lossy;
 		this.sortTuplesByFrequency = sortValuesByLength;
 		this.columnPartitioner = columnPartitioner;
 		this.maxColGroupCoCode = maxColGroupCoCode;
 		this.coCodePercentage = coCodePercentage;
 		this.minimumSampleSize = minimumSampleSize;
 		this.maxSampleSize = maxSampleSize;
 		this.estimationType = estimationType;
 		this.costComputationType = costComputationType;
 		this.minimumCompressionRatio = minimumCompressionRatio;
 		this.isInSparkInstruction = isInSparkInstruction;
 		this.sdcSortType = sdcSortType;
 		if(LOG.isDebugEnabled())
 			LOG.debug(this.toString());
 	}

 	public boolean isRLEAllowed(){
 		return this.validCompressions.contains(CompressionType.RLE);
 	}

 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append("CompressionSettings: ");
 		sb.append("\t Valid Compressions: " + validCompressions);
 		sb.append("\t Share dict: " + allowSharedDictionary);
 		sb.append("\t Partitioner: " + columnPartitioner);
 		sb.append("\t Lossy: " + lossy);
 		sb.append("\t Cost Computation Type: " + costComputationType);
 		if(samplingRatio < 1.0)
 			sb.append("\t Estimation Type: " + estimationType);
 		return sb.toString();
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.sysds.runtime.compress;

	import java.util.EnumSet;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType;
	import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
	import org.apache.sysds.runtime.compress.colgroup.insertionsort.InsertionSorterFactory.SORT_TYPE;
	import org.apache.sysds.runtime.compress.cost.CostEstimatorFactory.CostType;
	import org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;

	/**
	* Compression Settings class, used as a bundle of parameters inside the Compression framework. See
	* CompressionSettingsBuilder for default non static parameters.
	*/
	public class CompressionSettings {
	private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName());

	/** Parallelization threshold for DDC compression */
	public static int PAR_DDC_THRESHOLD = 10000;

	/**
	* Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not
	* Character max value + 1 because it breaks the offsets in cases with fully dense values.
	*/
	public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE;

	/**
	* Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease for
	* parallel incl multi-threaded, hence not applied for distributed operations (also because compression time +
	* garbage collection increases)
	*/
	public final boolean sortTuplesByFrequency;

	/**
	* The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the
	* number of elements is below 1000.
	*
	* DEPRECATED
	*/
	public final double samplingRatio;

	/**
	* The sampling ratio power to use when choosing sample size. This is used in accordance to the function:
	*
	* sampleSize += nRows^samplePower;
	*
	* The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1.
	*/
	public final double samplePower;

	/** Share DDC Dictionaries between ColGroups. */
	public final boolean allowSharedDictionary;

	/** Boolean specifying which transpose setting is used, can be auto, true or false */
	public final String transposeInput;

	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
	public final int seed;

	/** True if lossy compression is enabled */
	public final boolean lossy;

	/** The selected method for column partitioning used in CoCoding compressed columns */
	public final PartitionerType columnPartitioner;

	/** The cost computation type for the compression */
	public final CostType costComputationType;

	/** The maximum number of columns CoCoded allowed */
	public final int maxColGroupCoCode;

	/**
	* A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects
	* aggressively likely coCoding is used.
	*/
	public final double coCodePercentage;

	/**
	* Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression
	* Default is to always allow for Uncompromisable ColGroup.
	*/
	public final EnumSet<CompressionType> validCompressions;

	/** The minimum size of the sample extracted. */
	public final int minimumSampleSize;

	/** The maximum size of the sample extracted. */
	public final int maxSampleSize;

	/** The sample type used for sampling */
	public final EstimationType estimationType;

	/**
	* Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script
	* based on the transposeInput setting.
	*
	* This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3.
	*/
	public boolean transposed = false;

	/** The minimum compression ratio to achieve. */
	public final double minimumCompressionRatio;

	/** Is a spark instruction */
	public final boolean isInSparkInstruction;

	/** The sorting type used in sorting/joining offsets to create SDC groups */
	public final SORT_TYPE sdcSortType;

	protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
	String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions,
	boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
	int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
	double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType) {
	this.samplingRatio = samplingRatio;
	this.samplePower = samplePower;
	this.allowSharedDictionary = allowSharedDictionary;
	this.transposeInput = transposeInput;
	this.seed = seed == -1 ? (int) System.nanoTime() : seed;
	this.validCompressions = validCompressions;
	this.lossy = lossy;
	this.sortTuplesByFrequency = sortValuesByLength;
	this.columnPartitioner = columnPartitioner;
	this.maxColGroupCoCode = maxColGroupCoCode;
	this.coCodePercentage = coCodePercentage;
	this.minimumSampleSize = minimumSampleSize;
	this.maxSampleSize = maxSampleSize;
	this.estimationType = estimationType;
	this.costComputationType = costComputationType;
	this.minimumCompressionRatio = minimumCompressionRatio;
	this.isInSparkInstruction = isInSparkInstruction;
	this.sdcSortType = sdcSortType;
	if(LOG.isDebugEnabled())
	LOG.debug(this.toString());
	}

	public boolean isRLEAllowed(){
	return this.validCompressions.contains(CompressionType.RLE);
	}

	@Override
	public String toString() {
	StringBuilder sb = new StringBuilder();
	sb.append("CompressionSettings: ");
	sb.append("\t Valid Compressions: " + validCompressions);
	sb.append("\t Share dict: " + allowSharedDictionary);
	sb.append("\t Partitioner: " + columnPartitioner);
	sb.append("\t Lossy: " + lossy);
	sb.append("\t Cost Computation Type: " + costComputationType);
	if(samplingRatio < 1.0)
	sb.append("\t Estimation Type: " + estimationType);
	return sb.toString();
	}
	}