src/main/java/org/apache/sysds/hops/OptimizerUtils.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.hops;

 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Types.ExecMode;
 import org.apache.sysds.common.Types.FileFormat;
 import org.apache.sysds.common.Types.OpOp1;
 import org.apache.sysds.common.Types.OpOp2;
 import org.apache.sysds.common.Types.OpOpData;
 import org.apache.sysds.common.Types.ReOrgOp;
 import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.conf.CompilerConfig;
 import org.apache.sysds.conf.CompilerConfig.ConfigType;
 import org.apache.sysds.conf.ConfigurationManager;
 import org.apache.sysds.conf.DMLConfig;
 import org.apache.sysds.hops.rewrite.HopRewriteUtils;
 import org.apache.sysds.lops.Checkpoint;
 import org.apache.sysds.lops.Lop;
 import org.apache.sysds.lops.LopProperties.ExecType;
 import org.apache.sysds.lops.compile.Dag;
 import org.apache.sysds.parser.ForStatementBlock;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.ForProgramBlock;
 import org.apache.sysds.runtime.controlprogram.LocalVariableMap;
 import org.apache.sysds.runtime.controlprogram.caching.LazyWriteBuffer;
 import org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.IntegerDivide;
 import org.apache.sysds.runtime.functionobjects.Modulus;
 import org.apache.sysds.runtime.instructions.cp.Data;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.meta.DataCharacteristics;
 import org.apache.sysds.runtime.meta.MatrixCharacteristics;
 import org.apache.sysds.runtime.util.IndexRange;
 import org.apache.sysds.runtime.util.UtilFunctions;

 import java.util.Arrays;
 import java.util.HashMap;

 public class OptimizerUtils
 {
 	////////////////////////////////////////////////////////
 	// Optimizer constants and flags (incl tuning knobs)  //
 	////////////////////////////////////////////////////////
 	/**
 	 * Utilization factor used in deciding whether an operation to be scheduled on CP or MR.
 	 * NOTE: it is important that MEM_UTIL_FACTOR+CacheableData.CACHING_BUFFER_SIZE &lt; 1.0
 	 */
 	public static double MEM_UTIL_FACTOR = 0.7d;

 	/** Default blocksize if unspecified or for testing purposes */
 	public static final int DEFAULT_BLOCKSIZE = 1000;

 	/** Default frame blocksize */
 	public static final int DEFAULT_FRAME_BLOCKSIZE = 1000;

 	/** Default optimization level if unspecified */
 	public static final OptimizationLevel DEFAULT_OPTLEVEL =
 			OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;

 	/**
 	 * Default memory size, which is used if the actual estimate can not be computed
 	 * e.g., when input/output dimensions are unknown. The default is set to a large
 	 * value so that operations are scheduled on MR while avoiding overflows as well.
 	 */
 	public static double DEFAULT_SIZE;


 	public static final long DOUBLE_SIZE = 8;
 	public static final long INT_SIZE = 4;
 	public static final long CHAR_SIZE = 1;
 	public static final long BOOLEAN_SIZE = 1;
 	public static final double INVALID_SIZE = -1d; // memory estimate not computed

 	//constants for valid CP matrix dimension sizes / nnz (dense/sparse)
 	public static final long MAX_NUMCELLS_CP_DENSE = Integer.MAX_VALUE;
 	public static final long MAX_NNZ_CP_SPARSE = (MatrixBlock.DEFAULT_SPARSEBLOCK ==
 			SparseBlock.Type.MCSR) ? Long.MAX_VALUE : Integer.MAX_VALUE;

 	public static final long SAFE_REP_CHANGE_THRES = 8 * 1024 *1024; //8MB

 	/**
 	 * Enables common subexpression elimination in dags. There is however, a potential tradeoff
 	 * between computation redundancy and data transfer between MR jobs. Since, we do not reason
 	 * about transferred data yet, this rewrite rule is enabled by default.
 	 */
 	public static boolean ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = true;

 	/**
 	 * Enables constant folding in dags. Constant folding computes simple expressions of binary
 	 * operations and literals and replaces the hop sub-DAG with a new literal operator.
 	 */
 	public static boolean ALLOW_CONSTANT_FOLDING = true;

 	public static boolean ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
 	public static boolean ALLOW_OPERATOR_FUSION = true;

 	/**
 	 * Enables if-else branch removal for constant predicates (original literals or
 	 * results of constant folding).
 	 *
 	 */
 	public static boolean ALLOW_BRANCH_REMOVAL = true;

 	public static boolean ALLOW_AUTO_VECTORIZATION = true;

 	/**
 	 * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
 	 * expressions are defined as binary operations on literals and nrow/ncol. This applies
 	 * only to exact size information.
 	 */
 	public static boolean ALLOW_SIZE_EXPRESSION_EVALUATION = true;

 	/**
 	 * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
 	 * expressions are defined as binary operations on literals and b(+) or b(*) on nrow/ncol.
 	 * This applies also to worst-case size information.
 	 */
 	public static boolean ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = true;

 	public static boolean ALLOW_RAND_JOB_RECOMPILE = true;

 	/**
 	 * Enables parfor runtime piggybacking of MR jobs into the packed jobs for
 	 * scan sharing.
 	 */
 	public static boolean ALLOW_RUNTIME_PIGGYBACKING = true;

 	/**
 	 * Enables interprocedural analysis between main script and functions as well as functions
 	 * and other functions. This includes, for example, to propagate statistics into functions
 	 * if save to do so (e.g., if called once).
 	 */
 	public static boolean ALLOW_INTER_PROCEDURAL_ANALYSIS = true;

 	/**
 	 * Number of inter-procedural analysis (IPA) repetitions. If set to {@literal >=2}, we apply
 	 * IPA multiple times in order to allow scalar propagation over complex function call
 	 * graphs and various interactions between constant propagation, constant folding,
 	 * and other rewrites such as branch removal and the merge of statement block sequences.
 	 */
 	public static int IPA_NUM_REPETITIONS = 5;

 	/**
 	 * Enables sum product rewrites such as mapmultchains. In the future, this will cover
 	 * all sum-product related rewrites.
 	 */
 	public static boolean ALLOW_SUM_PRODUCT_REWRITES = true;

 	/**
 	 * Enables a specific hop dag rewrite that splits hop dags after csv persistent reads with
 	 * unknown size in order to allow for recompile.
 	 */
 	public static boolean ALLOW_SPLIT_HOP_DAGS = true;

 	/**
 	 * Enables a specific rewrite that enables update in place for loop variables that are
 	 * only read/updated via cp leftindexing.
 	 */
 	public static boolean ALLOW_LOOP_UPDATE_IN_PLACE = true;

 	/**
 	 * Enables a specific rewrite for code motion, i.e., hoisting loop invariant code
 	 * out of while, for, and parfor loops.
 	 */
 	public static boolean ALLOW_CODE_MOTION = false;


 	/**
 	 * Specifies a multiplier computing the degree of parallelism of parallel
 	 * text read/write out of the available degree of parallelism. Set it to 1.0
 	 * to get a number of threads equal the number of virtual cores.
 	 *
 	 */
 	public static final double PARALLEL_CP_READ_PARALLELISM_MULTIPLIER = 1.0;
 	public static final double PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER = 1.0;

 	/**
 	 * Enables the use of CombineSequenceFileInputFormat with splitsize = 2x hdfs blocksize,
 	 * if sort buffer size large enough and parallelism not hurt. This solves to issues:
 	 * (1) it combines small files (depending on producers), and (2) it reduces task
 	 * latency of large jobs with many tasks by factor 2.
 	 *
 	 */
 	public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true;

 	//////////////////////
 	// Optimizer levels //
 	//////////////////////

 	/**
 	 * Optimization Types for Compilation
 	 *
 	 *  O0 STATIC - Decisions for scheduling operations on CP/MR are based on
 	 *  predefined set of rules, which check if the dimensions are below a
 	 *  fixed/static threshold (OLD Method of choosing between CP and MR).
 	 *  The optimization scope is LOCAL, i.e., per statement block.
 	 *  Advanced rewrites like constant folding, common subexpression elimination,
 	 *  or inter procedural analysis are NOT applied.
 	 *
 	 *  O1 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
 	 *  based on the amount of memory required to perform that operation.
 	 *  It does NOT take the execution time into account.
 	 *  The optimization scope is LOCAL, i.e., per statement block.
 	 *  Advanced rewrites like constant folding, common subexpression elimination,
 	 *  or inter procedural analysis are NOT applied.
 	 *
 	 *  O2 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
 	 *  based on the amount of memory required to perform that operation.
 	 *  It does NOT take the execution time into account.
 	 *  The optimization scope is LOCAL, i.e., per statement block.
 	 *  All advanced rewrites are applied. This is the default optimization
 	 *  level of SystemDS.
 	 *
 	 *  O3 GLOBAL TIME_MEMORY_BASED - Operation scheduling on CP or MR as well as
 	 *  many other rewrites of data flow properties such as block size, partitioning,
 	 *  replication, vectorization, etc are done with the optimization objective of
 	 *  minimizing execution time under hard memory constraints per operation and
 	 *  execution context. The optimization scope if GLOBAL, i.e., program-wide.
 	 *  All advanced rewrites are applied. This optimization level requires more
 	 *  optimization time but has higher optimization potential.
 	 *
 	 *  O4 DEBUG MODE - All optimizations, global and local, which interfere with
 	 *  breakpoints are NOT applied. This optimization level is REQUIRED for the
 	 *  compiler running in debug mode.
 	 */
 	public enum OptimizationLevel {
 		O0_LOCAL_STATIC,
 		O1_LOCAL_MEMORY_MIN,
 		O2_LOCAL_MEMORY_DEFAULT,
 		O3_LOCAL_RESOURCE_TIME_MEMORY,
 		O4_GLOBAL_TIME_MEMORY,
 		O5_DEBUG_MODE,
 	}

 	public static OptimizationLevel getOptLevel() {
 		int optlevel = ConfigurationManager.getCompilerConfig().getInt(ConfigType.OPT_LEVEL);
 		return OptimizationLevel.values()[optlevel];
 	}

 	public static boolean isMemoryBasedOptLevel() {
 		return (getOptLevel() != OptimizationLevel.O0_LOCAL_STATIC);
 	}

 	public static boolean isOptLevel( OptimizationLevel level ){
 		return (getOptLevel() == level);
 	}

 	public static CompilerConfig constructCompilerConfig( DMLConfig dmlconf ) {
 		return constructCompilerConfig(new CompilerConfig(), dmlconf);
 	}

 	public static CompilerConfig constructCompilerConfig( CompilerConfig cconf, DMLConfig dmlconf )
 	{
 		//each script sets its own block size, opt level etc
 		cconf.set(ConfigType.BLOCK_SIZE, dmlconf.getIntValue( DMLConfig.DEFAULT_BLOCK_SIZE ));

 		//handle optimization level
 		int optlevel = dmlconf.getIntValue(DMLConfig.OPTIMIZATION_LEVEL);
 		if( optlevel < 0 || optlevel > 7 )
 			throw new DMLRuntimeException("Error: invalid optimization level '"+optlevel+"' (valid values: 0-5).");

 		switch( optlevel )
 		{
 			// opt level 0: static dimensionality
 			case 0:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O0_LOCAL_STATIC.ordinal());
 				ALLOW_CONSTANT_FOLDING = false;
 				ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
 				ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
 				ALLOW_AUTO_VECTORIZATION = false;
 				ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
 				IPA_NUM_REPETITIONS = 1;
 				ALLOW_BRANCH_REMOVAL = false;
 				ALLOW_SUM_PRODUCT_REWRITES = false;
 				break;
 			// opt level 1: memory-based (no advanced rewrites)
 			case 1:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O1_LOCAL_MEMORY_MIN.ordinal());
 				ALLOW_CONSTANT_FOLDING = false;
 				ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
 				ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
 				ALLOW_AUTO_VECTORIZATION = false;
 				ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
 				IPA_NUM_REPETITIONS = 1;
 				ALLOW_BRANCH_REMOVAL = false;
 				ALLOW_SUM_PRODUCT_REWRITES = false;
 				ALLOW_LOOP_UPDATE_IN_PLACE = false;
 				break;
 			// opt level 2: memory-based (all advanced rewrites)
 			case 2:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
 				break;
 			// opt level 3: resource optimization, time- and memory-based (2 w/ resource optimizat)
 			case 3:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O3_LOCAL_RESOURCE_TIME_MEMORY.ordinal());
 			break;

 			// opt level 3: global, time- and memory-based (all advanced rewrites)
 			case 4:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O4_GLOBAL_TIME_MEMORY.ordinal());
 				break;

 			// opt level 6 and7: SPOOF w/o fused operators, otherwise same as O2
 			// (hidden optimization levels not documented on purpose, as they will
 			// be removed once SPOOF is production ready)
 			case 6:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
 				ALLOW_AUTO_VECTORIZATION = false;
 				break;
 			case 7:
 				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
 				ALLOW_OPERATOR_FUSION = false;
 				ALLOW_AUTO_VECTORIZATION = false;
 				ALLOW_SUM_PRODUCT_REWRITES = false;
 				break;
 		}

 		//handle parallel text io (incl awareness of thread contention in <jdk8)
 		if (!dmlconf.getBooleanValue(DMLConfig.CP_PARALLEL_IO)) {
 			cconf.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, false);
 			cconf.set(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS, false);
 			cconf.set(ConfigType.PARALLEL_CP_READ_BINARYFORMATS, false);
 			cconf.set(ConfigType.PARALLEL_CP_WRITE_BINARYFORMATS, false);
 		}

 		//handle parallel matrix mult / rand configuration
 		if (!dmlconf.getBooleanValue(DMLConfig.CP_PARALLEL_OPS)) {
 			cconf.set(ConfigType.PARALLEL_CP_MATRIX_OPERATIONS, false);
 		}

 		return cconf;
 	}

 	public static void resetStaticCompilerFlags() {
 		//TODO this is a workaround for MLContext to avoid a major refactoring before the release; this method
 		//should be removed as soon all modified static variables are properly handled in the compiler config
 		ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
 		ALLOW_AUTO_VECTORIZATION = true;
 		ALLOW_BRANCH_REMOVAL = true;
 		ALLOW_CONSTANT_FOLDING = true;
 		ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = true;
 		ALLOW_INTER_PROCEDURAL_ANALYSIS = true;
 		ALLOW_LOOP_UPDATE_IN_PLACE = true;
 		ALLOW_OPERATOR_FUSION = true;
 		ALLOW_RAND_JOB_RECOMPILE = true;
 		ALLOW_SIZE_EXPRESSION_EVALUATION = true;
 		ALLOW_SPLIT_HOP_DAGS = true;
 		ALLOW_SUM_PRODUCT_REWRITES = true;
 		ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = true;
 		IPA_NUM_REPETITIONS = 3;
 	}

 	public static long getDefaultSize() {
 		//we need to set default_size larger than any execution context
 		//memory budget, however, it should not produce overflows on sum
 		return InfrastructureAnalyzer.getLocalMaxMemory();
 	}

 	public static void resetDefaultSize() {
 		DEFAULT_SIZE = getDefaultSize();
 	}


 	public static int getDefaultFrameSize() {
 		return DEFAULT_FRAME_BLOCKSIZE;
 	}

 	/**
 	 * Returns memory budget (according to util factor) in bytes
 	 *
 	 * @return local memory budget
 	 */
 	public static double getLocalMemBudget() {
 		double ret = InfrastructureAnalyzer.getLocalMaxMemory();
 		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
 	}

 	public static boolean isMaxLocalParallelism(int k) {
 		return InfrastructureAnalyzer.getLocalParallelism() == k;
 	}

 	public static boolean isTopLevelParFor() {
 		//since every local parfor with degree of parallelism k>1 changes the
 		//local memory budget, we can simply probe the current memory fraction
 		return InfrastructureAnalyzer.getLocalMaxMemoryFraction() >= 0.99;
 	}

 	public static boolean checkSparkBroadcastMemoryBudget( double size )
 	{
 		double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
 		double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

 		//basic requirement: the broadcast needs to to fit once in the remote broadcast memory
 		//and twice into the local memory budget because we have to create a partitioned broadcast
 		//memory and hand it over to the spark context as in-memory object
 		return ( size < memBudgetExec && 2*size < memBudgetLocal );
 	}

 	public static boolean checkSparkBroadcastMemoryBudget( long rlen, long clen, long blen, long nnz ) {
 		double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
 		double memBudgetLocal = OptimizerUtils.getLocalMemBudget();
 		double sp = getSparsity(rlen, clen, nnz);
 		double size = estimateSizeExactSparsity(rlen, clen, sp);
 		double sizeP = estimatePartitionedSizeExactSparsity(rlen, clen, blen, sp);
 		//basic requirement: the broadcast needs to to fit once in the remote broadcast memory
 		//and twice into the local memory budget because we have to create a partitioned broadcast
 		//memory and hand it over to the spark context as in-memory object
 		return (   OptimizerUtils.isValidCPDimensions(rlen, clen)
 				&& sizeP < memBudgetExec && size+sizeP < memBudgetLocal );
 	}

 	public static boolean checkSparkCollectMemoryBudget(DataCharacteristics dc, long memPinned ) {
 		if (dc instanceof MatrixCharacteristics) {
 			return checkSparkCollectMemoryBudget(dc.getRows(), dc.getCols(),
 				dc.getBlocksize(), dc.getNonZerosBound(), memPinned, false);
 		} else {
 			long[] dims = dc.getDims();
 			return checkSparkCollectMemoryBudget(dims, dc.getNonZeros(), memPinned, false);
 		}
 	}

 	public static boolean checkSparkCollectMemoryBudget(DataCharacteristics dc, long memPinned, boolean checkBP ) {
 		if (dc instanceof MatrixCharacteristics) {
 			return checkSparkCollectMemoryBudget(dc.getRows(), dc.getCols(),
 				dc.getBlocksize(), dc.getNonZerosBound(), memPinned, checkBP);
 		} else {
 			long[] dims = dc.getDims();
 			return checkSparkCollectMemoryBudget(dims, dc.getNonZeros(), memPinned, checkBP);
 		}
 	}

 	private static boolean checkSparkCollectMemoryBudget( long rlen, long clen, int blen, long nnz, long memPinned, boolean checkBP ) {
 		//compute size of output matrix and its blocked representation
 		double sp = getSparsity(rlen, clen, nnz);
 		double memMatrix = estimateSizeExactSparsity(rlen, clen, sp);
 		double memPMatrix = estimatePartitionedSizeExactSparsity(rlen, clen, blen, sp);
 		//check if both output matrix and partitioned matrix fit into local mem budget
 		return (memPinned + memMatrix + memPMatrix < getLocalMemBudget())
 		//check if the output matrix fits into the write buffer to avoid unnecessary evictions
 			&& (!checkBP || memMatrix < LazyWriteBuffer.getWriteBufferLimit());
 	}

 	private static boolean checkSparkCollectMemoryBudget( long[] dims, long nnz, long memPinned, boolean checkBP ) {
 		//compute size of output matrix and its blocked representation
 		//double sp = getSparsity(dims, nnz);
 		// TODO estimate exact size
 		long doubleSize = UtilFunctions.prod(dims) * 8;
 		double memTensor = doubleSize;
 		double memPTensor = doubleSize;
 		//check if both output matrix and partitioned matrix fit into local mem budget
 		return (memPinned + memTensor + memPTensor < getLocalMemBudget())
 				//check if the output matrix fits into the write buffer to avoid unnecessary evictions
 				&& (!checkBP || memTensor < LazyWriteBuffer.getWriteBufferLimit());
 	}

 	public static boolean checkSparseBlockCSRConversion( DataCharacteristics dcIn ) {
 		//we use the non-zero bound to make the important csr decision in
 		//an best effort manner (the precise non-zeros is irrelevant here)
 		double sp = OptimizerUtils.getSparsity(
 			dcIn.getRows(), dcIn.getCols(), dcIn.getNonZerosBound());
 		return Checkpoint.CHECKPOINT_SPARSE_CSR
 			&& sp < MatrixBlock.SPARSITY_TURN_POINT;
 	}

 	/**
 	 * Returns the number of reducers that potentially run in parallel.
 	 * This is either just the configured value (SystemDS config) or
 	 * the minimum of configured value and available reduce slots.
 	 *
 	 * @param configOnly true if configured value
 	 * @return number of reducers
 	 */
 	public static int getNumReducers( boolean configOnly ) {
 		if( isSparkExecutionMode() )
 			return SparkExecutionContext.getDefaultParallelism(false);
 		return InfrastructureAnalyzer.getLocalParallelism();
 	}

 	public static int getNumMappers() {
 		if( isSparkExecutionMode() )
 			return SparkExecutionContext.getDefaultParallelism(false);
 		return InfrastructureAnalyzer.getLocalParallelism();
 	}

 	public static ExecMode getDefaultExecutionMode() {
 		//default execution type is hybrid (cp+mr)
 		ExecMode ret = ExecMode.HYBRID;

 		//switch default to HYBRID (cp+spark) if in spark driver
 		String sparkenv = System.getenv().get("SPARK_ENV_LOADED");
 		if( sparkenv != null && sparkenv.equals("1") )
 			ret = ExecMode.HYBRID;

 		return ret;
 	}

 	public static boolean isSparkExecutionMode() {
 		return DMLScript.getGlobalExecMode() == ExecMode.SPARK
 			|| DMLScript.getGlobalExecMode() == ExecMode.HYBRID;
 	}

 	public static boolean isHybridExecutionMode() {
 		return DMLScript.getGlobalExecMode() == ExecMode.HYBRID;
 	}

 	/**
 	 * Returns the degree of parallelism used for parallel text read.
 	 * This is computed as the number of virtual cores scales by the
 	 * PARALLEL_READ_PARALLELISM_MULTIPLIER. If PARALLEL_READ_TEXTFORMATS
 	 * is disabled, this method returns 1.
 	 *
 	 * @return degree of parallelism
 	 */
 	public static int getParallelTextReadParallelism()
 	{
 		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS) )
 			return 1; // sequential execution

 		//compute degree of parallelism for parallel text read
 		double dop = InfrastructureAnalyzer.getLocalParallelism()
 				     * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
 		return (int) Math.round(dop);
 	}

 	public static int getParallelBinaryReadParallelism()
 	{
 		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_BINARYFORMATS) )
 			return 1; // sequential execution

 		//compute degree of parallelism for parallel text read
 		double dop = InfrastructureAnalyzer.getLocalParallelism()
 				     * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
 		return (int) Math.round(dop);
 	}

 	/**
 	 * Returns the degree of parallelism used for parallel text write.
 	 * This is computed as the number of virtual cores scales by the
 	 * PARALLEL_WRITE_PARALLELISM_MULTIPLIER. If PARALLEL_WRITE_TEXTFORMATS
 	 * is disabled, this method returns 1.
 	 *
 	 * @return degree of parallelism
 	 */
 	public static int getParallelTextWriteParallelism()
 	{
 		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS) )
 			return 1; // sequential execution

 		//compute degree of parallelism for parallel text read
 		double dop = InfrastructureAnalyzer.getLocalParallelism()
 				     * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
 		return (int) Math.round(dop);
 	}

 	public static int getParallelBinaryWriteParallelism()
 	{
 		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_BINARYFORMATS) )
 			return 1; // sequential execution

 		//compute degree of parallelism for parallel text read
 		double dop = InfrastructureAnalyzer.getLocalParallelism()
 				     * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
 		return (int) Math.round(dop);
 	}

 	////////////////////////
 	// Memory Estimates   //
 	////////////////////////

 	public static long estimateSize(DataCharacteristics dc) {
 		return estimateSizeExactSparsity(dc);
 	}

 	public static long estimateSizeExactSparsity(DataCharacteristics dc)
 	{
 		return estimateSizeExactSparsity(
 				dc.getRows(),
 				dc.getCols(),
 				dc.getNonZeros());
 	}

 	/**
 	 * Estimates the footprint (in bytes) for an in-memory representation of a
 	 * matrix with dimensions=(nrows,ncols) and and number of non-zeros nnz.
 	 *
 	 * @param nrows number of rows
 	 * @param ncols number of cols
 	 * @param nnz number of non-zeros
 	 * @return memory footprint
 	 */
 	public static long estimateSizeExactSparsity(long nrows, long ncols, long nnz)
 	{
 		double sp = getSparsity(nrows, ncols, nnz);
 		return estimateSizeExactSparsity(nrows, ncols, sp);
 	}

 	/**
 	 * Estimates the footprint (in bytes) for an in-memory representation of a
 	 * matrix with dimensions=(nrows,ncols) and sparsity=sp.
 	 *
 	 * This function can be used directly in Hops, when the actual sparsity is
 	 * known i.e., <code>sp</code> is guaranteed to give worst-case estimate
 	 * (e.g., Rand with a fixed sparsity). In all other cases, estimateSize()
 	 * must be used so that worst-case estimates are computed, whenever
 	 * applicable.
 	 *
 	 * @param nrows number of rows
 	 * @param ncols number of cols
 	 * @param sp sparsity
 	 * @return memory footprint
 	 */
 	public static long estimateSizeExactSparsity(long nrows, long ncols, double sp)
 	{
 		return MatrixBlock.estimateSizeInMemory(nrows,ncols,sp);
 	}

 	/**
 	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
 	 * matrix with the given matrix characteristics
 	 *
 	 * @param dc matrix characteristics
 	 * @return memory estimate
 	 */
 	public static long estimatePartitionedSizeExactSparsity(DataCharacteristics dc)
 	{
 		if (dc instanceof MatrixCharacteristics) {
 			return estimatePartitionedSizeExactSparsity(
 				dc.getRows(), dc.getCols(),
 				dc.getBlocksize(), dc.getNonZerosBound());
 		}
 		else {
 			// TODO estimate partitioned size exact for tensor
 			long inaccurateSize = 8; // 8 for double
 			for (int i = 0; i < dc.getNumDims(); i++) {
 				inaccurateSize *= dc.getDim(i);
 			}
 			return inaccurateSize;
 		}
 	}

 	/**
 	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
 	 * matrix with dimensions=(nrows,ncols) and number of non-zeros nnz.
 	 *
 	 * @param rlen number of rows
 	 * @param clen number of cols
 	 * @param blen rows/cols per block
 	 * @param nnz number of non-zeros
 	 * @return memory estimate
 	 */
 	public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long blen, long nnz)  {
 		double sp = getSparsity(rlen, clen, nnz);
 		return estimatePartitionedSizeExactSparsity(rlen, clen, blen, sp);
 	}

 	/**
 	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
 	 * matrix with dimensions=(nrows,ncols) and sparsity=sp.
 	 *
 	 * @param rlen number of rows
 	 * @param clen number of cols
 	 * @param blen rows/cols per block
 	 * @param sp sparsity
 	 * @return memory estimate
 	 */
 	public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long blen, double sp)
 	{
 		long ret = 0;

 		//check for guaranteed existence of empty blocks (less nnz than total number of blocks)
 		long tnrblks = (long)Math.ceil((double)rlen/blen);
 		long tncblks = (long)Math.ceil((double)clen/blen);
 		long nnz = (long) Math.ceil(sp * rlen * clen);
 		if( nnz <= tnrblks * tncblks ) {
 			long lrlen = Math.min(rlen, blen);
 			long lclen = Math.min(clen, blen);
 			return nnz * estimateSizeExactSparsity(lrlen, lclen, 1)
 				 + (tnrblks * tncblks - nnz) * estimateSizeEmptyBlock(lrlen, lclen);
 		}

 		//estimate size of full blen x blen blocks
 		long nrblks = rlen / blen;
 		long ncblks = clen / blen;
 		if( nrblks * ncblks > 0 )
 			ret += nrblks * ncblks * estimateSizeExactSparsity(blen, blen, sp);

 		//estimate size of bottom boundary blocks
 		long lrlen = rlen % blen;
 		if( ncblks > 0 && lrlen >= 0 )
 			ret += ncblks * estimateSizeExactSparsity(lrlen, blen, sp);

 		//estimate size of right boundary blocks
 		long lclen = clen % blen;
 		if( nrblks > 0 && lclen >= 0 )
 			ret += nrblks * estimateSizeExactSparsity(blen, lclen, sp);

 		//estimate size of bottom right boundary block
 		if( lrlen >= 0 && lclen >= 0  )
 			ret += estimateSizeExactSparsity(lrlen, lclen, sp);

 		return ret;
 	}

 	/**
 	 * Similar to estimate() except that it provides worst-case estimates
 	 * when the optimization type is ROBUST.
 	 *
 	 * @param nrows number of rows
 	 * @param ncols number of cols
 	 * @return memory estimate
 	 */
 	public static long estimateSize(long nrows, long ncols)
 	{
 		return estimateSizeExactSparsity(nrows, ncols, 1.0);
 	}

 	public static long estimateSizeEmptyBlock(long nrows, long ncols)
 	{
 		return estimateSizeExactSparsity(0, 0, 0.0d);
 	}

 	public static long estimateSizeTextOutput(long rows, long cols, long nnz, FileFormat fmt) {
 		long bsize = MatrixBlock.estimateSizeOnDisk(rows, cols, nnz);
 		if( fmt.isIJVFormat() )
 			return bsize * 3;
 		else if( fmt == FileFormat.LIBSVM )
 			return Math.round(bsize * 2.5);
 		else if( fmt == FileFormat.CSV )
 			return bsize * 2;
 		return bsize;
 	}

 	public static long estimateSizeTextOutput(int[] dims, long nnz, FileFormat fmt) {
 		// TODO accurate estimation
 		if( fmt == FileFormat.TEXT )
 			// nnz * (8 bytes for number + each dimension with an expected String length of 3 and one space)
 			return nnz * (8 + dims.length * 4); // very simple estimation. example:100 100 1.345678
 		throw new DMLRuntimeException("Tensor output format not implemented.");
 	}

 	public static double getTotalMemEstimate(Hop[] in, Hop out) {
 		return getTotalMemEstimate(in, out, false);
 	}

 	public static double getTotalMemEstimate(Hop[] in, Hop out, boolean denseOut) {
 		return Arrays.stream(in)
 			.mapToDouble(h -> h.getOutputMemEstimate()).sum()
 			+ (!denseOut ? out.getOutputMemEstimate() :
 				OptimizerUtils.estimateSize(out.getDim1(), out.getDim2()));
 	}

 	/**
 	 * Indicates if the given indexing range is block aligned, i.e., it does not require
 	 * global aggregation of blocks.
 	 *
 	 * @param ixrange indexing range
 	 * @param mc matrix characteristics
 	 * @return true if indexing range is block aligned
 	 */
 	public static boolean isIndexingRangeBlockAligned(IndexRange ixrange, DataCharacteristics mc) {
 		long rl = ixrange.rowStart;
 		long ru = ixrange.rowEnd;
 		long cl = ixrange.colStart;
 		long cu = ixrange.colEnd;
 		long blen = mc.getBlocksize();
 		return isIndexingRangeBlockAligned(rl, ru, cl, cu, blen);
 	}

 	/**
 	 * Indicates if the given indexing range is block aligned, i.e., it does not require
 	 * global aggregation of blocks.
 	 *
 	 * @param rl rows lower
 	 * @param ru rows upper
 	 * @param cl cols lower
 	 * @param cu cols upper
 	 * @param blen rows/cols per block
 	 * @return true if indexing range is block aligned
 	 */
 	public static boolean isIndexingRangeBlockAligned(long rl, long ru, long cl, long cu, long blen) {
 		return rl != -1 && ru != -1 && cl != -1 && cu != -1
 				&&((rl-1)%blen == 0 && (cl-1)%blen == 0
 				|| (rl-1)/blen == (ru-1)/blen && (cl-1)%blen == 0
 				|| (rl-1)%blen == 0 && (cl-1)/blen == (cu-1)/blen);
 	}

 	public static boolean isValidCPDimensions( DataCharacteristics mc ) {
 		return isValidCPDimensions(mc.getRows(), mc.getCols());
 	}

 	/**
 	 * Returns false if dimensions known to be invalid; other true
 	 *
 	 * @param rows number of rows
 	 * @param cols number of cols
 	 * @return true if dimensions valid
 	 */
 	public static boolean isValidCPDimensions( long rows, long cols )
 	{
 		//the current CP runtime implementation requires that rows and cols
 		//are integers since we use a single matrixblock to represent the
 		//entire matrix
 		return (rows <= Integer.MAX_VALUE && cols<=Integer.MAX_VALUE);
 	}

 	/**
 	 * Returns false if schema and names are not properly specified; other true
 	 * Length to be &gt; 0, and length of both to be equal.
 	 *
 	 * @param schema the schema
 	 * @param names the names
 	 * @return false if schema and names are not properly specified
 	 */
 	public static boolean isValidCPDimensions( ValueType[] schema, String[] names )
 	{
 		// Length of schema and names to be same, and > 0.
 		return (schema != null && names != null && schema.length > 0 && schema.length == names.length);
 	}

 	/**
 	 * Determines if valid matrix size to be represented in CP data structures. Note that
 	 * sparsity needs to be specified as rows*cols if unknown.
 	 *
 	 * @param rows number of rows
 	 * @param cols number of cols
 	 * @param sparsity the sparsity
 	 * @return true if valid matrix size
 	 */
 	public static boolean isValidCPMatrixSize( long rows, long cols, double sparsity )
 	{
 		boolean ret = true;

 		//the current CP runtime implementation has several limitations:
 		//1) for dense: 16GB because we use a linearized array (bounded to int in java)
 		//2) for sparse: 2G x 2G nnz because (1) nnz maintained as long, (2) potential changes
 		//   to dense, and (3) sparse row arrays also of max int size (worst case in case of skew)
 		long nnz = (long)(sparsity * rows * cols);
 		boolean sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, nnz);

 		if( sparse ) //SPARSE
 		{
 			//check max nnz (dependent on sparse block format)
 			ret = (nnz <= MAX_NNZ_CP_SPARSE);
 		}
 		else //DENSE
 		{
 			//check number of matrix cell
 			ret = ((rows * cols) <= MAX_NUMCELLS_CP_DENSE);
 		}

 		return ret;
 	}

 	/**
 	 * Indicates if the given matrix characteristics exceed the threshold for
 	 * caching, i.e., the matrix should be cached.
 	 *
 	 * @param dim2 dimension 2
 	 * @param outMem ?
 	 * @return true if the given matrix characteristics exceed threshold
 	 */
 	public static boolean exceedsCachingThreshold(long dim2, double outMem) {
 		//NOTE: We heuristically cache matrices that are close to or larger
 		//than the local memory budget. The different relative fractions
 		//according to number of columns is reflecting common operations
 		//(e.g., two inputs/one output for binary vector operations)
 		return !(dim2 > 1 && outMem < getLocalMemBudget()/2
 			|| dim2 == 1 && outMem < getLocalMemBudget()/3);
 	}

 	/**
 	 * Wrapper over internal filename construction for external usage.
 	 *
 	 * @return unique temp file name
 	 */
 	public static String getUniqueTempFileName() {
 		return ConfigurationManager.getScratchSpace()
 			+ Lop.FILE_SEPARATOR + Lop.PROCESS_PREFIX + DMLScript.getUUID()
 			+ Lop.FILE_SEPARATOR + Lop.CP_ROOT_THREAD_ID + Lop.FILE_SEPARATOR
 			+ Dag.getNextUniqueFilenameSuffix();
 	}

 	public static boolean allowsToFilterEmptyBlockOutputs( Hop hop ) {
 		boolean ret = true;
 		for( Hop p : hop.getParent() ) {
 			p.optFindExecType(); //ensure exec type evaluated
 			ret &= ( p.getExecType()==ExecType.CP
 				||(p instanceof AggBinaryOp && allowsToFilterEmptyBlockOutputs(p) )
 				||(HopRewriteUtils.isReorg(p, ReOrgOp.RESHAPE, ReOrgOp.TRANS) && allowsToFilterEmptyBlockOutputs(p) )
 				||(HopRewriteUtils.isData(p, OpOpData.PERSISTENTWRITE) && ((DataOp)p).getInputFormatType()==FileFormat.TEXT))
 				&& !(p instanceof FunctionOp || (p instanceof DataOp && ((DataOp)p).getInputFormatType()!=FileFormat.TEXT) ); //no function call or transient write
 		}
 		return ret;
 	}

 	public static int getConstrainedNumThreads(int maxNumThreads)
 	{
 		//by default max local parallelism (vcores)
 		int ret = InfrastructureAnalyzer.getLocalParallelism();

 		//apply external max constraint (e.g., set by parfor or other rewrites)
 		if( maxNumThreads > 0 ) {
 			ret = Math.min(ret, maxNumThreads);
 		}

 		//apply global multi-threading constraint
 		if( !ConfigurationManager.isParallelMatrixOperations() ) {
 			ret = 1;
 		}

 		return ret;
 	}

 	public static Level getDefaultLogLevel() {
 		Level log = Logger.getRootLogger().getLevel();
 		return (log != null) ? log : Level.INFO;
 	}

 	////////////////////////
 	// Sparsity Estimates //
 	////////////////////////

 	public static long getMatMultNnz(double sp1, double sp2, long m, long k, long n, boolean worstcase) {
 		return getNnz( m, n, getMatMultSparsity(sp1, sp2, m, k, n, worstcase));
 	}

 	/**
 	 * Estimates the result sparsity for Matrix Multiplication A %*% B.
 	 *
 	 * @param sp1 sparsity of A
 	 * @param sp2 sparsity of B
 	 * @param m nrow(A)
 	 * @param k ncol(A), nrow(B)
 	 * @param n ncol(B)
 	 * @param worstcase true if worst case
 	 * @return the sparsity
 	 */
 	public static double getMatMultSparsity(double sp1, double sp2, long m, long k, long n, boolean worstcase) {
 		if( worstcase ){
 			double nnz1 = sp1 * m * k;
 			double nnz2 = sp2 * k * n;
 			return Math.min(1, nnz1/m) * Math.min(1, nnz2/n);
 		}
 		else
 			return 1 - Math.pow(1-sp1*sp2, k);
 	}

 	public static double getLeftIndexingSparsity( long rlen1, long clen1, long nnz1, long rlen2, long clen2, long nnz2 )
 	{
 		boolean scalarRhs = (rlen2==0 && clen2==0);

 		//infer output worstcase output nnz
 		long lnnz = -1;
 		if( nnz1>=0 && scalarRhs )
 			lnnz = nnz1+1;             // nnz(left) + scalar
 		else if( nnz1>=0 && nnz2>=0 )
 			lnnz = nnz1 + nnz2;        // nnz(left) + nnz(right)
 		else if( nnz1>=0 && rlen2>0 && clen2>0 )
 			lnnz = nnz1 + rlen2*clen2; // nnz(left) + nnz(right_dense)
 		lnnz = Math.min(lnnz, rlen1*clen1);

 		return getSparsity(rlen1, clen1, (lnnz>=0) ? lnnz : rlen1*clen1);
 	}

 	/**
 	 * Determines if a given binary op is potentially conditional sparse safe.
 	 *
 	 * @param op the HOP OpOp2
 	 * @return true if potentially conditional sparse safe
 	 */
 	public static boolean isBinaryOpConditionalSparseSafe( OpOp2 op )
 	{
 		return (   op==OpOp2.GREATER
 			    || op==OpOp2.LESS
 			    || op==OpOp2.NOTEQUAL
 			    || op==OpOp2.EQUAL
 			    || op==OpOp2.MINUS);
 	}

 	/**
 	 * Determines if a given binary op with scalar literal guarantee an output
 	 * sparsity which is exactly the same as its matrix input sparsity.
 	 *
 	 * @param op the HOP OpOp2
 	 * @param lit literal operator
 	 * @return true if output sparsity same as matrix input sparsity
 	 */
 	public static boolean isBinaryOpConditionalSparseSafeExact( OpOp2 op, LiteralOp lit )
 	{
 		double val = HopRewriteUtils.getDoubleValueSafe(lit);

 		return ( op==OpOp2.NOTEQUAL && val==0);
 	}

 	public static boolean isBinaryOpSparsityConditionalSparseSafe( OpOp2 op, LiteralOp lit ) {
 		double val = HopRewriteUtils.getDoubleValueSafe(lit);
 		return (  (op==OpOp2.GREATER  && val==0)
 				||(op==OpOp2.LESS     && val==0)
 				||(op==OpOp2.NOTEQUAL && val==0)
 				||(op==OpOp2.EQUAL    && val!=0)
 				||(op==OpOp2.MINUS    && val==0)
 				||(op==OpOp2.PLUS     && val==0)
 				||(op==OpOp2.POW      && val!=0)
 				||(op==OpOp2.MAX      && val<=0)
 				||(op==OpOp2.MIN      && val>=0));
 	}

 	public static double getBinaryOpSparsityConditionalSparseSafe( double sp1, OpOp2 op, LiteralOp lit ) {
 		return isBinaryOpSparsityConditionalSparseSafe(op, lit) ? sp1 : 1.0;
 	}

 	/**
 	 * Estimates the result sparsity for matrix-matrix binary operations (A op B)
 	 *
 	 * @param sp1 sparsity of A
 	 * @param sp2 sparsity of B
 	 * @param op binary operation
 	 * @param worstcase true if worst case
 	 * @return result sparsity for matrix-matrix binary operations
 	 */
 	public static double getBinaryOpSparsity(double sp1, double sp2, OpOp2 op, boolean worstcase)
 	{
 		// default is worst-case estimate for robustness
 		double ret = 1.0;

 		if( worstcase )
 		{
 			//NOTE: for matrix-scalar operations this estimate is too conservative, because
 			//Math.min(1, sp1 + sp2) will always give a sparsity 1 if we pass sp2=1 for scalars.
 			//In order to do better (with guarantees), we need to take the actual values into account
 			switch(op) {
 				case PLUS:
 				case MINUS:
 				case LESS:
 				case GREATER:
 				case NOTEQUAL:
 				case MIN:
 				case MAX:
 				case OR:
 					ret = worstcase ? Math.min(1, sp1 + sp2) :
 						sp1 + sp2 - sp1 * sp2; break;
 				case MULT:
 				case AND:
 					ret = worstcase ? Math.min(sp1, sp2) :
 						sp1 * sp2; break;
 				case DIV:
 					ret = Math.min(1, sp1 + (1-sp2)); break;
 				case MODULUS:
 				case POW:
 				case MINUS_NZ:
 				case LOG_NZ:
 					ret = sp1; break;
 				//case EQUAL: //doesnt work on worstcase estimates, but on
 				//	ret = 1-Math.abs(sp1-sp2); break;
 				default:
 					ret = 1.0;
 			}
 		}
 		else
 		{
 			switch(op) {
 				case PLUS:
 				case MINUS:
 					// result[i,j] != 0 iff A[i,j] !=0 || B[i,j] != 0
 					// worst case estimate = sp1+sp2
 					ret = (1 - (1-sp1)*(1-sp2));
 					break;
 				case MULT:
 					// result[i,j] != 0 iff A[i,j] !=0 && B[i,j] != 0
 					// worst case estimate = min(sp1,sp2)
 					ret = sp1 * sp2;
 					break;
 				case DIV:
 					ret = 1.0; // worst case estimate
 					break;
 				case LESS:
 				case LESSEQUAL:
 				case GREATER:
 				case GREATEREQUAL:
 				case EQUAL:
 				case NOTEQUAL:
 					ret = 1.0; // purely data-dependent operations, and hence worse-case estimate
 					break;
 				//MIN, MAX, AND, OR, LOG, POW
 				default:
 					ret = 1.0;
 			}
 		}

 		return ret;
 	}

 	public static long getOuterNonZeros(long n1, long n2, long nnz1, long nnz2, OpOp2 op) {
 		if( nnz1 < 0 || nnz2 < 0 || op == null )
 			return n1 * n2;
 		switch(op) {
 			case PLUS:
 			case MINUS:
 			case LESS:
 			case GREATER:
 			case NOTEQUAL:
 			case MIN:
 			case MAX:
 			case OR:
 				return n1 * n2
 					- (n1-nnz1) * (n2-nnz2);
 			case MULT:
 			case AND:
 				return nnz1 * nnz2;
 			default:
 				return n1 * n2;
 		}
 	}

 	public static long getNnz(long dim1, long dim2, double sp) {
 		return Math.round(sp * dim1 * dim2);
 	}

 	public static double getSparsity( DataCharacteristics dc ) {
 		return getSparsity(dc.getRows(), dc.getCols(), dc.getNonZeros());
 	}

 	public static double getSparsity( long dim1, long dim2, long nnz ) {
 		return ( dim1<=0 || dim2<=0 || nnz<0 ) ? 1.0 :
 			Math.min(((double)nnz)/dim1/dim2, 1.0);
 	}

 	public static double getSparsity(long[] dims, long nnz) {
 		double sparsity = nnz;
 		for (long dim : dims) {
 			if (dim <= 0) {
 				return 1.0;
 			}
 			sparsity /= dim;
 		}
 		return Math.min(sparsity, 1.0);
 	}

 	public static String toMB(double inB) {
 		if ( inB < 0 )
 			return "-";
 		return String.format("%.0f", inB/(1024*1024) );
 	}

 	public static long getNumIterations(ForProgramBlock fpb, long defaultValue) {
 		if( fpb.getStatementBlock()==null )
 			return defaultValue;
 		ForStatementBlock fsb = (ForStatementBlock) fpb.getStatementBlock();
 		try {
 			HashMap<Long,Long> memo = new HashMap<>();
 			long from = rEvalSimpleLongExpression(fsb.getFromHops().getInput().get(0), memo);
 			long to = rEvalSimpleLongExpression(fsb.getToHops().getInput().get(0), memo);
 			long increment = (fsb.getIncrementHops()==null) ? (from < to) ? 1 : -1 :
 				rEvalSimpleLongExpression(fsb.getIncrementHops().getInput().get(0), memo);
 			if( from != Long.MAX_VALUE && to != Long.MAX_VALUE && increment != Long.MAX_VALUE )
 				return (int)Math.ceil(((double)(to-from+1))/increment);
 		}
 		catch(Exception ex){}
 		return defaultValue;
 	}

 	public static long getNumIterations(ForProgramBlock fpb, LocalVariableMap vars, long defaultValue) {
 		if( fpb.getStatementBlock()==null )
 			return defaultValue;
 		ForStatementBlock fsb = (ForStatementBlock) fpb.getStatementBlock();
 		try {
 			HashMap<Long,Long> memo = new HashMap<>();
 			long from = rEvalSimpleLongExpression(fsb.getFromHops().getInput().get(0), memo, vars);
 			long to = rEvalSimpleLongExpression(fsb.getToHops().getInput().get(0), memo, vars);
 			long increment = (fsb.getIncrementHops()==null) ? (from < to) ? 1 : -1 :
 				rEvalSimpleLongExpression(fsb.getIncrementHops().getInput().get(0), memo);
 			if( from != Long.MAX_VALUE && to != Long.MAX_VALUE && increment != Long.MAX_VALUE )
 				return (int)Math.ceil(((double)(to-from+1))/increment);
 		}
 		catch(Exception ex){}
 		return defaultValue;
 	}

 	/**
 	 * Function to evaluate simple size expressions over literals and now/ncol.
 	 *
 	 * It returns the exact results of this expressions if known, otherwise
 	 * Long.MAX_VALUE if unknown.
 	 *
 	 * @param root the root high-level operator
 	 * @param valMemo ?
 	 * @return size expression
 	 */
 	public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo )
 	{
 		long ret = Long.MAX_VALUE;

 		//for simplicity and robustness call double and cast.
 		HashMap<Long, Double> dvalMemo = new HashMap<>();
 		double tmp = rEvalSimpleDoubleExpression(root, dvalMemo);
 		if( tmp!=Double.MAX_VALUE )
 			ret = UtilFunctions.toLong( tmp );

 		return ret;
 	}

 	public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo, LocalVariableMap vars )
 	{
 		long ret = Long.MAX_VALUE;

 		//for simplicity and robustness call double and cast.
 		HashMap<Long, Double> dvalMemo = new HashMap<>();
 		double tmp = rEvalSimpleDoubleExpression(root, dvalMemo, vars);
 		if( tmp!=Double.MAX_VALUE )
 			ret = UtilFunctions.toLong( tmp );

 		return ret;
 	}

 	public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		//always use constants
 		if( root instanceof LiteralOp )
 			ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);

 		//advanced size expression evaluation
 		if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION )
 		{
 			if( root instanceof UnaryOp )
 				ret = rEvalSimpleUnaryDoubleExpression(root, valMemo);
 			else if( root instanceof BinaryOp )
 				ret = rEvalSimpleBinaryDoubleExpression(root, valMemo);
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}

 	public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION )
 		{
 			if( root instanceof LiteralOp )
 				ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
 			else if( root instanceof UnaryOp )
 				ret = rEvalSimpleUnaryDoubleExpression(root, valMemo, vars);
 			else if( root instanceof BinaryOp )
 				ret = rEvalSimpleBinaryDoubleExpression(root, valMemo, vars);
 			else if( root instanceof DataOp ) {
 				String name = root.getName();
 				Data dat = vars.get(name);
 				if( dat!=null && dat instanceof ScalarObject )
 					ret = ((ScalarObject)dat).getDoubleValue();
 			}
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}

 	protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		UnaryOp uroot = (UnaryOp) root;
 		Hop input = uroot.getInput().get(0);

 		if(uroot.getOp() == OpOp1.NROW)
 			ret = input.rowsKnown() ? input.getDim1() : Double.MAX_VALUE;
 		else if( uroot.getOp() == OpOp1.NCOL )
 			ret = input.colsKnown() ? input.getDim2() : Double.MAX_VALUE;
 		else
 		{
 			double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo);
 			if( lval != Double.MAX_VALUE )
 			{
 				switch( uroot.getOp() )
 				{
 					case SQRT:  ret = Math.sqrt(lval); break;
 					case ROUND: ret = Math.round(lval); break;
 					case CEIL:  ret = Math.ceil(lval); break;
 					case FLOOR: ret = Math.floor(lval); break;
 					case CAST_AS_BOOLEAN: ret = (lval!=0)? 1 : 0; break;
 					case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
 					case CAST_AS_DOUBLE: ret = lval; break;
 					default: ret = Double.MAX_VALUE;
 				}
 			}
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}

 	protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		UnaryOp uroot = (UnaryOp) root;
 		Hop input = uroot.getInput().get(0);

 		if(uroot.getOp() == OpOp1.NROW)
 			ret = input.rowsKnown() ? input.getDim1() : Double.MAX_VALUE;
 		else if( uroot.getOp() == OpOp1.NCOL )
 			ret = input.colsKnown() ? input.getDim2() : Double.MAX_VALUE;
 		else
 		{
 			double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo, vars);
 			if( lval != Double.MAX_VALUE )
 			{
 				switch( uroot.getOp() )
 				{
 					case SQRT:  ret = Math.sqrt(lval); break;
 					case ROUND: ret = Math.round(lval); break;
 					case CEIL:  ret = Math.ceil(lval); break;
 					case FLOOR: ret = Math.floor(lval); break;
 					case CAST_AS_BOOLEAN: ret = (lval!=0)? 1 : 0; break;
 					case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
 					case CAST_AS_DOUBLE: ret = lval; break;
 					default: ret = Double.MAX_VALUE;
 				}
 			}
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}

 	protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		BinaryOp broot = (BinaryOp) root;

 		double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo);
 		double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo);
 		//note: positive and negative values might be valid subexpressions
 		if( lret!=Double.MAX_VALUE && rret!=Double.MAX_VALUE ) //if known
 		{
 			switch( broot.getOp() )
 			{
 				case PLUS:	ret = lret + rret; break;
 				case MINUS:	ret = lret - rret; break;
 				case MULT:  ret = lret * rret; break;
 				case DIV:   ret = lret / rret; break;
 				case MIN:   ret = Math.min(lret, rret); break;
 				case MAX:   ret = Math.max(lret, rret); break;
 				case POW:   ret = Math.pow(lret, rret); break;
 				//special mod / inddiv for runtime consistency
 				case MODULUS: ret = Modulus.getFnObject().execute(lret, rret); break;
 				case INTDIV:  ret = IntegerDivide.getFnObject().execute(lret, rret); break;
 				default: ret = Double.MAX_VALUE;
 			}
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}

 	protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
 	{
 		//memoization (prevent redundant computation of common subexpr)
 		if( valMemo.containsKey(root.getHopID()) )
 			return valMemo.get(root.getHopID());

 		double ret = Double.MAX_VALUE;

 		BinaryOp broot = (BinaryOp) root;

 		double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo, vars);
 		double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo, vars);
 		//note: positive and negative values might be valid subexpressions
 		if( lret!=Double.MAX_VALUE && rret!=Double.MAX_VALUE ) //if known
 		{
 			switch( broot.getOp() )
 			{
 				case PLUS:	ret = lret + rret; break;
 				case MINUS:	ret = lret - rret; break;
 				case MULT:  ret = lret * rret; break;
 				case DIV:   ret = lret / rret; break;
 				case MIN:   ret = Math.min(lret, rret); break;
 				case MAX:   ret = Math.max(lret, rret); break;
 				case POW:   ret = Math.pow(lret, rret); break;
 				//special mod / inddiv for runtime consistency
 				case MODULUS: ret = Modulus.getFnObject().execute(lret, rret); break;
 				case INTDIV:  ret = IntegerDivide.getFnObject().execute(lret, rret); break;
 				default: ret = Double.MAX_VALUE;
 			}
 		}

 		valMemo.put(root.getHopID(), ret);
 		return ret;
 	}
 }