| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.matrix.data; |
| |
| import org.apache.commons.lang3.ArrayUtils; |
| import org.apache.commons.lang3.concurrent.ConcurrentUtils; |
| import org.apache.commons.math3.random.Well1024a; |
| import org.apache.hadoop.io.DataInputBuffer; |
| import org.apache.sysds.common.Types.BlockType; |
| import org.apache.sysds.common.Types.CorrectionLocationType; |
| import org.apache.sysds.conf.ConfigurationManager; |
| import org.apache.sysds.hops.OptimizerUtils; |
| import org.apache.sysds.lops.MMTSJ.MMTSJType; |
| import org.apache.sysds.lops.MapMultChain.ChainType; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.controlprogram.caching.CacheBlock; |
| import org.apache.sysds.runtime.controlprogram.caching.LazyWriteBuffer; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType; |
| import org.apache.sysds.runtime.data.DenseBlock; |
| import org.apache.sysds.runtime.data.DenseBlockFactory; |
| import org.apache.sysds.runtime.data.SparseBlock; |
| import org.apache.sysds.runtime.data.SparseBlockCOO; |
| import org.apache.sysds.runtime.data.SparseBlockCSR; |
| import org.apache.sysds.runtime.data.SparseBlockFactory; |
| import org.apache.sysds.runtime.data.SparseBlockMCSR; |
| import org.apache.sysds.runtime.data.SparseRow; |
| import org.apache.sysds.runtime.functionobjects.Builtin; |
| import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode; |
| import org.apache.sysds.runtime.functionobjects.CM; |
| import org.apache.sysds.runtime.functionobjects.CTable; |
| import org.apache.sysds.runtime.functionobjects.DiagIndex; |
| import org.apache.sysds.runtime.functionobjects.Divide; |
| import org.apache.sysds.runtime.functionobjects.FunctionObject; |
| import org.apache.sysds.runtime.functionobjects.IfElse; |
| import org.apache.sysds.runtime.functionobjects.KahanFunction; |
| import org.apache.sysds.runtime.functionobjects.KahanPlus; |
| import org.apache.sysds.runtime.functionobjects.KahanPlusSq; |
| import org.apache.sysds.runtime.functionobjects.MinusMultiply; |
| import org.apache.sysds.runtime.functionobjects.Multiply; |
| import org.apache.sysds.runtime.functionobjects.Plus; |
| import org.apache.sysds.runtime.functionobjects.PlusMultiply; |
| import org.apache.sysds.runtime.functionobjects.ReduceAll; |
| import org.apache.sysds.runtime.functionobjects.ReduceCol; |
| import org.apache.sysds.runtime.functionobjects.ReduceRow; |
| import org.apache.sysds.runtime.functionobjects.RevIndex; |
| import org.apache.sysds.runtime.functionobjects.SortIndex; |
| import org.apache.sysds.runtime.functionobjects.SwapIndex; |
| import org.apache.sysds.runtime.functionobjects.TernaryValueFunction.ValueFunctionWithConstant; |
| import org.apache.sysds.runtime.instructions.InstructionUtils; |
| import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; |
| import org.apache.sysds.runtime.instructions.cp.KahanObject; |
| import org.apache.sysds.runtime.instructions.cp.ScalarObject; |
| import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue; |
| import org.apache.sysds.runtime.io.IOUtilFunctions; |
| import org.apache.sysds.runtime.matrix.data.LibMatrixBincell.BinaryAccessType; |
| import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.AggregateOperator; |
| import org.apache.sysds.runtime.matrix.operators.AggregateTernaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.BinaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.CMOperator; |
| import org.apache.sysds.runtime.matrix.operators.CMOperator.AggregateOperationTypes; |
| import org.apache.sysds.runtime.matrix.operators.COVOperator; |
| import org.apache.sysds.runtime.matrix.operators.Operator; |
| import org.apache.sysds.runtime.matrix.operators.QuaternaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.ReorgOperator; |
| import org.apache.sysds.runtime.matrix.operators.ScalarOperator; |
| import org.apache.sysds.runtime.matrix.operators.SimpleOperator; |
| import org.apache.sysds.runtime.matrix.operators.TernaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.UnaryOperator; |
| import org.apache.sysds.runtime.meta.DataCharacteristics; |
| import org.apache.sysds.runtime.meta.MatrixCharacteristics; |
| import org.apache.sysds.runtime.util.DataConverter; |
| import org.apache.sysds.runtime.util.FastBufferedDataInputStream; |
| import org.apache.sysds.runtime.util.FastBufferedDataOutputStream; |
| import org.apache.sysds.runtime.util.HDFSTool; |
| import org.apache.sysds.runtime.util.IndexRange; |
| import org.apache.sysds.runtime.util.UtilFunctions; |
| import org.apache.sysds.utils.NativeHelper; |
| |
| import java.io.DataInput; |
| import java.io.DataOutput; |
| import java.io.Externalizable; |
| import java.io.IOException; |
| import java.io.ObjectInput; |
| import java.io.ObjectInputStream; |
| import java.io.ObjectOutput; |
| import java.io.ObjectOutputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.concurrent.ExecutorService; |
| import java.util.concurrent.Future; |
| import java.util.stream.IntStream; |
| |
| |
| public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizable |
| { |
	private static final long serialVersionUID = 7319972089143154056L;
	
	//sparsity nnz threshold, based on practical experiments on space consumption and performance
	public static final double SPARSITY_TURN_POINT = 0.4;
	//sparsity threshold for ultra-sparse matrix operations (40nnz in a 1kx1k block)
	public static final double ULTRA_SPARSITY_TURN_POINT = 0.00004;
	//second, less strict ultra-sparsity threshold (10x the primary threshold)
	public static final double ULTRA_SPARSITY_TURN_POINT2 = 0.0004;
	//absolute nnz threshold for ultra-sparse blocks (see isUltraSparse)
	public static final int ULTRA_SPARSE_BLOCK_NNZ = 40;
	//default sparse block type: modified compressed sparse rows, for efficient incremental construction
	public static final SparseBlock.Type DEFAULT_SPARSEBLOCK = SparseBlock.Type.MCSR;
	//default sparse block type for update in place: compressed sparse rows, to prevent serialization
	public static final SparseBlock.Type DEFAULT_INPLACE_SPARSEBLOCK = SparseBlock.Type.CSR;
	//allowed overhead for shallow serialize in terms of in-memory-size/x <= serialized-size
	public static final double MAX_SHALLOW_SERIALIZE_OVERHEAD = 2; //2x size of serialized
	//flag if MCSR blocks that do not qualify for shallow serialize should be converted to CSR
	public static final boolean CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE = true;
	//basic header (int rlen, int clen, byte type)
	public static final int HEADER_SIZE = 9;
	
	//matrix meta data
	protected int rlen = -1;         //number of rows
	protected int clen = -1;         //number of columns
	protected boolean sparse = true; //current representation (true: sparse)
	protected long nonZeros = 0;     //maintained number of non-zero values
	
	//matrix data (sparse or dense; at most one is in use per format)
	protected DenseBlock denseBlock = null;
	protected SparseBlock sparseBlock = null;
	
	//sparse-block-specific attributes (allocation only)
	protected int estimatedNNzsPerRow = -1;
| |
| //////// |
| // Matrix Constructors |
| // |
| |
	/** Creates an empty 0x0 sparse block. */
	public MatrixBlock() {
		this(0, 0, true, -1);
	}
| |
	/** Creates an empty rl x cl block with the given format (no nnz estimate). */
	public MatrixBlock(int rl, int cl, boolean sp) {
		this(rl, cl, sp, -1);
	}
| |
	/** Creates an empty rl x cl block; format chosen from the nnz estimate. */
	public MatrixBlock(int rl, int cl, long estnnz) {
		this(rl, cl, evalSparseFormatInMemory(rl, cl, estnnz), estnnz);
	}
| |
	/** Creates an empty rl x cl block with the given format and nnz estimate. */
	public MatrixBlock(int rl, int cl, boolean sp, long estnnz) {
		reset(rl, cl, sp, estnnz, 0);
	}
| |
	/** Copy constructor (copy semantics delegated to copy(that)). */
	public MatrixBlock(MatrixBlock that) {
		copy(that);
	}
| |
	/** Creates a dense 1x1 block holding the given scalar value. */
	public MatrixBlock(double val) {
		reset(1, 1, false, 1, val);
	}
| |
	/** Creates a dense rl x cl block with all cells set to the given value. */
	public MatrixBlock(int rl, int cl, double val) {
		reset(rl, cl, false, (long)rl*cl, val);
	}
| |
	/**
	 * Constructs a sparse {@link MatrixBlock} with a given instance of a {@link SparseBlock}
	 * @param rl number of rows
	 * @param cl number of columns
	 * @param nnz number of non zeroes
	 * @param sblock sparse block
	 */
	public MatrixBlock(int rl, int cl, long nnz, SparseBlock sblock) {
		this(rl, cl, true, nnz);
		nonZeros = nnz; //restore nnz (reset in delegated constructor)
		sparseBlock = sblock; //shallow handle, no copy
	}
| |
	/**
	 * Constructs a sparse block as a copy of the given sparse block,
	 * converted to the requested sparse block type.
	 *
	 * @param that source sparse matrix block
	 * @param stype target sparse block type
	 * @param deep if true, deep copy the sparse block contents
	 */
	public MatrixBlock(MatrixBlock that, SparseBlock.Type stype, boolean deep) {
		this(that.rlen, that.clen, that.sparse);
		
		//sanity check sparse matrix block
		if( !that.isInSparseFormat() )
			throw new RuntimeException("Sparse matrix block expected.");
		
		//deep copy and change sparse block type
		if( !that.isEmptyBlock(false) ) {
			nonZeros = that.nonZeros;
			estimatedNNzsPerRow = that.estimatedNNzsPerRow;
			sparseBlock = SparseBlockFactory
				.copySparseBlock(stype, that.sparseBlock, deep);
		}
	}
| |
	/**
	 * Constructs a dense block around the given dense block handle
	 * (shallow, no copy). Note: nonZeros stays at its default 0 —
	 * the caller is expected to set or recompute the nnz count.
	 *
	 * @param rl number of rows
	 * @param cl number of columns
	 * @param dBlock dense block handle
	 */
	public MatrixBlock(int rl, int cl, DenseBlock dBlock){
		rlen = rl;
		clen = cl;
		sparse = false;
		denseBlock = dBlock;
	}
| |
| //////// |
| // Initialization methods |
| // (reset, init, allocate, etc) |
| |
	/** Resets to an empty block, keeping current dimensions and format. */
	@Override
	public void reset() {
		reset(rlen, clen, sparse, -1, 0);
	}
| |
	/** Resets to an empty rl x cl block, keeping the current format. */
	@Override
	public void reset(int rl, int cl) {
		reset(rl, cl, sparse, -1, 0);
	}
| |
	/** Resets to an empty rl x cl block; format chosen from the nnz estimate. */
	public void reset(int rl, int cl, long estnnz) {
		reset(rl, cl, evalSparseFormatInMemory(rl, cl, estnnz), estnnz, 0);
	}
| |
	/** Resets to an empty rl x cl block with the given format. */
	@Override
	public void reset(int rl, int cl, boolean sp) {
		reset(rl, cl, sp, -1, 0);
	}
| |
	/** Resets to an empty rl x cl block with the given format and nnz estimate. */
	@Override
	public void reset(int rl, int cl, boolean sp, long estnnz) {
		reset(rl, cl, sp, estnnz, 0);
	}
| |
	/** Resets to a dense rl x cl block initialized with the given value. */
	@Override
	public void reset(int rl, int cl, double val) {
		reset(rl, cl, false, -1, val);
	}
| |
	/**
	 * Internal canonical reset of dense and sparse matrix blocks.
	 *
	 * @param rl number of rows
	 * @param cl number of columns
	 * @param sp sparse representation
	 * @param estnnz estimated number of non-zeros
	 * @param val initialization value
	 */
	private void reset(int rl, int cl, boolean sp, long estnnz, double val) {
		//check for valid dimensions
		if( rl < 0 || cl < 0 )
			throw new RuntimeException("Invalid block dimensions: "+rl+" "+cl);
		
		//reset basic meta data
		rlen = rl;
		clen = cl;
		sparse = (val == 0) ? sp : false; //non-zero init value forces dense
		nonZeros = (val == 0) ? 0 : (long)rl*cl; //all cells non-zero if val!=0
		//nnz-per-row estimate for sparse row allocation (uses sparse flag set above)
		estimatedNNzsPerRow = (estnnz < 0 || !sparse) ? -1 :
			(int)Math.ceil((double)estnnz/(double)rlen);
		
		//reset sparse/dense blocks
		if( sparse )
			resetSparse();
		else
			resetDense(val);
	}
| |
| private void resetSparse() { |
| if(sparseBlock == null) |
| return; |
| sparseBlock.reset(estimatedNNzsPerRow, clen); |
| } |
| |
	private void resetDense(double val) {
		//handle to dense block allocation and
		//reset dense block to given value
		if( denseBlock != null )
			denseBlock.reset(rlen, clen, val);
		else if( val != 0 ) {
			//allocate only if a non-zero init value is required
			//(zero-valued blocks remain lazily allocated)
			allocateDenseBlock(false);
			denseBlock.set(val);
		}
	}
| |
| /** |
| * NOTE: This method is designed only for dense representation. |
| * |
| * @param arr 2d double array matrix |
| * @param r number of rows |
| * @param c number of columns |
| */ |
| public void init(double[][] arr, int r, int c) { |
| //input checks |
| if ( sparse ) |
| throw new DMLRuntimeException("MatrixBlockDSM.init() can be invoked only on matrices with dense representation."); |
| if( r*c > rlen*clen ) |
| throw new DMLRuntimeException("MatrixBlockDSM.init() invoked with too large dimensions ("+r+","+c+") vs ("+rlen+","+clen+")"); |
| |
| //allocate or resize dense block |
| allocateDenseBlock(); |
| |
| //copy and compute nnz |
| DenseBlock db = getDenseBlock(); |
| for(int i=0; i < r; i++) |
| System.arraycopy(arr[i], 0, db.values(i), db.pos(i), arr[i].length); |
| recomputeNonZeros(); |
| } |
| |
| /** |
| * NOTE: This method is designed only for dense representation. |
| * |
| * @param arr double array matrix |
| * @param r number of rows |
| * @param c number of columns |
| */ |
| public void init(double[] arr, int r, int c) { |
| //input checks |
| if ( sparse ) |
| throw new DMLRuntimeException("MatrixBlockDSM.init() can be invoked only on matrices with dense representation."); |
| if( r*c > rlen*clen ) |
| throw new DMLRuntimeException("MatrixBlockDSM.init() invoked with too large dimensions ("+r+","+c+") vs ("+rlen+","+clen+")"); |
| |
| //allocate or resize dense block |
| allocateDenseBlock(); |
| |
| //copy and compute nnz (guaranteed single block) |
| System.arraycopy(arr, 0, getDenseBlockValues(), 0, arr.length); |
| recomputeNonZeros(); |
| } |
| |
| public boolean isAllocated() { |
| return sparse ? (sparseBlock!=null) : (denseBlock!=null); |
| } |
| |
	/** Allocates the dense block (clearing the nnz count) and returns this block. */
	public MatrixBlock allocateDenseBlock() {
		allocateDenseBlock( true );
		return this;
	}
| |
	/**
	 * Allocates the dense or sparse block asynchronously on a shared utility
	 * thread pool, falling back to synchronous allocation if no pool exists.
	 *
	 * @return future of this matrix block after allocation
	 */
	public Future<MatrixBlock> allocateBlockAsync() {
		ExecutorService pool = LazyWriteBuffer.getUtilThreadPool();
		return (pool != null) ? pool.submit(() -> allocateBlock()) : //async
			ConcurrentUtils.constantFuture(allocateBlock()); //fallback sync
	}
| |
	/** Allocates the block matching the current format and returns this block. */
	public MatrixBlock allocateBlock() {
		if( sparse )
			allocateSparseRowsBlock();
		else
			allocateDenseBlock();
		return this;
	}
| |
	/**
	 * Allocates the dense block if non-existing or too small
	 * (guaranteed to be 0-initialized) and sets the dense format flag.
	 *
	 * @param clearNNZ if true, reset the nnz count to 0
	 * @return true if the block was newly allocated or reset
	 */
	public boolean allocateDenseBlock(boolean clearNNZ) {
		//allocate block if non-existing or too small (guaranteed to be 0-initialized),
		long limit = (long)rlen * clen;
		boolean reset = (denseBlock == null || denseBlock.capacity() < limit);
		if( denseBlock == null )
			denseBlock = DenseBlockFactory.createDenseBlock(rlen, clen);
		else if( denseBlock.capacity() < limit )
			denseBlock.reset(rlen, clen);
		
		//clear nnz if necessary
		if( clearNNZ )
			nonZeros = 0;
		sparse = false;
		
		return reset;
	}
| |
	/** Allocates the sparse rows block, clearing the nnz count. */
	public boolean allocateSparseRowsBlock() {
		return allocateSparseRowsBlock(true);
	}
| |
| public boolean allocateSparseRowsBlock(boolean clearNNZ) { |
| //allocate block if non-existing or too small (guaranteed to be 0-initialized) |
| //but do not replace existing block even if not in default type |
| boolean reset = sparseBlock == null || sparseBlock.numRows()<rlen; |
| if( reset ) { |
| sparseBlock = SparseBlockFactory |
| .createSparseBlock(DEFAULT_SPARSEBLOCK, rlen); |
| } |
| //clear nnz if necessary |
| if( clearNNZ ) { |
| nonZeros = 0; |
| } |
| return reset; |
| } |
| |
	/**
	 * Allocates a sparse block of the requested type, or resets the
	 * existing block if it already matches type and size.
	 *
	 * @param clearNNZ if true, reset the nnz count to 0
	 * @param stype requested sparse block type
	 */
	public void allocateAndResetSparseBlock(boolean clearNNZ, SparseBlock.Type stype)
	{
		//allocate block if non-existing or too small (guaranteed to be 0-initialized)
		if( sparseBlock == null || sparseBlock.numRows()<rlen
			|| !SparseBlockFactory.isSparseBlockType(sparseBlock, stype)) {
			sparseBlock = SparseBlockFactory.createSparseBlock(stype, rlen);
		}
		else {
			sparseBlock.reset(estimatedNNzsPerRow, clen);
		}
		
		//clear nnz if necessary
		if( clearNNZ ) {
			nonZeros = 0;
		}
	}
| |
| |
	/**
	 * This should be called only in the read and write functions for CP
	 * This function should be called before calling any setValueDenseUnsafe()
	 *
	 * @param rl number of rows
	 * @param cl number of columns
	 */
	public void allocateDenseBlockUnsafe(int rl, int cl) {
		//unsafe: overwrites dims/format without resetting nnz or old data
		sparse=false;
		rlen=rl;
		clen=cl;
		//allocate dense block
		allocateDenseBlock();
	}
| |
| |
	/**
	 * Allows to cleanup all previously allocated sparserows or denseblocks.
	 * This is for example required in reading a matrix with many empty blocks
	 * via distributed cache into in-memory list of blocks - not cleaning blocks
	 * from non-empty blocks would significantly increase the total memory consumption.
	 *
	 * @param dense if true, set dense block to null
	 * @param sparse if true, set sparse block to null
	 */
	public void cleanupBlock( boolean dense, boolean sparse ) {
		if(dense)
			denseBlock = null;
		if(sparse)
			sparseBlock = null;
	}
| |
| //////// |
| // Metadata information |
| |
	/** Returns the number of rows. */
	@Override
	public int getNumRows() {
		return rlen;
	}
| |
	/**
	 * NOTE: setNumRows() and setNumColumns() are used only in ternaryInstruction (for contingency tables)
	 * and pmm for meta corrections.
	 *
	 * @param r number of rows
	 */
	public void setNumRows(int r) {
		rlen = r;
	}
| |
	/** Returns the number of columns. */
	@Override
	public int getNumColumns() {
		return clen;
	}
| |
	/** Sets the number of columns (meta data only; see note on setNumRows). */
	public void setNumColumns(int c) {
		clen = c;
	}
| |
	/** Returns the maintained number of non-zeros. */
	@Override
	public long getNonZeros() {
		return nonZeros;
	}
| |
	/** Sets and returns the number of non-zeros. */
	public long setNonZeros(long nnz) {
		return (nonZeros = nnz);
	}
| |
	/** Returns the sparsity of this block, i.e., nnz relative to cells. */
	public double getSparsity() {
		return OptimizerUtils.getSparsity(rlen, clen, nonZeros);
	}
| |
	/** Returns the data characteristics (dims and nnz; blocksize -1). */
	public DataCharacteristics getDataCharacteristics() {
		return new MatrixCharacteristics(rlen, clen, -1, nonZeros);
	}
| |
	/** Indicates if this block is a row or column vector. */
	public boolean isVector() {
		return (rlen == 1 || clen == 1);
	}
| |
	/** Returns the total number of cells rlen*clen (long, overflow-safe). */
	public long getLength() {
		return (long)rlen * clen;
	}
| |
	/** Indicates if this block is empty (unsafe, relies on maintained nnz). */
	@Override
	public boolean isEmpty() {
		return isEmptyBlock(false);
	}
| |
	/** Indicates if this block is empty (safe, recomputes nnz if needed). */
	public boolean isEmptyBlock() {
		return isEmptyBlock(true);
	}
| |
| |
| public boolean isEmptyBlock(boolean safe) |
| { |
| boolean ret = ( sparse && sparseBlock==null ) || ( !sparse && denseBlock==null ); |
| if( nonZeros==0 ) |
| { |
| //prevent under-estimation |
| if(safe) |
| recomputeNonZeros(); |
| ret = (nonZeros==0); |
| } |
| return ret; |
| } |
| |
| //////// |
| // Data handling |
| |
	/** Returns the dense block handle (may be null). */
	public DenseBlock getDenseBlock() {
		return denseBlock;
	}
| |
| public double[] getDenseBlockValues() { |
| //this method is used as a short-hand for all operations that |
| //guaranteed only deal with dense blocks of a single block. |
| if( denseBlock != null && denseBlock.numBlocks() > 1 ) { |
| throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+") " |
| + "allocated but operation access to first block only, which might cause incorrect results."); |
| } |
| return (denseBlock != null) ? denseBlock.valuesAt(0) : null; |
| } |
| |
	/** Returns the sparse block handle (may be null). */
	public SparseBlock getSparseBlock() {
		return sparseBlock;
	}
| |
	/** Sets the sparse block handle (shallow, no copy). */
	public void setSparseBlock(SparseBlock sblock) {
		sparseBlock = sblock;
	}
| |
| public Iterator<IJV> getSparseBlockIterator() { |
| //check for valid format, should have been checked from outside |
| if( !sparse ) |
| throw new RuntimeException("getSparseBlockInterator should not be called for dense format"); |
| |
| //check for existing sparse block: return empty list |
| if( sparseBlock==null ) |
| return new ArrayList<IJV>().iterator(); |
| |
| //get iterator over sparse block |
| if( rlen == sparseBlock.numRows() ) |
| return sparseBlock.getIterator(); |
| else |
| return sparseBlock.getIterator(rlen); |
| } |
| |
	/**
	 * Returns an iterator over the non-zeros of rows [rl,ru);
	 * an empty iterator if the sparse block is not allocated.
	 *
	 * @param rl row lower bound, inclusive
	 * @param ru row upper bound, exclusive
	 * @return iterator of index-value cells
	 */
	public Iterator<IJV> getSparseBlockIterator(int rl, int ru) {
		//check for valid format, should have been checked from outside
		if( !sparse )
			throw new RuntimeException("getSparseBlockInterator should not be called for dense format");
		
		//check for existing sparse block: return empty list
		if( sparseBlock==null )
			return Collections.emptyListIterator();
		
		//get iterator over sparse block
		return sparseBlock.getIterator(rl, ru);
	}
| |
	/** Returns the value at (r,c) with upper-bounds check (negative indexes not checked here). */
	@Override
	public double getValue(int r, int c)
	{
		//matrix bounds check
		if( r >= rlen || c >= clen )
			throw new RuntimeException("indexes ("+r+","+c+") out of range ("+rlen+","+clen+")");
		
		return quickGetValue(r, c);
	}
| |
	/** Sets the value at (r,c) with upper-bounds check (negative indexes not checked here). */
	@Override
	public void setValue(int r, int c, double v)
	{
		//matrix bounds check
		if( r >= rlen || c >= clen )
			throw new RuntimeException("indexes ("+r+","+c+") out of range ("+rlen+","+clen+")");
		
		quickSetValue(r, c, v);
	}
| |
| public double quickGetValue(int r, int c) { |
| if( sparse && sparseBlock!=null ) |
| return sparseBlock.get(r, c); |
| else if( !sparse && denseBlock!=null ) |
| return denseBlock.get(r, c); |
| return 0; |
| } |
| |
	/**
	 * Sets the value at (r,c) without bounds checks, with allocation
	 * on demand and maintenance of the nnz count.
	 *
	 * @param r row index
	 * @param c column index
	 * @param v value to set
	 */
	public void quickSetValue(int r, int c, double v)
	{
		if(sparse) {
			//early abort (setting 0 into an empty/unallocated row is a no-op)
			if( (sparseBlock==null || sparseBlock.isEmpty(r)) && v==0 )
				return;
			
			//allocation on demand
			allocateSparseRowsBlock(false);
			sparseBlock.allocate(r, estimatedNNzsPerRow, clen);
			
			//set value and maintain nnz (set returns true if the
			//cell structure changed, i.e., inserted or deleted)
			if( sparseBlock.set(r, c, v) )
				nonZeros += (v!=0) ? 1 : -1;
		}
		else {
			//early abort
			if( denseBlock==null && v==0 )
				return;
			
			//allocate and init dense block (w/o overwriting nnz)
			allocateDenseBlock(false);
			
			//set value and maintain nnz
			if( denseBlock.get(r, c)==0 )
				nonZeros++;
			denseBlock.set(r, c, v);
			if( v==0 )
				nonZeros--;
		}
	}
| |
| public double getValueDenseUnsafe(int r, int c) { |
| if(denseBlock==null) |
| return 0; |
| return denseBlock.get(r, c); |
| } |
| |
	/**
	 * Checks if the matrix contains the given pattern value, with fast
	 * meta-data-only paths and early abort on the first match. NaN
	 * patterns are matched via Double.isNaN.
	 *
	 * @param pattern value to search for
	 * @return true if the pattern is contained in the matrix
	 */
	public boolean containsValue(double pattern) {
		//fast paths: infer from meta data only
		if(isEmptyBlock(true))
			return pattern==0;
		if( nonZeros < getLength() && pattern == 0 )
			return true; //at least one zero cell exists
		
		//make a pass over the data to determine if it includes the
		//pattern, with early abort as soon as the pattern is found
		boolean NaNpattern = Double.isNaN(pattern);
		if( isInSparseFormat() ) {
			SparseBlock sb = getSparseBlock();
			for(int i=0; i<rlen; i++) {
				if( sb.isEmpty(i) ) continue;
				int apos = sb.pos(i);
				int alen = sb.size(i);
				double[] avals = sb.values(i);
				for( int j=apos; j<apos+alen; j++ )
					if(avals[j]==pattern || (NaNpattern && Double.isNaN(avals[j])))
						return true;
			}
		}
		else {
			DenseBlock db = getDenseBlock();
			for(int i=0; i<rlen; i++) {
				double[] vals = db.values(i);
				int pos = db.pos(i);
				for(int j=pos; j<pos+clen; j++)
					if(vals[j]==pattern || (NaNpattern && Double.isNaN(vals[j])))
						return true;
			}
		}
		return false;
	}
| |
	/**
	 * Append value is only used when values are appended at the end of each row for the sparse representation
	 * This can only be called, when the caller knows the access pattern of the block
	 *
	 * @param r row
	 * @param c column
	 * @param v value
	 */
	public void appendValue(int r, int c, double v)
	{
		//early abort (append guarantees no overwrite)
		if( v == 0 )
			return;
		
		if( !sparse ) //DENSE
		{
			//allocate on demand (w/o overwriting nnz)
			allocateDenseBlock(false);
			
			//set value and maintain nnz (no zero check needed,
			//append guarantees the cell was not set before)
			denseBlock.set(r, c, v);
			nonZeros++;
		}
		else //SPARSE
		{
			//allocation on demand (w/o overwriting nnz)
			allocateSparseRowsBlock(false);
			sparseBlock.allocate(r, estimatedNNzsPerRow, clen);
			
			//set value and maintain nnz
			sparseBlock.append(r, c, v);
			nonZeros++;
		}
	}
| |
	/** Appends a sparse row at the given row position (deep copy). */
	public void appendRow(int r, SparseRow row) {
		appendRow(r, row, true);
	}
| |
	/**
	 * Appends a sparse row at the given row position and maintains nnz;
	 * in dense format the row values are set cell by cell.
	 *
	 * @param r row position
	 * @param row sparse row to append (null is a no-op)
	 * @param deep if true, deep copy the row (sparse format only)
	 */
	public void appendRow(int r, SparseRow row, boolean deep)
	{
		if(row == null)
			return;
		
		if(sparse) {
			//allocation on demand
			allocateSparseRowsBlock(false);
			sparseBlock.set(r, row, deep);
			nonZeros += row.size();
		}
		else {
			//dense fallback: set individual cells (maintains nnz internally)
			int[] cols = row.indexes();
			double[] vals = row.values();
			for(int i=0; i<row.size(); i++)
				quickSetValue(r, cols[i], vals[i]);
		}
	}
| |
	/** Appends the given block into this sparse block at the given offsets (deep copy). */
	public void appendToSparse( MatrixBlock that, int rowoffset, int coloffset ) {
		appendToSparse(that, rowoffset, coloffset, true);
	}
| |
	/**
	 * Appends all rows of the given block into this sparse block at the
	 * given row/column offsets. Note: nnz maintenance is not performed here.
	 *
	 * @param that source block (sparse or dense)
	 * @param rowoffset target row offset
	 * @param coloffset target column offset
	 * @param deep if true, deep copy single-row appends (MCSR only)
	 */
	public void appendToSparse( MatrixBlock that, int rowoffset, int coloffset, boolean deep )
	{
		if( that==null || that.isEmptyBlock(false) )
			return; //nothing to append
		
		//init sparse rows if necessary
		allocateSparseRowsBlock(false);
		
		//append individual rows
		int m2 = that.rlen;
		for(int i=0; i<m2; i++)
			appendRowToSparse(sparseBlock, that, i, rowoffset, coloffset, deep);
	}
| |
	/**
	 * Appends row i of the source block into the destination sparse block
	 * at the given row/column offsets. Note: nnz maintenance is not
	 * performed here.
	 *
	 * @param dest destination sparse block
	 * @param src source matrix block (sparse or dense)
	 * @param i source row index
	 * @param rowoffset target row offset
	 * @param coloffset target column offset
	 * @param deep if true, deep copy on single-row appends (MCSR only)
	 */
	public void appendRowToSparse( SparseBlock dest, MatrixBlock src, int i, int rowoffset, int coloffset, boolean deep ) {
		if( src.sparse ) //SPARSE <- SPARSE
		{
			SparseBlock a = src.sparseBlock;
			if( a.isEmpty(i) ) return;
			int aix = rowoffset+i;
			
			//single block append (avoid re-allocations)
			if( !dest.isAllocated(aix) && coloffset==0 ) {
				//note: the deep copy flag is only relevant for MCSR due to
				//shallow references of b.get(i); other block formats do not
				//require a redundant copy because b.get(i) created a new row.
				boolean ldeep = (deep && a instanceof SparseBlockMCSR);
				dest.set(aix, a.get(i), ldeep);
			}
			else { //general case
				int pos = a.pos(i);
				int len = a.size(i);
				int[] ix = a.indexes(i);
				double[] val = a.values(i);
				//pre-allocate target row capacity before appending
				if( estimatedNNzsPerRow > 0 )
					dest.allocate(aix, Math.max(estimatedNNzsPerRow, dest.size(aix)+len), clen);
				else
					dest.allocate(aix, dest.size(aix)+len);
				for( int j=pos; j<pos+len; j++ )
					dest.append(aix, coloffset+ix[j], val[j]);
			}
		}
		else //SPARSE <- DENSE
		{
			DenseBlock a = src.getDenseBlock();
			final int n2 = src.clen;
			double[] avals = a.values(i);
			int aix = a.pos(i);
			int cix = rowoffset + i;
			//append only non-zero values of the dense source row
			for( int j=0; j<n2; j++ ) {
				double bval = avals[aix+j];
				if( bval != 0 ) {
					dest.allocate(cix, estimatedNNzsPerRow, clen);
					dest.append(cix, coloffset+j, bval);
				}
			}
		}
	}
| |
| |
| /** |
| * Sorts all existing sparse rows by column indexes. |
| */ |
| public void sortSparseRows() { |
| if( !sparse || sparseBlock==null ) |
| return; |
| sparseBlock.sort(); |
| } |
| |
	/**
	 * Sorts all existing sparse rows in range [rl,ru) by
	 * column indexes.
	 *
	 * @param rl row lower bound, inclusive
	 * @param ru row upper bound, exclusive
	 */
	public void sortSparseRows(int rl, int ru) {
		if( !sparse || sparseBlock==null )
			return;
		for( int i=rl; i<ru; i++ )
			if( !sparseBlock.isEmpty(i) )
				sparseBlock.sort(i);
	}
| |
	/**
	 * Utility function for computing the min non-zero value.
	 * Returns -1 for empty blocks.
	 *
	 * @return minimum non-zero value, or -1 if the block is empty
	 */
	public double minNonZero() {
		//check for empty block and return immediately
		if( isEmptyBlock() )
			return -1;
		
		//NOTE: usually this method is only applied on dense vectors and hence not really tuned yet.
		double min = Double.POSITIVE_INFINITY;
		for( int i=0; i<rlen; i++ )
			for( int j=0; j<clen; j++ ){
				double val = quickGetValue(i, j);
				if( val != 0 )
					min = Math.min(min, val);
			}
		
		return min;
	}
| |
	/**
	 * Wrapper method for reduceall-product of a matrix.
	 *
	 * @return the product sum of the matrix content
	 */
	public double prod() {
		//1x1 output holding the scalar aggregate
		MatrixBlock out = new MatrixBlock(1, 1, false);
		LibMatrixAgg.aggregateUnaryMatrix(this, out,
			InstructionUtils.parseBasicAggregateUnaryOperator("ua*", 1));
		return out.quickGetValue(0, 0);
	}
| |
	/**
	 * Wrapper method for reduceall-mean of a matrix.
	 *
	 * @return the mean value of all values in the matrix
	 */
	public double mean() {
		//1x3 output: the mean at (0,0) plus auxiliary aggregate columns
		MatrixBlock out = new MatrixBlock(1, 3, false);
		LibMatrixAgg.aggregateUnaryMatrix(this, out,
			InstructionUtils.parseBasicAggregateUnaryOperator("uamean", 1));
		return out.quickGetValue(0, 0);
	}
| |
	/**
	 * Wrapper method for reduceall-min of a matrix.
	 *
	 * @return the minimum value of all values in the matrix
	 */
	public double min() {
		MatrixBlock out = new MatrixBlock(1, 1, false);
		LibMatrixAgg.aggregateUnaryMatrix(this, out,
			InstructionUtils.parseBasicAggregateUnaryOperator("uamin", 1));
		return out.quickGetValue(0, 0);
	}
| |
	/**
	 * Wrapper method for reduceall-max of a matrix.
	 *
	 * @return the maximum value of all values in the matrix
	 */
	public double max() {
		MatrixBlock out = new MatrixBlock(1, 1, false);
		LibMatrixAgg.aggregateUnaryMatrix(this, out,
			InstructionUtils.parseBasicAggregateUnaryOperator("uamax", 1));
		return out.quickGetValue(0, 0);
	}
| |
	/**
	 * Wrapper method for reduceall-sum of a matrix
	 * (numerically stable Kahan summation).
	 *
	 * @return Sum of the values in the matrix.
	 */
	public double sum() {
		KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
		return sumWithFn(kplus);
	}
| |
	/**
	 * Wrapper method for reduceall-sumSq of a matrix
	 * (numerically stable Kahan summation of squares).
	 *
	 * @return Sum of the squared values in the matrix.
	 */
	public double sumSq() {
		KahanPlusSq kplusSq = KahanPlusSq.getKahanPlusSqFnObject();
		return sumWithFn(kplusSq);
	}
| |
	/**
	 * Wrapper method for reduceall-sum of a matrix using the given
	 * Kahan function for summation.
	 *
	 * @param kfunc A Kahan function object to use for summation.
	 * @return Sum of the values in the matrix with the given
	 *         function applied.
	 */
	private double sumWithFn(KahanFunction kfunc) {
		//construct operator
		CorrectionLocationType corrLoc = CorrectionLocationType.LASTCOLUMN;
		ReduceAll reduceAllObj = ReduceAll.getReduceAllFnObject();
		AggregateOperator aop = new AggregateOperator(0, kfunc, corrLoc);
		AggregateUnaryOperator auop = new AggregateUnaryOperator(aop, reduceAllObj);
		//execute operation (1x2 output: sum at (0,0), correction in last column)
		MatrixBlock out = new MatrixBlock(1, 2, false);
		LibMatrixAgg.aggregateUnaryMatrix(this, out, auop);
		return out.quickGetValue(0, 0);
	}
| |
| //////// |
| // sparsity handling functions |
| |
	/**
	 * Returns the current representation (true for sparse).
	 *
	 * @return true if sparse
	 */
	@Override
	public boolean isInSparseFormat() {
		return sparse;
	}
| |
	/** Indicates if this block is ultra-sparse (including the absolute nnz check). */
	public boolean isUltraSparse() {
		return isUltraSparse(true);
	}
| |
	/**
	 * Indicates if this block is ultra-sparse, i.e., in sparse format
	 * with sparsity below ULTRA_SPARSITY_TURN_POINT (and optionally
	 * fewer than ULTRA_SPARSE_BLOCK_NNZ non-zeros).
	 *
	 * @param checkNnz if true, additionally apply the absolute nnz threshold
	 * @return true if ultra-sparse
	 */
	public boolean isUltraSparse(boolean checkNnz) {
		double sp = ((double)nonZeros/rlen)/clen;
		//check for sparse representation in order to account for vectors in dense
		return sparse && sp<ULTRA_SPARSITY_TURN_POINT
			&& (!checkNnz || nonZeros<ULTRA_SPARSE_BLOCK_NNZ);
	}
| |
| public boolean isSparsePermutationMatrix() { |
| if( !isInSparseFormat() || nonZeros > rlen ) |
| return false; |
| SparseBlock sblock = getSparseBlock(); |
| boolean isPM = (sblock != null); |
| for( int i=0; i<rlen & isPM; i++ ) |
| isPM &= sblock.isEmpty(i) || sblock.size(i) == 1; |
| return isPM; |
| } |
| |
	//indicates if this block qualifies for ultra-sparse serialization,
	//i.e., fewer non-zeros than rows and a sparse serialization target
	private boolean isUltraSparseSerialize(boolean sparseDst) {
		return nonZeros<rlen && sparseDst;
	}
| |
	/**
	 * Evaluates if this matrix block should be in sparse format in
	 * memory. Note that this call does not change the representation -
	 * for this please call examSparsity.
	 *
	 * @return true if matrix block should be in sparse format in memory
	 */
	public boolean evalSparseFormatInMemory() {
		//ensure exact size estimates for write
		if( nonZeros<=0 )
			recomputeNonZeros();
		
		//decide on in-memory representation
		return evalSparseFormatInMemory(rlen, clen, nonZeros);
	}
| |
	//evaluates the in-memory format for this block, optionally with
	//transposed dimensions (e.g., for transpose-related operations)
	@SuppressWarnings("unused")
	private boolean evalSparseFormatInMemory(boolean transpose)
	{
		int lrlen = (transpose) ? clen : rlen;
		int lclen = (transpose) ? rlen : clen;
		long lnonZeros = nonZeros;
		
		//ensure exact size estimates for write
		if( lnonZeros<=0 ) {
			recomputeNonZeros();
			lnonZeros = nonZeros;
		}
		
		//decide on in-memory representation
		return evalSparseFormatInMemory(lrlen, lclen, lnonZeros);
	}
| |
| /** |
| * Evaluates if this matrix block should be in sparse format on |
| * disk. This applies to any serialized matrix representation, i.e., |
| * when writing to in-memory buffer pool pages or writing to local fs |
| * or hdfs. |
| * |
| * @return true if matrix block should be in sparse format on disk |
| */ |
| public boolean evalSparseFormatOnDisk() { |
| //ensure exact size estimates for write |
| if( nonZeros <= 0 ) |
| recomputeNonZeros(); |
| |
| //decide on in-memory representation |
| return evalSparseFormatOnDisk(rlen, clen, nonZeros); |
| } |
| |
| public void examSparsity() { |
| examSparsity(true); |
| } |
| |
| /** |
| * Evaluates if this matrix block should be in sparse format in |
| * memory. Depending on the current representation, the state of the |
| * matrix block is changed to the right representation if necessary. |
| * Note that this consumes for the time of execution memory for both |
| * representations. |
| * |
| * @param allowCSR allow CSR format on dense to sparse conversion |
| */ |
| public void examSparsity(boolean allowCSR) { |
| //determine target representation |
| boolean sparseDst = evalSparseFormatInMemory(); |
| |
| //check for empty blocks (e.g., sparse-sparse) |
| if( isEmptyBlock(false) ) { |
| cleanupBlock(true, true); |
| } |
| |
| //change representation if required (also done for |
| //empty blocks in order to set representation flags) |
| if( sparse && !sparseDst) |
| sparseToDense(); |
| else if( !sparse && sparseDst ) |
| denseToSparse(allowCSR); |
| } |
| |
| public static boolean evalSparseFormatInMemory(DataCharacteristics dc) { |
| return evalSparseFormatInMemory(dc.getRows(), dc.getCols(), dc.getNonZeros()); |
| } |
| |
| |
| /** |
| * Evaluates if a matrix block with the given characteristics should be in sparse format |
| * in memory. |
| * |
| * @param nrows number of rows |
| * @param ncols number of columns |
| * @param nnz number of non-zeros |
| * @return true if matrix block shold be in sparse format in memory |
| */ |
| public static boolean evalSparseFormatInMemory( final long nrows, final long ncols, final long nnz ) |
| { |
| //evaluate sparsity threshold |
| double lsparsity = (double)nnz/nrows/ncols; |
| boolean lsparse = (lsparsity < SPARSITY_TURN_POINT); |
| |
| //compare size of sparse and dense representation in order to prevent |
| //that the sparse size exceed the dense size since we use the dense size |
| //as worst-case estimate if unknown (and it requires less io from |
| //main memory). |
| double sizeSparse = estimateSizeSparseInMemory(nrows, ncols, lsparsity); |
| double sizeDense = estimateSizeDenseInMemory(nrows, ncols); |
| |
| return lsparse && (sizeSparse<sizeDense); |
| } |
| |
| /** |
| * Evaluates if a matrix block with the given characteristics should be in sparse format |
| * on disk (or in any other serialized representation). |
| * |
| * @param nrows number of rows |
| * @param ncols number of columns |
| * @param nnz number of non-zeros |
| * @return true if matrix block shold be in sparse format on disk |
| */ |
| public static boolean evalSparseFormatOnDisk( final long nrows, final long ncols, final long nnz ) |
| { |
| //evaluate sparsity threshold |
| double lsparsity = ((double)nnz/nrows)/ncols; |
| boolean lsparse = (lsparsity < SPARSITY_TURN_POINT); |
| |
| double sizeUltraSparse = estimateSizeUltraSparseOnDisk( nrows, ncols, nnz ); |
| double sizeSparse = estimateSizeSparseOnDisk(nrows, ncols, nnz); |
| double sizeDense = estimateSizeDenseOnDisk(nrows, ncols); |
| |
| return lsparse && (sizeSparse<sizeDense || sizeUltraSparse<sizeDense); |
| } |
| |
| |
| //////// |
| // basic block handling functions |
| |
| private void denseToSparse() { |
| denseToSparse(true); |
| } |
| |
	/**
	 * Converts this block from dense to sparse representation. If allowed
	 * and the nnz fits into an int, the target is the memory-efficient CSR
	 * format constructed in a single pass; otherwise the block falls back
	 * to the row-oriented MCSR format.
	 *
	 * @param allowCSR allow CSR as the target sparse format
	 */
	private void denseToSparse(boolean allowCSR)
	{
		DenseBlock a = getDenseBlock();
		
		//set target representation, early abort on empty blocks
		sparse = true;
		if( a == null )
			return;
		
		final int m = rlen;
		final int n = clen;
		
		if( allowCSR && nonZeros <= Integer.MAX_VALUE ) {
			//allocate target in memory-efficient CSR format
			//NOTE(review): assumes the maintained nonZeros count is exact,
			//as the arrays are sized to it - TODO confirm invariant holds here
			int lnnz = (int) nonZeros;
			int[] rptr = new int[m+1];
			int[] indexes = new int[lnnz];
			double[] values = new double[lnnz];
			//single pass over the dense block: append non-zeros in row-major
			//order and maintain the CSR row pointers as we go
			for( int i=0, pos=0; i<m; i++ ) {
				double[] avals = a.values(i);
				int aix = a.pos(i);
				for(int j=0; j<n; j++) {
					double aval = avals[aix+j];
					if( aval != 0 ) {
						indexes[pos] = j;
						values[pos] = aval;
						pos++;
					}
				}
				rptr[i+1]=pos;
			}
			sparseBlock = new SparseBlockCSR(
				rptr, indexes, values, lnnz);
		}
		else {
			//fallback to less-memory efficient MCSR format,
			//which however allows much larger sparse matrices
			if( !allocateSparseRowsBlock() )
				reset(); //reset if not allocated
			SparseBlock sblock = sparseBlock;
			for( int i=0; i<m; i++ ) {
				double[] avals = a.values(i);
				int aix = a.pos(i);
				//compute nnz per row (not via recomputeNonZeros as sparse allocated)
				int lnnz = UtilFunctions.computeNnz(avals, aix, clen);
				if( lnnz <= 0 ) continue;
				//allocate sparse row and append non-zero values
				sblock.allocate(i, lnnz);
				for( int j=0; j<n; j++ )
					sblock.append(i, j, avals[aix+j]);
			}
		}
		
		//update nnz and cleanup dense block
		denseBlock = null;
	}
| |
	/**
	 * Converts this block from sparse to dense representation, scattering
	 * all non-zero values into a newly allocated (or reset) dense block.
	 */
	public void sparseToDense() {
		//set target representation, early abort on empty blocks
		sparse = false;
		if(sparseBlock==null)
			return;
		
		//NOTE(review): rlen*clen may overflow int and wrap to a positive
		//value, which this check would not catch - confirm callers' ranges
		int limit=rlen*clen;
		if ( limit < 0 ) {
			throw new DMLRuntimeException("Unexpected error in sparseToDense().. limit < 0: " + rlen + ", " + clen + ", " + limit);
		}
		
		//allocate dense target block, but keep nnz (no need to maintain)
		if( !allocateDenseBlock(false) )
			denseBlock.reset();
		
		//copy sparse to dense
		SparseBlock a = sparseBlock;
		DenseBlock c = getDenseBlock();
		
		for( int i=0; i<rlen; i++ ) {
			if( a.isEmpty(i) ) continue;
			int apos = a.pos(i);
			int alen = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			double[] cvals = c.values(i);
			int cix = c.pos(i);
			//skip explicit zeros so the reset dense values stay untouched
			for( int j=apos; j<apos+alen; j++ )
				if( avals[j] != 0 )
					cvals[cix+aix[j]] = avals[j];
		}
		
		//cleanup sparse rows
		sparseBlock = null;
	}
| |
| /** |
| * Recomputes and materializes the number of non-zero values |
| * of the entire matrix block. |
| * |
| * @return number of non-zeros |
| */ |
| public long recomputeNonZeros() { |
| if( sparse && sparseBlock!=null ) { //SPARSE |
| //note: rlen might be <= sparseBlock.numRows() |
| nonZeros = sparseBlock.size(0, sparseBlock.numRows()); |
| } |
| else if( !sparse && denseBlock!=null ) { //DENSE |
| nonZeros = denseBlock.countNonZeros(); |
| } |
| return nonZeros; |
| } |
| |
| public long recomputeNonZeros(int rl, int ru) { |
| return recomputeNonZeros(rl, ru, 0, clen-1); |
| } |
| |
| /** |
| * Recomputes the number of non-zero values of a specified |
| * range of the matrix block. NOTE: This call does not materialize |
| * the compute result in any form. |
| * |
| * @param rl row lower index, 0-based, inclusive |
| * @param ru row upper index, 0-based, inclusive |
| * @param cl column lower index, 0-based, inclusive |
| * @param cu column upper index, 0-based, inclusive |
| * @return the number of non-zero values |
| */ |
| public long recomputeNonZeros(int rl, int ru, int cl, int cu) |
| { |
| if( sparse && sparseBlock!=null ) //SPARSE |
| { |
| long nnz = 0; |
| if( cl==0 && cu==clen-1 ) { //specific case: all cols |
| nnz = sparseBlock.size(rl, ru+1); |
| } |
| else if( cl==cu ) { //specific case: one column |
| final int rlimit = Math.min( ru+1, rlen); |
| for(int i=rl; i<rlimit; i++) |
| if(!sparseBlock.isEmpty(i)) |
| nnz += (sparseBlock.get(i, cl)!=0) ? 1 : 0; |
| } |
| else { //general case |
| nnz = sparseBlock.size(rl, ru+1, cl, cu+1); |
| } |
| return nnz; |
| } |
| else if( !sparse && denseBlock!=null ) { //DENSE |
| return denseBlock.countNonZeros(rl, ru+1, cl, cu+1); |
| } |
| |
| return 0; //empty block |
| } |
| |
| /** |
| * Basic debugging primitive to check correctness of nnz. |
| * This method is not intended for production use. |
| */ |
| public void checkNonZeros() { |
| //take non-zeros before and after recompute nnz |
| long nnzBefore = getNonZeros(); |
| recomputeNonZeros(); |
| long nnzAfter = getNonZeros(); |
| |
| //raise exception if non-zeros don't match up |
| if( nnzBefore != nnzAfter ) |
| throw new RuntimeException("Number of non zeros incorrect: "+nnzBefore+" vs "+nnzAfter); |
| } |
| |
| public void checkSparseRows() { |
| checkSparseRows(0, rlen); |
| } |
| |
| /** |
| * Basic debugging primitive to check sparse block column ordering. |
| * This method is not intended for production use. |
| * |
| * @param rl row lower bound (inclusive) |
| * @param ru row upper bound (exclusive) |
| */ |
| public void checkSparseRows(int rl, int ru) { |
| if( !sparse || sparseBlock == null ) |
| return; |
| |
| //check ordering of column indexes per sparse row |
| for( int i=rl; i<ru; i++ ) |
| if( !sparseBlock.isEmpty(i) ) { |
| int apos = sparseBlock.pos(i); |
| int alen = sparseBlock.size(i); |
| int[] aix = sparseBlock.indexes(i); |
| double[] avals = sparseBlock.values(i); |
| for( int k=apos+1; k<apos+alen; k++ ) |
| if( aix[k-1] >= aix[k] ) |
| throw new RuntimeException("Wrong sparse row ordering: "+k+" "+aix[k-1]+" "+aix[k]); |
| for( int k=apos; k<apos+alen; k++ ) |
| if( avals[k] == 0 ) |
| throw new RuntimeException("Wrong sparse row: zero at "+k); |
| } |
| } |
| |
| @Override |
| public void copy(MatrixValue thatValue) { |
| MatrixBlock that = checkType(thatValue); |
| //copy into automatically determined representation |
| copy( that, that.evalSparseFormatInMemory() ); |
| } |
| |
| @Override |
| public void copy(MatrixValue thatValue, boolean sp) |
| { |
| MatrixBlock that = checkType(thatValue); |
| |
| if( this == that ) //prevent data loss (e.g., on sparse-dense conversion) |
| throw new RuntimeException( "Copy must not overwrite itself!" ); |
| |
| this.rlen=that.rlen; |
| this.clen=that.clen; |
| this.sparse=sp; |
| estimatedNNzsPerRow=(int)Math.ceil((double)thatValue.getNonZeros()/(double)rlen); |
| if(this.sparse && that.sparse) |
| copySparseToSparse(that); |
| else if(this.sparse && !that.sparse) |
| copyDenseToSparse(that); |
| else if(!this.sparse && that.sparse) |
| copySparseToDense(that); |
| else |
| copyDenseToDense(that); |
| } |
| |
| public MatrixBlock copyShallow(MatrixBlock that) { |
| rlen = that.rlen; |
| clen = that.clen; |
| nonZeros = that.nonZeros; |
| sparse = that.sparse; |
| if( !sparse ) |
| denseBlock = that.denseBlock; |
| else |
| sparseBlock = that.sparseBlock; |
| return this; |
| } |
| |
	/**
	 * Full copy of a sparse source into this sparse block, row by row,
	 * deep-copying non-empty source rows and resetting stale target rows.
	 *
	 * @param that sparse source block
	 */
	private void copySparseToSparse(MatrixBlock that) {
		this.nonZeros=that.nonZeros;
		//handle empty source via plain sparse reset
		if( that.isEmptyBlock(false) ) {
			resetSparse();
			return;
		}
		
		allocateSparseRowsBlock(false);
		for(int i=0; i<Math.min(that.sparseBlock.numRows(), rlen); i++) {
			if(!that.sparseBlock.isEmpty(i)) {
				//deep copy of the source row (last arg = deep)
				sparseBlock.set(i, that.sparseBlock.get(i), true);
			}
			else if(!this.sparseBlock.isEmpty(i)) {
				//clear stale target row left over from a previous use
				this.sparseBlock.reset(i,estimatedNNzsPerRow, clen);
			}
		}
	}
| |
	/**
	 * Full copy of a dense source into this dense block.
	 *
	 * @param that dense source block
	 */
	private void copyDenseToDense(MatrixBlock that) {
		nonZeros = that.nonZeros;
		
		//plain reset to 0 for empty input
		if( that.isEmptyBlock(false) ) {
			if(denseBlock!=null)
				denseBlock.reset(rlen, clen);
			return;
		}
		
		//allocate and copy dense block
		allocateDenseBlock(false);
		denseBlock.set(that.denseBlock);
	}
| |
	/**
	 * Full copy of a sparse source into this dense block, scattering the
	 * non-zero values of each sparse row into the dense rows.
	 *
	 * @param that sparse source block
	 */
	private void copySparseToDense(MatrixBlock that) {
		this.nonZeros=that.nonZeros;
		//plain reset to 0 for empty input
		if( that.isEmptyBlock(false) ) {
			if(denseBlock!=null)
				denseBlock.reset(rlen, clen);
			return;
		}
		
		//allocate and init dense block (w/o overwriting nnz)
		allocateDenseBlock(false);
		SparseBlock a = that.getSparseBlock();
		DenseBlock c = getDenseBlock();
		for( int i=0; i<Math.min(a.numRows(), rlen); i++ ) {
			if( a.isEmpty(i) ) continue;
			int pos = a.pos(i);
			int len = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			double[] cvals = c.values(i);
			int cix = c.pos(i);
			//scatter sparse values into the dense row
			for( int j=pos; j<pos+len; j++ )
				cvals[cix+aix[j]] = avals[j];
		}
	}
| |
	/**
	 * Full copy of a dense source into this sparse block, appending only
	 * the non-zero values and allocating sparse rows lazily.
	 *
	 * @param that dense source block
	 */
	private void copyDenseToSparse(MatrixBlock that) {
		nonZeros = that.nonZeros;
		//handle empty source via plain sparse reset
		if( that.isEmptyBlock(false) ) {
			resetSparse();
			return;
		}
		
		if( !allocateSparseRowsBlock(false) )
			resetSparse();
		DenseBlock a = that.getDenseBlock();
		SparseBlock c = getSparseBlock();
		for(int i=0; i<rlen; i++) {
			double[] avals = a.values(i);
			int aix = a.pos(i);
			for( int j=0; j<clen; j++ ) {
				double val = avals[aix+j];
				if( val == 0 ) continue;
				//create sparse row only if required
				c.allocate(i, estimatedNNzsPerRow, clen);
				c.append(i, j, val);
			}
		}
	}
| |
| |
| /** |
| * In-place copy of matrix src into the index range of the existing current matrix. |
| * Note that removal of existing nnz in the index range and nnz maintenance is |
| * only done if 'awareDestNZ=true', |
| * |
| * @param rl row lower index, 0-based |
| * @param ru row upper index, 0-based |
| * @param cl column lower index, 0-based |
| * @param cu column upper index, 0-based |
| * @param src matrix block |
| * @param awareDestNZ |
| * true, forces (1) to remove existing non-zeros in the index range of the |
| * destination if not present in src and (2) to internally maintain nnz |
| * false, assume empty index range in destination and do not maintain nnz |
| * (the invoker is responsible to recompute nnz after all copies are done) |
| */ |
| public void copy(int rl, int ru, int cl, int cu, MatrixBlock src, boolean awareDestNZ ) { |
| if(sparse && src.sparse) |
| copySparseToSparse(rl, ru, cl, cu, src, awareDestNZ); |
| else if(sparse && !src.sparse) |
| copyDenseToSparse(rl, ru, cl, cu, src, awareDestNZ); |
| else if(!sparse && src.sparse) |
| copySparseToDense(rl, ru, cl, cu, src, awareDestNZ); |
| else |
| copyDenseToDense(rl, ru, cl, cu, src, awareDestNZ); |
| } |
| |
	/**
	 * Copies a sparse source block into the index range of this sparse
	 * block, optionally clearing existing destination non-zeros and
	 * maintaining the nnz count.
	 *
	 * @param rl row lower index, 0-based
	 * @param ru row upper index, 0-based
	 * @param cl column lower index, 0-based
	 * @param cu column upper index, 0-based
	 * @param src sparse source block
	 * @param awareDestNZ true to clear destination range and maintain nnz
	 */
	private void copySparseToSparse(int rl, int ru, int cl, int cu, MatrixBlock src, boolean awareDestNZ) {
		//handle empty src and dest
		if( src.isEmptyBlock(false) ) {
			if( awareDestNZ && sparseBlock != null )
				copyEmptyToSparse(rl, ru, cl, cu, true);
			return;
		}
		
		allocateSparseRowsBlock(false);
		
		//copy values
		SparseBlock a = src.sparseBlock;
		SparseBlock b = sparseBlock;
		for( int i=0; i<src.rlen; i++ ) {
			if( a.isEmpty(i) ) {
				//clear the destination row range for an empty source row
				copyEmptyToSparse(rl+i, rl+i, cl, cu, true);
				continue;
			}
			int apos = a.pos(i);
			int alen = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			//copy row into empty target row
			if( b.isEmpty(rl+i) ) {
				if( cl == 0 ) { //no index offset needed
					appendRow(rl+i, a.get(i), false);
					nonZeros -= alen; //avoid nnz corruption
				}
				else {
					b.allocate(rl+i, alen);
					b.setIndexRange(rl+i, cl, cu+1, avals, aix, apos, alen);
				}
				nonZeros += awareDestNZ ? alen : 0;
			}
			//insert row into non-empty target row
			else { //general case
				//maintain nnz via the row-size delta around the range update
				int lnnz = b.size(rl+i);
				b.setIndexRange(rl+i, cl, cu+1, avals, aix, apos, alen);
				nonZeros += awareDestNZ ? (b.size(rl+i) - lnnz) : 0;
			}
		}
	}
| |
	/**
	 * Copies a sparse source block into the index range of this dense
	 * block, optionally zeroing the destination range and maintaining nnz.
	 *
	 * @param rl row lower index, 0-based
	 * @param ru row upper index, 0-based
	 * @param cl column lower index, 0-based
	 * @param cu column upper index, 0-based
	 * @param src sparse source block
	 * @param awareDestNZ true to clear destination range and maintain nnz
	 */
	private void copySparseToDense(int rl, int ru, int cl, int cu, MatrixBlock src, boolean awareDestNZ) {
		//handle empty src and dest
		if( src.isEmptyBlock(false) ) {
			if( awareDestNZ && denseBlock != null ) {
				nonZeros -= recomputeNonZeros(rl, ru, cl, cu);
				denseBlock.set(rl, ru+1, cl, cu+1, 0);
			}
			return;
		}
		if(denseBlock==null)
			allocateDenseBlock();
		else if( awareDestNZ ) {
			//subtract old range nnz and zero the destination range
			nonZeros -= recomputeNonZeros(rl, ru, cl, cu);
			denseBlock.set(rl, ru+1, cl, cu+1, 0);
		}
		
		//copy values
		SparseBlock a = src.sparseBlock;
		DenseBlock c = getDenseBlock();
		for( int i=0; i<src.rlen; i++ ) {
			if( a.isEmpty(i) ) continue;
			int apos = a.pos(i);
			int alen = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			double[] cvals = c.values(rl+i);
			int cix = c.pos(rl+i, cl);
			//scatter sparse values into the (column-shifted) dense row
			for( int j=apos; j<apos+alen; j++ )
				cvals[cix+aix[j]] = avals[j];
			nonZeros += awareDestNZ ? alen : 0;
		}
	}
| |
	/**
	 * Copies a dense source block into the index range of this sparse
	 * block, with a fast path for appending into empty MCSR rows and
	 * optional nnz maintenance.
	 *
	 * @param rl row lower index, 0-based
	 * @param ru row upper index, 0-based
	 * @param cl column lower index, 0-based
	 * @param cu column upper index, 0-based
	 * @param src dense source block
	 * @param awareDestNZ true to clear destination range and maintain nnz
	 */
	private void copyDenseToSparse(int rl, int ru, int cl, int cu, MatrixBlock src, boolean awareDestNZ)
	{
		//handle empty src and dest
		if( src.isEmptyBlock(false) ) {
			if( awareDestNZ && sparseBlock != null )
				copyEmptyToSparse(rl, ru, cl, cu, true);
			return;
		}
		
		//allocate output block
		//no need to clear for awareDestNZ since overwritten
		allocateSparseRowsBlock(false);
		
		//copy values
		DenseBlock a = src.getDenseBlock();
		SparseBlock c = getSparseBlock();
		for( int i=0; i<src.rlen; i++ )
		{
			int rix = rl + i;
			double[] avals = a.values(i);
			int aix = a.pos(i);
			if( c instanceof SparseBlockMCSR
				&& c.isEmpty(rix) ) //special case MCSR append
			{
				//count nnz per row (fits likely in L1 cache)
				int lnnz = UtilFunctions.computeNnz(avals, aix, src.clen);
				
				//allocate row once and copy values
				if( lnnz > 0 ) {
					c.allocate(rix, lnnz);
					for( int j=0; j<src.clen; j++ ) {
						double val = avals[aix+j];
						if( val != 0 )
							c.append(rix, cl+j, val);
					}
					if( awareDestNZ )
						nonZeros += lnnz;
				}
			}
			else if( awareDestNZ ) { //general case (w/ awareness NNZ)
				//maintain nnz via the row-size delta around the update
				int lnnz = c.size(rix);
				if( cl==cu ) {
					double val = avals[aix];
					c.set(rix, cl, val);
				}
				else {
					c.setIndexRange(rix, cl, cu+1, avals, aix, src.clen);
				}
				nonZeros += (c.size(rix) - lnnz);
			}
			else { //general case (w/o awareness NNZ)
				for( int j=0; j<src.clen; j++ ) {
					double val = avals[aix+j];
					if( val != 0 )
						c.set(rix, cl+j, val);
				}
			}
		}
	}
| |
	/**
	 * Copies a dense source block into the index range of this dense
	 * block, optionally maintaining the nnz count.
	 *
	 * @param rl row lower index, 0-based
	 * @param ru row upper index, 0-based
	 * @param cl column lower index, 0-based
	 * @param cu column upper index, 0-based
	 * @param src dense source block
	 * @param awareDestNZ true to clear destination range and maintain nnz
	 */
	private void copyDenseToDense(int rl, int ru, int cl, int cu, MatrixBlock src, boolean awareDestNZ) {
		//handle empty src and dest
		if( src.isEmptyBlock(false) ) {
			if( awareDestNZ && denseBlock != null ) {
				nonZeros -= recomputeNonZeros(rl, ru, cl, cu);
				denseBlock.set(rl, ru+1, cl, cu+1, 0);
			}
			return;
		}
		
		//allocate output block
		//no need to clear for awareDestNZ since overwritten
		allocateDenseBlock(false);
		
		//maintain nnz unless both blocks are fully dense (then nnz is unchanged)
		if( awareDestNZ && (nonZeros!=getLength() || src.nonZeros!=src.getLength()) )
			nonZeros = nonZeros - recomputeNonZeros(rl, ru, cl, cu) + src.nonZeros;
		
		//copy values
		DenseBlock a = src.getDenseBlock();
		DenseBlock c = getDenseBlock();
		c.set(rl, ru+1, cl, cu+1, a);
	}
| |
	/**
	 * Deletes (sets to zero) the given index range in this sparse block,
	 * optionally maintaining the number of non-zeros.
	 *
	 * @param rl row lower index, 0-based, inclusive
	 * @param ru row upper index, 0-based, inclusive
	 * @param cl column lower index, 0-based, inclusive
	 * @param cu column upper index, 0-based, inclusive
	 * @param updateNNZ if true, maintain the nnz count of this block
	 */
	private void copyEmptyToSparse(int rl, int ru, int cl, int cu, boolean updateNNZ ) {
		SparseBlock a = sparseBlock;
		if( cl==cu ) { //specific case: column vector
			for( int i=rl; i<=ru; i++ )
				if( !a.isEmpty(i) ) {
					//set returns true if an existing entry was removed
					boolean update = a.set(i, cl, 0);
					if( updateNNZ )
						nonZeros -= update ? 1 : 0;
				}
		}
		else {
			for( int i=rl; i<=ru; i++ )
				if( !a.isEmpty(i) ) {
					//maintain nnz via the row-size delta around the delete
					int lnnz = a.size(i);
					a.deleteIndexRange(i, cl, cu+1);
					if( updateNNZ )
						nonZeros += (a.size(i)-lnnz);
				}
		}
	}
| |
| @Override |
| public void merge(CacheBlock that, boolean appendOnly) { |
| merge((MatrixBlock)that, appendOnly); |
| } |
| |
| /** |
| * Merge disjoint: merges all non-zero values of the given input into the current |
| * matrix block. Note that this method does NOT check for overlapping entries; |
| * it's the callers reponsibility of ensuring disjoint matrix blocks. |
| * |
| * The appendOnly parameter is only relevant for sparse target blocks; if true, |
| * we only append values and do not sort sparse rows for each call; this is useful |
| * whenever we merge iterators of matrix blocks into one target block. |
| * |
| * @param that matrix block |
| * @param appendOnly ? |
| */ |
| public void merge(MatrixBlock that, boolean appendOnly) { |
| merge(that, appendOnly, false, true); |
| } |
| |
| public void merge(MatrixBlock that, boolean appendOnly, boolean par) { |
| merge(that, appendOnly, par, true); |
| } |
| |
	/**
	 * Merge disjoint: merges all non-zero values of the given input into this
	 * block, with optional parallel merge for dense targets and optional deep
	 * copy of sparse source rows.
	 *
	 * @param that matrix block to merge (must match this block's dimensions)
	 * @param appendOnly if true, append sparse values without per-row sorting
	 * @param par if true, use a parallel merge for dense targets
	 * @param deep if true, deep-copy sparse source rows set into empty target rows
	 */
	public void merge(MatrixBlock that, boolean appendOnly, boolean par, boolean deep) {
		//check for empty input source (nothing to merge)
		if( that == null || that.isEmptyBlock(false) )
			return;
		
		//check dimensions (before potentially copy to prevent implicit dimension change)
		//this also does a best effort check for disjoint input blocks via the number of non-zeros
		if( rlen != that.rlen || clen != that.clen )
			throw new DMLRuntimeException("Dimension mismatch on merge disjoint (target="+rlen+"x"+clen+", source="+that.rlen+"x"+that.clen+")");
		if( nonZeros+that.nonZeros > (long)rlen*clen )
			throw new DMLRuntimeException("Number of non-zeros mismatch on merge disjoint (target="+rlen+"x"+clen+", nnz target="+nonZeros+", nnz source="+that.nonZeros+")");
		
		//check for empty target (copy in full)
		if( isEmptyBlock(false) && !(!sparse && isAllocated()) ) {
			copy(that);
			return;
		}
		
		//core matrix block merge (guaranteed non-empty source/target, nnz maintenance not required)
		//capture nnz before merge; disjointness makes the sum exact
		long nnz = nonZeros + that.nonZeros;
		if( sparse )
			mergeIntoSparse(that, appendOnly, deep);
		else if( par )
			mergeIntoDensePar(that);
		else
			mergeIntoDense(that);
		
		//maintain number of nonzeros
		nonZeros = nnz;
	}
| |
	/**
	 * Sequentially merges the non-zero values of the given block into
	 * this dense block (source may be sparse or dense).
	 *
	 * @param that source block to merge
	 */
	private void mergeIntoDense(MatrixBlock that) {
		DenseBlock a = getDenseBlock();
		if( that.sparse ) { //DENSE <- SPARSE
			SparseBlock b = that.sparseBlock;
			int m = rlen;
			for( int i=0; i<m; i++ ) {
				if( b.isEmpty(i) ) continue;
				int bpos = b.pos(i);
				int blen = b.size(i);
				int[] bix = b.indexes(i);
				double[] avals = a.values(i);
				double[] bvals = b.values(i);
				int aix = a.pos(i);
				//overwrite target cells with non-zero source values only
				for( int j=bpos; j<bpos+blen; j++ ) {
					double bval = bvals[j];
					if( bval != 0 )
						avals[aix+bix[j]] = bval;
				}
			}
		}
		else { //DENSE <- DENSE
			DenseBlock b = that.getDenseBlock();
			//merge physical block by physical block
			for(int bi=0; bi<a.numBlocks(); bi++) {
				double[] avals = a.valuesAt(bi);
				double[] bvals = b.valuesAt(bi);
				int blen = a.size(bi);
				//keep target value wherever the source is zero
				for( int j=0; j<blen; j++ )
					avals[j] = bvals[j]!=0 ? bvals[j] : avals[j];
			}
		}
	}
| |
	/**
	 * Parallel variant of the dense merge: merges the non-zero values of
	 * the given block into this dense block using parallel streams over
	 * rows (sparse source) or array positions (dense source).
	 *
	 * @param that source block to merge
	 */
	private void mergeIntoDensePar(MatrixBlock that) {
		DenseBlock a = getDenseBlock();
		if( that.sparse ) { //DENSE <- SPARSE
			SparseBlock b = that.sparseBlock;
			int roff = 0; //row offset
			for( int bi=0; bi<a.numBlocks(); bi++ ) {
				double[] avals = a.valuesAt(bi);
				int alen = a.blockSize(bi);
				final int lroff = roff; //final for lambda
				//parallelize over the rows of this physical block
				IntStream.range(lroff, lroff+alen).parallel().forEach(i -> {
					if( b.isEmpty(i) ) return;
					int aix = (i-lroff)*clen;
					int bpos = b.pos(i);
					int blen = b.size(i);
					int[] bix = b.indexes(i);
					double[] bval = b.values(i);
					//overwrite target cells with non-zero source values only
					for( int j=bpos; j<bpos+blen; j++ )
						if( bval[j] != 0 )
							avals[aix+bix[j]] = bval[j];
				});
				roff += alen;
			}
		}
		else { //DENSE <- DENSE
			DenseBlock b = that.getDenseBlock();
			for(int bi=0; bi<a.numBlocks(); bi++) {
				double[] avals = a.valuesAt(bi);
				double[] bvals = b.valuesAt(bi);
				//keep target value wherever the source is zero
				Arrays.parallelSetAll(avals,
					i -> (bvals[i]!=0) ? bvals[i] : avals[i]);
			}
		}
	}
| |
	/**
	 * Merges the non-zero values of the given block into this sparse
	 * block, appending values and sorting rows as needed (COO targets
	 * are sorted once at the end instead of per row).
	 *
	 * @param that source block to merge (sparse or dense)
	 * @param appendOnly if true, skip per-row sorting after appends
	 * @param deep if true, deep-copy source rows set into empty target rows
	 */
	private void mergeIntoSparse(MatrixBlock that, boolean appendOnly, boolean deep) {
		SparseBlock a = sparseBlock;
		final boolean COO = (a instanceof SparseBlockCOO);
		final int m = rlen;
		final int n = clen;
		if( that.sparse ) { //SPARSE <- SPARSE
			SparseBlock b = that.sparseBlock;
			for( int i=0; i<m; i++ ) {
				if( b.isEmpty(i) ) continue;
				if( !COO && a.isEmpty(i) ) {
					//copy entire sparse row (no sort required)
					a.set(i, b.get(i), deep);
				}
				else {
					boolean appended = false;
					int bpos = b.pos(i);
					int blen = b.size(i);
					int[] bix = b.indexes(i);
					double[] bval = b.values(i);
					for( int j=bpos; j<bpos+blen; j++ ) {
						if( bval[j] != 0 ) {
							a.append(i, bix[j], bval[j]);
							appended = true;
						}
					}
					//only sort if value appended
					if( !COO && !appendOnly && appended )
						a.sort(i);
				}
			}
		}
		else { //SPARSE <- DENSE
			DenseBlock b = that.getDenseBlock();
			for( int i=0; i<m; i++ ) {
				double[] bvals = b.values(i);
				int bix = b.pos(i);
				boolean appended = false;
				for( int j=0; j<n; j++ ) {
					double bval = bvals[bix+j];
					if( bval != 0 ) {
						appendValue(i, j, bval); //incl alloc
						appended = true;
					}
				}
				//only sort if value appended
				if( !COO && !appendOnly && appended )
					a.sort(i);
			}
		}
		//full sort of coordinate blocks
		if( COO && !appendOnly )
			a.sort();
	}
| |
| //////// |
| // Input/Output functions |
| |
	/**
	 * Deserializes this matrix block from the given input. The in-memory
	 * representation is chosen per block type: ultra-sparse and sparse
	 * inputs may still be read into a dense block if the evaluated
	 * in-memory sparsity favors dense.
	 *
	 * @param in data input to read from
	 * @throws IOException if an invalid block type is read or reading fails
	 */
	@Override
	public void readFields(DataInput in)
		throws IOException
	{
		//read basic header (int rlen, int clen, byte type)
		rlen = in.readInt();
		clen = in.readInt();
		byte bformat = in.readByte();
		
		//check type information
		if( bformat<0 || bformat>=BlockType.values().length )
			throw new IOException("invalid format: '"+bformat+"' (need to be 0-"+BlockType.values().length+").");
		
		BlockType format=BlockType.values()[bformat];
		try
		{
			switch(format)
			{
				case ULTRA_SPARSE_BLOCK:
					nonZeros = readNnzInfo( in, true );
					//in-memory representation may differ from the serialized one
					sparse = evalSparseFormatInMemory(rlen, clen, nonZeros);
					cleanupBlock(true, !(sparse && sparseBlock instanceof SparseBlockCSR));
					if( sparse )
						readUltraSparseBlock(in);
					else
						readUltraSparseToDense(in);
					break;
				case SPARSE_BLOCK:
					nonZeros = readNnzInfo( in, false );
					sparse = evalSparseFormatInMemory(rlen, clen, nonZeros);
					cleanupBlock(sparse, !sparse);
					if( sparse )
						readSparseBlock(in);
					else
						readSparseToDense(in);
					break;
				case DENSE_BLOCK:
					sparse = false;
					cleanupBlock(false, true); //reuse dense
					readDenseBlock(in); //always dense in-mem if dense on disk
					break;
				case EMPTY_BLOCK:
					sparse = true;
					cleanupBlock(true, !(sparseBlock instanceof SparseBlockCSR));
					if( sparseBlock != null )
						sparseBlock.reset();
					nonZeros = 0;
					break;
			}
		}
		catch(DMLRuntimeException ex)
		{
			//wrap runtime errors with block-type context
			throw new IOException("Error reading block of type '"+format.toString()+"'.", ex);
		}
	}
| |
	/**
	 * Reads a dense serialized block into the dense in-memory
	 * representation, counting non-zeros while reading.
	 *
	 * @param in data input to read from
	 * @throws IOException if reading fails
	 * @throws DMLRuntimeException on allocation errors
	 */
	private void readDenseBlock(DataInput in)
		throws IOException, DMLRuntimeException
	{
		if( !allocateDenseBlock(false) ) //allocate block
			denseBlock.reset(rlen, clen);
		
		DenseBlock a = getDenseBlock();
		long nnz = 0;
		if( in instanceof MatrixBlockDataInput ) { //fast deserialize
			MatrixBlockDataInput mbin = (MatrixBlockDataInput)in;
			for( int i=0; i<a.numBlocks(); i++ )
				nnz += mbin.readDoubleArray(a.size(i), a.valuesAt(i));
		}
		else if( in instanceof DataInputBuffer && HDFSTool.USE_BINARYBLOCK_SERIALIZATION ) {
			//workaround because sequencefile.reader.next(key, value) does not yet support serialization framework
			DataInputBuffer din = (DataInputBuffer)in;
			try(FastBufferedDataInputStream mbin = new FastBufferedDataInputStream(din)) {
				for( int i=0; i<a.numBlocks(); i++ )
					nnz += mbin.readDoubleArray(a.size(i), a.valuesAt(i));
			}
		}
		else { //default deserialize
			for( int i=0; i<rlen; i++ ) {
				double[] avals = a.values(i);
				int aix = a.pos(i);
				//read cell-wise and count non-zeros inline
				for( int j=0; j<clen; j++ )
					nnz += ((avals[aix+j] = in.readDouble()) != 0) ? 1 : 0;
			}
		}
		nonZeros = nnz;
	}
| |
| private void readSparseBlock(DataInput in) |
| throws IOException |
| { |
| if( !allocateSparseRowsBlock(false) ) |
| resetSparse(); //reset if not allocated |
| |
| if( in instanceof MatrixBlockDataInput ) { //fast deserialize |
| MatrixBlockDataInput mbin = (MatrixBlockDataInput)in; |
| nonZeros = mbin.readSparseRows(rlen, nonZeros, sparseBlock); |
| } |
| else if( in instanceof DataInputBuffer && HDFSTool.USE_BINARYBLOCK_SERIALIZATION ) { |
| //workaround because sequencefile.reader.next(key, value) does not yet support serialization framework |
| DataInputBuffer din = (DataInputBuffer)in; |
| FastBufferedDataInputStream mbin = null; |
| try { |
| mbin = new FastBufferedDataInputStream(din); |
| nonZeros = mbin.readSparseRows(rlen, nonZeros, sparseBlock); |
| } |
| finally { |
| IOUtilFunctions.closeSilently(mbin); |
| } |
| } |
| else { //default deserialize |
| for(int r=0; r<rlen; r++) { |
| int rnnz = in.readInt(); //row nnz |
| if( rnnz > 0 ) { |
| sparseBlock.reset(r, rnnz, clen); |
| for(int j=0; j<rnnz; j++) //col index/value pairs |
| sparseBlock.append(r, in.readInt(), in.readDouble()); |
| } |
| } |
| } |
| } |
| |
	/**
	 * Reads a sparse serialized block directly into the dense in-memory
	 * representation (used when the in-memory format evaluates to dense).
	 *
	 * @param in data input to read from
	 * @throws IOException if reading fails
	 * @throws DMLRuntimeException on allocation errors
	 */
	private void readSparseToDense(DataInput in)
		throws IOException, DMLRuntimeException
	{
		if( !allocateDenseBlock(false) ) //allocate block
			denseBlock.reset(rlen, clen);
		
		DenseBlock a = getDenseBlock();
		for(int r=0; r<rlen; r++) {
			int nr = in.readInt(); //row nnz
			double[] avals = a.values(r);
			int cix = a.pos(r);
			//scatter col index/value pairs into the dense row
			for( int j=0; j<nr; j++ ) {
				int c = in.readInt();
				avals[cix+c] = in.readDouble();
			}
		}
	}
| |
	/**
	 * Reads an ultra-sparse serialized block into a CSR sparse block
	 * (ijv-triples for matrices, iv-pairs for single-column blocks).
	 *
	 * @param in data input to read from
	 * @throws IOException if reading fails
	 */
	private void readUltraSparseBlock(DataInput in)
		throws IOException
	{
		//allocate ultra-sparse block in CSR to avoid unnecessary size overhead
		//and to allow efficient reset without repeated sparse row allocation
		
		//adjust size and ensure reuse block is in CSR format
		allocateAndResetSparseBlock(false, SparseBlock.Type.CSR);
		
		if( clen > 1 ) { //ULTRA-SPARSE BLOCK
			//block: read ijv-triples (ordered by row and column) via custom
			//init to avoid repeated updates of row pointers per append
			SparseBlockCSR sblockCSR = (SparseBlockCSR) sparseBlock;
			sblockCSR.initUltraSparse((int)nonZeros, in);
		}
		else { //ULTRA-SPARSE COL
			//col: read iv-pairs (should never happen since always dense)
			for(long i=0; i<nonZeros; i++) {
				int r = in.readInt();
				double val = in.readDouble();
				sparseBlock.allocate(r, 1, 1);
				sparseBlock.append(r, 0, val);
			}
		}
	}
| |
	/**
	 * Reads an ultra-sparse serialized block directly into the dense
	 * in-memory representation (ijv-triples for matrices, iv-pairs for
	 * single-column blocks).
	 *
	 * @param in data input to read from
	 * @throws IOException if reading fails
	 * @throws DMLRuntimeException on allocation errors
	 */
	private void readUltraSparseToDense(DataInput in)
		throws IOException, DMLRuntimeException
	{
		if( !allocateDenseBlock(false) ) //allocate block
			denseBlock.reset(rlen, clen);
		
		if( clen > 1 ) { //ULTRA-SPARSE BLOCK
			//block: read ijv-triples
			DenseBlock a = getDenseBlock();
			for(long i=0; i<nonZeros; i++) {
				int r = in.readInt();
				int c = in.readInt();
				a.set(r, c, in.readDouble());
			}
		}
		else { //ULTRA-SPARSE COL
			//col: read iv-pairs (row index doubles as linear position)
			double[] a = getDenseBlockValues();
			for(long i=0; i<nonZeros; i++)
				a[in.readInt()] = in.readDouble();
		}
	}
| |
	/**
	 * Serializes this matrix block, choosing the on-disk format
	 * (empty/dense/sparse/ultra-sparse) independently of the current
	 * in-memory representation.
	 *
	 * @param out data output to write to
	 * @throws IOException if writing fails
	 */
	@Override
	public void write(DataOutput out)
		throws IOException
	{
		//determine format
		boolean sparseSrc = sparse;
		boolean sparseDst = evalSparseFormatOnDisk();
		
		//write first part of header
		out.writeInt(rlen);
		out.writeInt(clen);
		
		if( sparseSrc )
		{
			//write sparse to *
			if( sparseBlock==null || nonZeros==0 )
				writeEmptyBlock(out);
			else if( isUltraSparseSerialize(sparseDst) )
				writeSparseToUltraSparse(out);
			else if( sparseDst )
				writeSparseBlock(out);
			else
				writeSparseToDense(out);
		}
		else
		{
			//write dense to *
			if( denseBlock==null || nonZeros==0 )
				writeEmptyBlock(out);
			else if( isUltraSparseSerialize(sparseDst) )
				writeDenseToUltraSparse(out);
			else if( sparseDst )
				writeDenseToSparse(out);
			else
				writeDenseBlock(out);
		}
	}
| |
| private static void writeEmptyBlock(DataOutput out) |
| throws IOException |
| { |
| //empty blocks do not need to materialize row information |
| out.writeByte( BlockType.EMPTY_BLOCK.ordinal() ); |
| } |
| |
/**
 * Serializes the dense block: a block-type byte followed by all cell
 * values as doubles in row-major order, iterating the physical blocks
 * of the underlying DenseBlock.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs
 */
private void writeDenseBlock(DataOutput out)
	throws IOException
{
	out.writeByte( BlockType.DENSE_BLOCK.ordinal() );
	
	DenseBlock a = getDenseBlock();
	if( out instanceof MatrixBlockDataOutput ) { //fast serialize
		MatrixBlockDataOutput mout = (MatrixBlockDataOutput)out;
		for(int i=0; i<a.numBlocks(); i++)
			mout.writeDoubleArray(a.size(i), a.valuesAt(i));
	}
	else { //general case (if fast serialize not supported)
		for(int i=0; i<a.numBlocks(); i++) {
			double[] avals = a.values(i);
			int limit = a.size(i);
			for(int j=0; j<limit; j++)
				out.writeDouble(avals[j]);
		}
	}
}
| |
/**
 * Serializes the sparse block: block-type byte, nnz meta data, then per
 * row an int count followed by (int column index, double value) pairs.
 * Rows beyond the allocated sparse block are written as empty (count 0)
 * so the stream always contains exactly rlen row entries.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs
 */
private void writeSparseBlock(DataOutput out)
	throws IOException
{
	out.writeByte( BlockType.SPARSE_BLOCK.ordinal() );
	writeNnzInfo( out, false );
	
	if( out instanceof MatrixBlockDataOutput ) //fast serialize
		((MatrixBlockDataOutput)out).writeSparseRows(rlen, sparseBlock);
	else //general case (if fast serialize not supported)
	{
		int r=0;
		for(;r<Math.min(rlen, sparseBlock.numRows()); r++)
		{
			if( sparseBlock.isEmpty(r) )
				out.writeInt(0);
			else
			{
				int pos = sparseBlock.pos(r);
				int nr = sparseBlock.size(r);
				int[] cols = sparseBlock.indexes(r);
				double[] values=sparseBlock.values(r);
				
				out.writeInt(nr);
				for(int j=pos; j<pos+nr; j++) {
					out.writeInt(cols[j]);
					out.writeDouble(values[j]);
				}
			}
		}
		//pad remaining rows (not backed by the sparse block) as empty
		for(;r<rlen; r++)
			out.writeInt(0);
	}
}
| |
/**
 * Serializes the sparse block in ultra-sparse form: block-type byte,
 * nnz meta data, then ijv-triples (or iv-pairs for single-column
 * blocks). COO blocks are written directly from their flat arrays;
 * other sparse formats are traversed row-wise. The written non-zero
 * count is validated against the maintained nnz.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs, or the written nnz does
 *         not match the maintained nnz
 */
private void writeSparseToUltraSparse(DataOutput out)
	throws IOException
{
	out.writeByte( BlockType.ULTRA_SPARSE_BLOCK.ordinal() );
	writeNnzInfo( out, true );
	
	long wnnz = 0;
	if( clen > 1 ) //ULTRA-SPARSE BLOCK
	{
		//block: write ijv-triples
		if( sparseBlock instanceof SparseBlockCOO ) {
			//COO: direct pass over the flat row/column/value arrays
			SparseBlockCOO sblock = (SparseBlockCOO)sparseBlock;
			int[] rix = sblock.rowIndexes();
			int[] cix = sblock.indexes();
			double[] vals = sblock.values();
			for(int i=0; i<sblock.size(); i++) {
				//ultra-sparse block: write ijv-triples
				out.writeInt(rix[i]);
				out.writeInt(cix[i]);
				out.writeDouble(vals[i]);
				wnnz++;
			}
		}
		else {
			//other sparse formats: row-wise traversal
			for(int r=0;r<Math.min(rlen, sparseBlock.numRows()); r++) {
				if( sparseBlock.isEmpty(r) ) continue;
				int apos = sparseBlock.pos(r);
				int alen = sparseBlock.size(r);
				int[] aix = sparseBlock.indexes(r);
				double[] avals = sparseBlock.values(r);
				for(int j=apos; j<apos+alen; j++) {
					//ultra-sparse block: write ijv-triples
					out.writeInt(r);
					out.writeInt(aix[j]);
					out.writeDouble(avals[j]);
					wnnz++;
				}
			}
		}
	}
	else //ULTRA-SPARSE COL
	{
		//block: write iv-pairs (should never happen since always dense)
		for(int r=0;r<Math.min(rlen, sparseBlock.numRows()); r++)
			if(!sparseBlock.isEmpty(r) ) {
				int pos = sparseBlock.pos(r);
				out.writeInt(r);
				out.writeDouble(sparseBlock.values(r)[pos]);
				wnnz++;
			}
	}
	
	//validity check (nnz must exactly match written nnz)
	if( nonZeros != wnnz ) {
		throw new IOException("Invalid number of serialized non-zeros: "+wnnz+" (expected: "+nonZeros+")");
	}
}
| |
/**
 * Serializes the sparse block in dense form: block-type byte followed by
 * all rlen*clen cell values as doubles in row-major order, filling gaps
 * between (and after) the stored non-zeros with zeros.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs
 */
private void writeSparseToDense(DataOutput out)
	throws IOException
{
	//write block type 'dense'
	out.writeByte( BlockType.DENSE_BLOCK.ordinal() );
	
	//write data (from sparse to dense)
	if( sparseBlock==null ) //empty block
		for( int i=0; i<rlen*clen; i++ )
			out.writeDouble(0);
	else //existing sparse block
	{
		SparseBlock a = sparseBlock;
		for( int i=0; i<rlen; i++ )
		{
			if( i<a.numRows() && !a.isEmpty(i) )
			{
				int apos = a.pos(i);
				int alen = a.size(i);
				int[] aix = a.indexes(i);
				double[] avals = a.values(i);
				//foreach non-zero value, fill with 0s if required
				//(j tracks the output column, j2 the non-zero position)
				for( int j=0, j2=0; j2<alen; j++, j2++ ) {
					for( ; j<aix[apos+j2]; j++ )
						out.writeDouble( 0 );
					out.writeDouble( avals[apos+j2] );
				}
				//remaining 0 values in row (after the last non-zero)
				for( int j=aix[apos+alen-1]+1; j<clen; j++)
					out.writeDouble( 0 );
			}
			else //empty row
				for( int j=0; j<clen; j++ )
					out.writeDouble( 0 );
		}
	}
}
| |
/**
 * Serializes the dense block in ultra-sparse form: block-type byte,
 * nnz meta data, then ijv-triples for every non-zero cell (iv-pairs for
 * single-column blocks). The written non-zero count is validated
 * against the maintained nnz.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs, or the written nnz does
 *         not match the maintained nnz
 */
private void writeDenseToUltraSparse(DataOutput out) throws IOException
{
	out.writeByte( BlockType.ULTRA_SPARSE_BLOCK.ordinal() );
	writeNnzInfo( out, true );
	
	long wnnz = 0;
	if( clen > 1 ) { //ULTRA-SPARSE BLOCK
		//block: write ijv-triples
		DenseBlock a = getDenseBlock();
		for( int r=0; r<rlen; r++ ) {
			double[] avals = a.values(r);
			int aix = a.pos(r);
			for( int c=0; c<clen; c++ ) {
				double aval = avals[aix+c];
				if( aval != 0 ) {
					out.writeInt(r);
					out.writeInt(c);
					out.writeDouble(aval);
					wnnz++;
				}
			}
		}
	}
	else { //ULTRA-SPARSE COL
		//col: write iv-pairs
		double[] a = getDenseBlockValues();
		for(int r=0; r<rlen; r++) {
			double aval = a[r];
			if( aval != 0 ) {
				out.writeInt(r);
				out.writeDouble(aval);
				wnnz++;
			}
		}
	}
	
	//validity check (nnz must exactly match written nnz)
	if( nonZeros != wnnz ) {
		throw new IOException("Invalid number of serialized non-zeros: "+wnnz+" (expected: "+nonZeros+")");
	}
}
| |
/**
 * Serializes the dense block in sparse form: block-type byte, nnz meta
 * data, then per row an int count of non-zeros followed by
 * (int column index, double value) pairs for each non-zero cell.
 *
 * @param out data output
 * @throws IOException if an I/O error occurs
 */
private void writeDenseToSparse(DataOutput out)
	throws IOException
{
	out.writeByte( BlockType.SPARSE_BLOCK.ordinal() ); //block type
	writeNnzInfo( out, false );
	
	DenseBlock a = getDenseBlock();
	for( int r=0; r<rlen; r++ ) {
		double[] avals = a.values(r);
		int aix = a.pos(r);
		out.writeInt(a.countNonZeros(r)); //per-row nnz header
		for( int c=0; c<clen; c++ ) {
			double aval = avals[aix+c];
			if( aval != 0 ) {
				out.writeInt(c);
				out.writeDouble(aval);
			}
		}
	}
}
| |
| private long readNnzInfo( DataInput in, boolean ultrasparse ) throws IOException { |
| //note: if ultrasparse, int always sufficient because nnz<rlen |
| // where rlen is limited to integer |
| long lrlen = rlen; |
| long lclen = clen; |
| |
| //read long if required, otherwise int (see writeNnzInfo, consistency required) |
| if( lrlen*lclen > Integer.MAX_VALUE && !ultrasparse) |
| nonZeros = in.readLong(); |
| else |
| nonZeros = in.readInt(); |
| return nonZeros; |
| } |
| |
| private void writeNnzInfo( DataOutput out, boolean ultrasparse ) throws IOException { |
| //note: if ultrasparse, int always sufficient because nnz<rlen |
| // where rlen is limited to integer |
| long lrlen = rlen; |
| long lclen = clen; |
| |
| //write long if required, otherwise int |
| if( lrlen*lclen > Integer.MAX_VALUE && !ultrasparse) |
| out.writeLong( nonZeros ); |
| else |
| out.writeInt( (int)nonZeros ); |
| } |
| |
| /** |
| * Redirects the default java serialization via externalizable to our default |
| * hadoop writable serialization for efficient broadcast/rdd deserialization. |
| * |
| * @param is object input |
| * @throws IOException if IOException occurs |
| */ |
| @Override |
| public void readExternal(ObjectInput is) |
| throws IOException |
| { |
| if( is instanceof ObjectInputStream |
| && !(is instanceof MatrixBlockDataInput) ) |
| { |
| //fast deserialize of dense/sparse blocks |
| ObjectInputStream ois = (ObjectInputStream)is; |
| FastBufferedDataInputStream fis = new FastBufferedDataInputStream(ois); |
| readFields(fis); //note: cannot close fos as this would close oos |
| } |
| else { |
| //default deserialize (general case) |
| readFields(is); |
| } |
| } |
| |
| /** |
| * Redirects the default java serialization via externalizable to our default |
| * hadoop writable serialization for efficient broadcast/rdd serialization. |
| * |
| * @param os object output |
| * @throws IOException if IOException occurs |
| */ |
| @Override |
| public void writeExternal(ObjectOutput os) |
| throws IOException |
| { |
| //note: in case of a CorrMatrixBlock being wrapped around a matrix |
| //block, the object output is already a FastBufferedDataOutputStream; |
| //so in general we try to avoid unnecessary buffer allocations here. |
| |
| if( os instanceof ObjectOutputStream && !isEmptyBlock(false) |
| && !(os instanceof MatrixBlockDataOutput) ) { |
| //fast serialize of dense/sparse blocks |
| ObjectOutputStream oos = (ObjectOutputStream)os; |
| FastBufferedDataOutputStream fos = new FastBufferedDataOutputStream(oos); |
| write(fos); //note: cannot close fos as this would close oos |
| fos.flush(); |
| } |
| else { |
| //default serialize (general case) |
| write(os); |
| } |
| } |
| |
| /** |
| * NOTE: The used estimates must be kept consistent with the respective write functions. |
| * |
| * @return exact size on disk |
| */ |
| public long getExactSizeOnDisk() |
| { |
| //determine format |
| boolean sparseSrc = sparse; |
| boolean sparseDst = evalSparseFormatOnDisk(); |
| |
| long lrlen = rlen; |
| long lclen = clen; |
| |
| //ensure exact size estimates for write |
| if( nonZeros <= 0 ) { |
| recomputeNonZeros(); |
| } |
| |
| //get exact size estimate (see write for the corresponding meaning) |
| if( sparseSrc ) |
| { |
| //write sparse to * |
| if(sparseBlock==null || nonZeros==0) |
| return HEADER_SIZE; //empty block |
| else if( nonZeros<lrlen && sparseDst ) |
| return estimateSizeUltraSparseOnDisk(lrlen, lclen, nonZeros); //ultra sparse block |
| else if( sparseDst ) |
| return estimateSizeSparseOnDisk(lrlen, lclen, nonZeros); //sparse block |
| else |
| return estimateSizeDenseOnDisk(lrlen, lclen); //dense block |
| } |
| else |
| { |
| //write dense to * |
| if(denseBlock==null || nonZeros==0) |
| return HEADER_SIZE; //empty block |
| else if( nonZeros<lrlen && sparseDst ) |
| return estimateSizeUltraSparseOnDisk(lrlen, lclen, nonZeros); //ultra sparse block |
| else if( sparseDst ) |
| return estimateSizeSparseOnDisk(lrlen, lclen, nonZeros); //sparse block |
| else |
| return estimateSizeDenseOnDisk(lrlen, lclen); //dense block |
| } |
| } |
| |
| //////// |
| // Estimates size and sparsity |
| |
/**
 * Estimates the in-memory size of this block based on its current
 * dimensions and sparsity.
 *
 * @return estimated size in bytes
 */
public long estimateSizeInMemory() {
	return estimateSizeInMemory(rlen, clen, getSparsity());
}
| |
| public static long estimateSizeInMemory(long nrows, long ncols, double sparsity) |
| { |
| //determine sparse/dense representation |
| boolean sparse = evalSparseFormatInMemory(nrows, ncols, (long)(sparsity*nrows*ncols)); |
| |
| //estimate memory consumption for sparse/dense |
| if( sparse ) |
| return estimateSizeSparseInMemory(nrows, ncols, sparsity); |
| else |
| return estimateSizeDenseInMemory(nrows, ncols); |
| } |
| |
| public static long estimateSizeDenseInMemory(long nrows, long ncols) |
| { |
| // basic variables and references sizes |
| double size = 44; |
| |
| // core dense matrix block (double array) |
| size += 8d * nrows * ncols; |
| |
| // robustness for long overflows |
| return (long) Math.min(size, Long.MAX_VALUE); |
| } |
| |
/**
 * Estimates the in-memory size of a sparse matrix block using the
 * default sparse block type.
 *
 * @param nrows number of rows
 * @param ncols number of columns
 * @param sparsity fraction of non-zero cells
 * @return estimated size in bytes
 */
public static long estimateSizeSparseInMemory(long nrows, long ncols, double sparsity) {
	return estimateSizeSparseInMemory(nrows, ncols, sparsity, DEFAULT_SPARSEBLOCK);
}
| |
| public static long estimateSizeSparseInMemory(long nrows, long ncols, double sparsity, SparseBlock.Type stype) |
| { |
| // basic variables and references sizes |
| double size = 44; |
| |
| // delegate memory estimate to individual sparse blocks |
| size += SparseBlockFactory.estimateSizeSparseInMemory( |
| stype, nrows, ncols, sparsity); |
| |
| // robustness for long overflows |
| return (long) Math.min(size, Long.MAX_VALUE); |
| } |
| |
/**
 * Estimates the serialized size of this block based on its current
 * dimensions and non-zero count.
 *
 * @return estimated size on disk in bytes
 */
public long estimateSizeOnDisk()
{
	return estimateSizeOnDisk(rlen, clen, nonZeros);
}
| |
| public static long estimateSizeOnDisk( long nrows, long ncols, long nnz ) |
| { |
| //determine sparse/dense representation |
| boolean sparse = evalSparseFormatOnDisk(nrows, ncols, nnz); |
| |
| //estimate memory consumption for sparse/dense |
| if( sparse && nnz<nrows ) |
| return estimateSizeUltraSparseOnDisk(nrows, ncols, nnz); |
| else if( sparse ) |
| return estimateSizeSparseOnDisk(nrows, ncols, nnz); |
| else |
| return estimateSizeDenseOnDisk(nrows, ncols); |
| } |
| |
| private static long estimateSizeDenseOnDisk( long nrows, long ncols) |
| { |
| //basic header (int rlen, int clen, byte type) |
| long size = HEADER_SIZE; |
| //data (all cells double) |
| size += nrows * ncols * 8; |
| |
| return size; |
| } |
| |
| private static long estimateSizeSparseOnDisk( long nrows, long ncols, long nnz ) |
| { |
| //basic header: (int rlen, int clen, byte type) |
| long size = HEADER_SIZE; |
| //extended header (long nnz) |
| size += (nrows*ncols > Integer.MAX_VALUE) ? 8 : 4; |
| //data: (int num per row, int-double pair per non-zero value) |
| size += nrows * 4 + nnz * 12; |
| |
| return size; |
| } |
| |
| private static long estimateSizeUltraSparseOnDisk( long nrows, long ncols, long nnz ) |
| { |
| //basic header (int rlen, int clen, byte type) |
| long size = HEADER_SIZE; |
| //extended header (int nnz, guaranteed by rlen<nnz) |
| size += 4; |
| //data (int-int-double triples per non-zero value) |
| if( ncols > 1 ) //block: ijv-triples |
| size += nnz * 16; |
| else //column: iv-pairs |
| size += nnz * 12; |
| |
| return size; |
| } |
| |
| public static SparsityEstimate estimateSparsityOnAggBinary(MatrixBlock m1, MatrixBlock m2, AggregateBinaryOperator op) |
| { |
| //Since MatrixMultLib always uses a dense output (except for ultra-sparse mm) |
| //with subsequent check for sparsity, we should always return a dense estimate. |
| //Once, we support more aggregate binary operations, we need to change this. |
| |
| //WARNING: KEEP CONSISTENT WITH LIBMATRIXMULT |
| //Note that it is crucial to report the right output representation because |
| //in case of block reuse (e.g., mmcj) the output 'reset' refers to either |
| //dense or sparse representation and hence would produce incorrect results |
| //if we report the wrong representation (i.e., missing reset on ultrasparse mm). |
| |
| boolean ultrasparse = (m1.isUltraSparse() || m2.isUltraSparse()); |
| return new SparsityEstimate(ultrasparse, m1.getNumRows()*m2.getNumRows()); |
| } |
| |
/**
 * Estimates the output sparsity of a binary cell-wise operation,
 * consistent with the hop compiler's sparsity estimates. Sparse-unsafe
 * operations are estimated dense, except DIV against a fully dense
 * right-hand side (which behaves sparse-safe except for 0/0-&gt;NaN).
 *
 * @param m1 left-hand-side input
 * @param m2 right-hand-side input
 * @param op binary operator
 * @return sparsity estimate (representation and estimated nnz)
 */
private static SparsityEstimate estimateSparsityOnBinary(MatrixBlock m1, MatrixBlock m2, BinaryOperator op)
{
	SparsityEstimate est = new SparsityEstimate();
	
	//estimate dense output for all sparse-unsafe operations, except DIV (because it commonly behaves like
	//sparse-safe but is not due to 0/0->NaN, this is consistent with the current hop sparsity estimate)
	//see also, special sparse-safe case for DIV in LibMatrixBincell
	if( !op.sparseSafe && !(op.fn instanceof Divide && m2.getSparsity()==1.0) ) {
		est.sparse = false;
		return est;
	}
	
	BinaryAccessType atype = LibMatrixBincell.getBinaryAccessType(m1, m2);
	boolean outer = (atype == BinaryAccessType.OUTER_VECTOR_VECTOR);
	long m = m1.getNumRows();
	long n = outer ? m2.getNumColumns() : m1.getNumColumns();
	long nz1 = m1.getNonZeros();
	long nz2 = m2.getNonZeros();
	
	//account for matrix vector and vector/vector
	long estnnz = 0;
	if( atype == BinaryAccessType.OUTER_VECTOR_VECTOR )
	{
		estnnz = OptimizerUtils.getOuterNonZeros(
			m, n, nz1, nz2, op.getBinaryOperatorOpOp2());
	}
	else //DEFAULT CASE
	{
		//scale vector nnz to the effective replication over the matrix
		if( atype == BinaryAccessType.MATRIX_COL_VECTOR )
			nz2 = nz2 * n;
		else if( atype == BinaryAccessType.MATRIX_ROW_VECTOR )
			nz2 = nz2 * m;
		
		//compute output sparsity consistent w/ the hop compiler
		double sp1 = OptimizerUtils.getSparsity(m, n, nz1);
		double sp2 = OptimizerUtils.getSparsity(m, n, nz2);
		double spout = OptimizerUtils.getBinaryOpSparsity(
			sp1, sp2, op.getBinaryOperatorOpOp2(), true);
		estnnz = UtilFunctions.toLong(spout * m * n);
	}
	
	est.sparse = evalSparseFormatInMemory(m, n, estnnz);
	est.estimatedNonZeros = estnnz;
	
	return est;
}
| |
| private boolean estimateSparsityOnSlice(int selectRlen, int selectClen, int finalRlen, int finalClen) { |
| long ennz = (long)((double)nonZeros/rlen/clen*selectRlen*selectClen); |
| return evalSparseFormatInMemory(finalRlen, finalClen, ennz); |
| } |
| |
/**
 * Estimates whether the output of a left indexing operation should be
 * sparse, using the worst-case upper bound of the output non-zeros.
 *
 * @param rlenm1 rows of the target matrix
 * @param clenm1 columns of the target matrix
 * @param nnzm1 non-zeros of the target matrix
 * @param rlenm2 rows of the inserted matrix
 * @param clenm2 columns of the inserted matrix
 * @param nnzm2 non-zeros of the inserted matrix
 * @return true if the output should use a sparse representation
 */
private static boolean estimateSparsityOnLeftIndexing(
	long rlenm1, long clenm1, long nnzm1, long rlenm2, long clenm2, long nnzm2) {
	//min bound: nnzm1 - rlenm2*clenm2 + nnzm2
	//max bound: min(rlenm1*clenm1, nnzm1+nnzm2)
	long ennz = Math.min(rlenm1*clenm1, nnzm1+nnzm2);
	return evalSparseFormatInMemory(rlenm1, clenm1, ennz);
}
| |
| private boolean requiresInplaceSparseBlockOnLeftIndexing(boolean sparse, UpdateType update, long nnz) { |
| return sparse && update != UpdateType.INPLACE_PINNED |
| && !isShallowSerialize() && (nnz <= Integer.MAX_VALUE |
| || DEFAULT_INPLACE_SPARSEBLOCK==SparseBlock.Type.MCSR); |
| } |
| |
| private static boolean estimateSparsityOnGroupedAgg( long rlen, long groups ) { |
| long ennz = Math.min(groups, rlen); |
| return evalSparseFormatInMemory(groups, 1, ennz); |
| } |
| |
| //////// |
| // CacheBlock implementation |
| |
/**
 * Returns the estimated in-memory size of this block: header-only
 * (44 bytes, matching the overhead constant of the estimate methods)
 * if unallocated, otherwise the dense or sparse estimate for the
 * current representation.
 *
 * @return estimated in-memory size in bytes
 */
@Override
public long getInMemorySize() {
	//in-memory size given by header if not allocated
	if( !isAllocated() )
		return 44;
	//in-memory size of dense/sparse representation
	return !sparse ? estimateSizeDenseInMemory(rlen, clen) :
		estimateSizeSparseInMemory(rlen, clen, getSparsity(),
		SparseBlockFactory.getSparseBlockType(sparseBlock));
}
| |
/**
 * Returns the exact serialized size, which equals the exact on-disk
 * size because serialization and disk writes share the same format.
 *
 * @return exact serialized size in bytes
 */
@Override
public long getExactSerializedSize() {
	return getExactSizeOnDisk();
}
| |
/**
 * Indicates if this block can be serialized without a deep copy,
 * excluding on-demand format conversions.
 *
 * @return true if shallow serialization is possible
 */
@Override
public boolean isShallowSerialize() {
	return isShallowSerialize(false);
}
| |
/**
 * Indicates if this block can be serialized without a deep copy:
 * dense blocks, blocks serialized in dense form, CSR sparse blocks,
 * MCSR blocks whose in-memory overhead is within the allowed factor of
 * the serialized size, or (if inclConvert) MCSR blocks eligible for
 * conversion to CSR on deep serialize.
 *
 * @param inclConvert true to also consider MCSR-to-CSR conversion
 * @return true if shallow serialization is possible
 */
@Override
public boolean isShallowSerialize(boolean inclConvert) {
	//shallow serialize if dense, dense in serialized form or already in CSR
	boolean sparseDst = evalSparseFormatOnDisk();
	return !sparse || !sparseDst
		|| (sparse && sparseBlock instanceof SparseBlockCSR)
		|| (sparse && sparseBlock instanceof SparseBlockMCSR
			&& getInMemorySize() / MAX_SHALLOW_SERIALIZE_OVERHEAD
			<= getExactSerializedSize())
		|| (sparse && sparseBlock instanceof SparseBlockMCSR
			&& nonZeros < Integer.MAX_VALUE //CSR constraint
			&& inclConvert && CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE
			&& !isUltraSparseSerialize(sparseDst));
}
| |
/**
 * Converts the sparse block to CSR if the block is only shallow
 * serializable via conversion (see isShallowSerialize(true));
 * otherwise leaves the block unchanged.
 */
@Override
public void toShallowSerializeBlock() {
	//no-op if already shallow serializable or conversion would not help
	if( isShallowSerialize() || !isShallowSerialize(true) )
		return;
	sparseBlock = SparseBlockFactory.copySparseBlock(
		SparseBlock.Type.CSR, sparseBlock, false);
}
| |
/**
 * Frees the allocated dense/sparse data structures if this block is
 * logically empty, reducing it to meta data only.
 */
@Override
public void compactEmptyBlock() {
	if( isEmptyBlock(false) && isAllocated() )
		cleanupBlock(true, true);
}
| |
| //////// |
| // Core block operations (called from instructions) |
| |
/**
 * Executes a scalar operation (e.g., X+7, X*2) over this block.
 *
 * @param op scalar operator
 * @param result potentially reusable output block (may be null)
 * @return the output matrix block
 */
@Override
public MatrixBlock scalarOperations(ScalarOperator op, MatrixValue result) {
	MatrixBlock ret = checkType(result);
	
	// estimate the sparsity structure of result matrix
	boolean sp = this.sparse; // by default, we guess result.sparsity=input.sparsity
	if (!op.sparseSafe)
		sp = false; // if the operation is not sparse safe, then result will be in dense format
	
	//allocate the output matrix block
	if( ret==null )
		ret = new MatrixBlock(rlen, clen, sp, this.nonZeros);
	else
		ret.reset(rlen, clen, sp, this.nonZeros);
	
	//core scalar operations
	LibMatrixBincell.bincellOp(this, ret, op);
	
	return ret;
}
| |
/**
 * Executes a unary operation (e.g., exp, log, cumsum) over this block,
 * dispatching to specialized cumulative-aggregate kernels, a
 * multi-threaded dense path for expensive operators, or the default
 * sparse-safe/unsafe cell-wise implementations.
 *
 * @param op unary operator
 * @param result potentially reusable output block (may be null)
 * @return the output matrix block
 */
@Override
public MatrixBlock unaryOperations(UnaryOperator op, MatrixValue result) {
	MatrixBlock ret = checkType(result);
	
	// estimate the sparsity structure of result matrix
	// by default, we guess result.sparsity=input.sparsity, unless not sparse safe
	boolean sp = this.sparse && op.sparseSafe;
	
	//allocate output (CUMSUMPROD produces a single-column output)
	int n = Builtin.isBuiltinCode(op.fn, BuiltinCode.CUMSUMPROD) ? 1 : clen;
	if( ret == null )
		ret = new MatrixBlock(rlen, n, sp, sp ? nonZeros : rlen*n);
	else
		ret.reset(rlen, n, sp);
	
	//core execute
	if( LibMatrixAgg.isSupportedUnaryOperator(op) ) {
		//e.g., cumsum/cumprod/cummin/cumax/cumsumprod
		if( op.getNumThreads() > 1 )
			ret = LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op, op.getNumThreads());
		else
			ret = LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op);
	}
	else if(!sparse && !isEmptyBlock(false)
		&& OptimizerUtils.isMaxLocalParallelism(op.getNumThreads())) {
		//note: we apply multi-threading in a best-effort manner here
		//only for expensive operators such as exp, log, sigmoid, because
		//otherwise allocation, read and write anyway dominates
		ret.allocateDenseBlock(false);
		DenseBlock a = getDenseBlock();
		DenseBlock c = ret.getDenseBlock();
		for(int bi=0; bi<a.numBlocks(); bi++) {
			double[] avals = a.valuesAt(bi), cvals = c.valuesAt(bi);
			Arrays.parallelSetAll(cvals, i -> op.fn.execute(avals[i]));
		}
		ret.recomputeNonZeros();
	}
	else {
		//default execute unary operations
		if(op.sparseSafe)
			sparseUnaryOperations(op, ret);
		else
			denseUnaryOperations(op, ret);
	}
	
	//ensure empty results sparse representation
	//(no additional memory requirements)
	if( ret.isEmptyBlock(false) )
		ret.examSparsity();
	
	return ret;
}
| |
/**
 * Executes a sparse-safe unary operation (op(0)=0, so only stored
 * non-zeros need processing) with nnz maintenance, covering
 * sparse-to-sparse, sparse-to-dense, and dense-to-dense outputs.
 *
 * @param op sparse-safe unary operator
 * @param ret pre-reset output block (representation already decided)
 */
private void sparseUnaryOperations(UnaryOperator op, MatrixBlock ret) {
	//early abort possible since sparse-safe
	if( isEmptyBlock(false) )
		return;
	
	final int m = rlen;
	final int n = clen;
	
	if( sparse && ret.sparse ) //SPARSE <- SPARSE
	{
		ret.allocateSparseRowsBlock();
		SparseBlock a = sparseBlock;
		SparseBlock c = ret.sparseBlock;
		
		long nnz = 0;
		for(int i=0; i<m; i++) {
			if( a.isEmpty(i) ) continue;
			
			int apos = a.pos(i);
			int alen = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			
			c.allocate(i, alen); //avoid repeated alloc
			for( int j=apos; j<apos+alen; j++ ) {
				double val = op.fn.execute(avals[j]);
				c.append(i, aix[j], val);
				nnz += (val != 0) ? 1 : 0;
			}
		}
		ret.nonZeros = nnz;
	}
	else if( sparse ) //DENSE <- SPARSE
	{
		ret.allocateDenseBlock(false);
		SparseBlock a = sparseBlock;
		DenseBlock c = ret.denseBlock;
		//if the output was prefilled (nonZeros>0, see denseUnaryOperations),
		//all positions not covered by input non-zeros already count as nnz
		long nnz = (ret.nonZeros > 0) ?
			(long) m*n-a.size() : 0;
		for(int i=0; i<m; i++) {
			if( a.isEmpty(i) ) continue;
			int apos = a.pos(i);
			int alen = a.size(i);
			int[] aix = a.indexes(i);
			double[] avals = a.values(i);
			double[] cvals = c.values(i);
			int cix = c.pos(i);
			for( int j=apos; j<apos+alen; j++ ) {
				double val = op.fn.execute(avals[j]);
				cvals[cix + aix[j]] = val;
				nnz += (val != 0) ? 1 : 0;
			}
		}
		ret.nonZeros = nnz;
	}
	else //DENSE <- DENSE
	{
		//allocate dense output block
		ret.allocateDenseBlock(false);
		DenseBlock da = getDenseBlock();
		DenseBlock dc = ret.getDenseBlock();
		
		//unary op, incl nnz maintenance
		long nnz = 0;
		for( int bi=0; bi<da.numBlocks(); bi++ ) {
			double[] a = da.valuesAt(bi);
			double[] c = dc.valuesAt(bi);
			int len = da.size(bi);
			for( int i=0; i<len; i++ ) {
				c[i] = op.fn.execute(a[i]);
				nnz += (c[i] != 0) ? 1 : 0;
			}
		}
		ret.nonZeros = nnz;
	}
}
| |
/**
 * Executes a sparse-unsafe unary operation (op(0) may be non-zero) by
 * initializing the output with op(0) where needed and then delegating
 * the remaining non-zero processing to the sparse-safe implementation.
 *
 * @param op unary operator (potentially sparse-unsafe)
 * @param ret pre-reset output block
 */
private void denseUnaryOperations(UnaryOperator op, MatrixBlock ret) {
	//prepare 0-value init (determine if unnecessarily sparse-unsafe)
	double val0 = op.fn.execute(0d);
	
	final int m = rlen;
	final int n = clen;
	
	//early abort possible if unnecessarily sparse unsafe
	//(otherwise full init with val0, no need for computation)
	if( isEmptyBlock(false) ) {
		if( val0 != 0 )
			ret.reset(m, n, val0);
		return;
	}
	
	//redirection to sparse safe operation w/ init by val0
	if( sparse && val0 != 0 ) {
		ret.reset(m, n, val0);
		ret.nonZeros = (long)m * n;
	}
	sparseUnaryOperations(op, ret);
}
| |
/**
 * Executes a binary cell-wise operation (e.g., X+Y, X*Y) over this
 * block and another, supporting matrix-matrix, matrix-vector, and
 * outer vector-vector access patterns.
 *
 * @param op binary operator
 * @param thatValue right-hand-side input
 * @param result potentially reusable output block (may be null)
 * @return the output matrix block
 * @throws RuntimeException if the input dimensions are incompatible
 */
@Override
public MatrixBlock binaryOperations(BinaryOperator op, MatrixValue thatValue, MatrixValue result) {
	MatrixBlock that = checkType(thatValue);
	MatrixBlock ret = checkType(result);
	if( !LibMatrixBincell.isValidDimensionsBinary(this, that) ) {
		throw new RuntimeException("Block sizes are not matched for binary " +
			"cell operations: "+this.rlen+"x"+this.clen+" vs "+ that.rlen+"x"+that.clen);
	}
	
	//compute output dimensions (outer vector-vector expands to a matrix)
	boolean outer = (LibMatrixBincell.getBinaryAccessType(this, that)
		== BinaryAccessType.OUTER_VECTOR_VECTOR);
	int rows = rlen;
	int cols = outer ? that.clen : clen;
	
	//estimate output sparsity
	SparsityEstimate resultSparse = estimateSparsityOnBinary(this, that, op);
	if( ret == null )
		ret = new MatrixBlock(rows, cols, resultSparse.sparse, resultSparse.estimatedNonZeros);
	else
		ret.reset(rows, cols, resultSparse.sparse, resultSparse.estimatedNonZeros);
	
	//core binary cell operation
	LibMatrixBincell.bincellOp( this, that, ret, op );
	
	return ret;
}
| |
/**
 * Executes a binary cell-wise operation in place, updating this block;
 * converts this block's representation beforehand if the estimated
 * output sparsity differs from the current one.
 *
 * @param op binary operator
 * @param thatValue right-hand-side input
 * @return this block, updated in place
 * @throws RuntimeException if the input dimensions are incompatible
 */
@Override
public MatrixBlock binaryOperationsInPlace(BinaryOperator op, MatrixValue thatValue) {
	MatrixBlock that=checkType(thatValue);
	if( !LibMatrixBincell.isValidDimensionsBinary(this, that) ) {
		throw new RuntimeException("block sizes are not matched for binary " +
			"cell operations: "+this.rlen+"*"+this.clen+" vs "+ that.rlen+"*"+that.clen);
	}
	
	//estimate output sparsity and convert representation if required
	SparsityEstimate resultSparse = estimateSparsityOnBinary(this, that, op);
	if(resultSparse.sparse && !this.sparse)
		denseToSparse();
	else if(!resultSparse.sparse && this.sparse)
		sparseToDense();
	
	//core binary cell operation
	LibMatrixBincell.bincellOpInPlace(this, that, op);
	return this;
}
| |
/**
 * Executes a ternary cell-wise operation (e.g., ifelse, +*, -*) over
 * this block and two further inputs, where any input may be a 1x1
 * scalar that is broadcast. Includes fast paths for shallow-copy
 * ifelse and for +*/-* via binary operations with a folded constant.
 *
 * @param op ternary operator
 * @param m2 second input
 * @param m3 third input
 * @param ret output block (reset internally)
 * @return the output matrix block
 * @throws DMLRuntimeException if non-scalar input dimensions mismatch
 */
public MatrixBlock ternaryOperations(TernaryOperator op, MatrixBlock m2, MatrixBlock m3, MatrixBlock ret) {
	//prepare inputs (detect 1x1 scalars and extract their values)
	final boolean s1 = (rlen==1 && clen==1);
	final boolean s2 = (m2.rlen==1 && m2.clen==1);
	final boolean s3 = (m3.rlen==1 && m3.clen==1);
	final double d1 = s1 ? quickGetValue(0, 0) : Double.NaN;
	final double d2 = s2 ? m2.quickGetValue(0, 0) : Double.NaN;
	final double d3 = s3 ? m3.quickGetValue(0, 0) : Double.NaN;
	final int m = Math.max(Math.max(rlen, m2.rlen), m3.rlen);
	final int n = Math.max(Math.max(clen, m2.clen), m3.clen);
	final long nnz = nonZeros;
	
	//error handling
	if( (!s1 && (rlen != m || clen != n))
		|| (!s2 && (m2.rlen != m || m2.clen != n))
		|| (!s3 && (m3.rlen != m || m3.clen != n)) ) {
		throw new DMLRuntimeException("Block sizes are not matched for ternary cell operations: "
			+ rlen + "x" + clen + " vs " + m2.rlen + "x" + m2.clen + " vs " + m3.rlen + "x" + m3.clen);
	}
	
	//prepare result
	ret.reset(m, n, false);
	
	if( op.fn instanceof IfElse && (s1 || nnz==0 || nnz==(long)m*n) ) {
		//SPECIAL CASE for shallow-copy if-else
		//(condition is a scalar, all-zero, or all-non-zero)
		boolean expr = s1 ? (d1 != 0) : (nnz==(long)m*n);
		MatrixBlock tmp = expr ? m2 : m3;
		if( tmp.rlen==m && tmp.clen==n ) {
			//shallow copy incl meta data
			ret.copyShallow(tmp);
		}
		else {
			//fill output with given scalar value
			double tmpVal = tmp.quickGetValue(0, 0);
			if( tmpVal != 0 ) {
				ret.allocateDenseBlock();
				ret.denseBlock.set(tmpVal);
				ret.nonZeros = (long)m * n;
			}
		}
	}
	else if (s2 != s3 && (op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply) ) {
		//SPECIAL CASE for sparse-dense combinations of common +* and -*
		//(fold the scalar into a binary operator constant)
		BinaryOperator bop = ((ValueFunctionWithConstant)op.fn)
			.setOp2Constant(s2 ? d2 : d3);
		LibMatrixBincell.bincellOp(this, s2 ? m3 : m2, ret, bop);
	}
	else {
		ret.allocateDenseBlock();
		
		//basic ternary operations (cell-wise with scalar broadcast)
		for( int i=0; i<m; i++ )
			for( int j=0; j<n; j++ ) {
				double in1 = s1 ? d1 : quickGetValue(i, j);
				double in2 = s2 ? d2 : m2.quickGetValue(i, j);
				double in3 = s3 ? d3 : m3.quickGetValue(i, j);
				ret.appendValue(i, j, op.fn.execute(in1, in2, in3));
			}
		
		//ensure correct output representation
		ret.examSparsity();
	}
	
	return ret;
}
| |
| @Override |
| public void incrementalAggregate(AggregateOperator aggOp, MatrixValue correction, MatrixValue newWithCorrection, boolean deep) { |
| //assert(aggOp.correctionExists); |
| MatrixBlock cor=checkType(correction); |
| MatrixBlock newWithCor=checkType(newWithCorrection); |
| KahanObject buffer=new KahanObject(0, 0); |
| |
| if(aggOp.correction==CorrectionLocationType.LASTROW) { |
| for(int r=0; r<rlen; r++) |
| for(int c=0; c<clen; c++) |
| { |
| buffer._sum=this.quickGetValue(r, c); |
| buffer._correction=cor.quickGetValue(0, c); |
| buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newWithCor.quickGetValue(r, c), |
| newWithCor.quickGetValue(r+1, c)); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(0, c, buffer._correction); |
| } |
| } |
| else if(aggOp.correction==CorrectionLocationType.LASTCOLUMN) { |
| if(aggOp.increOp.fn instanceof Builtin |
| && ( ((Builtin)(aggOp.increOp.fn)).bFunc == Builtin.BuiltinCode.MAXINDEX |
| || ((Builtin)(aggOp.increOp.fn)).bFunc == Builtin.BuiltinCode.MININDEX ) ) { |
| // *** HACK ALERT *** HACK ALERT *** HACK ALERT *** |
| // rowIndexMax() and its siblings don't fit very well into the standard |
| // aggregate framework. We (ab)use the "correction factor" argument to |
| // hold the maximum value in each row/column. |
| |
| // The execute() method for this aggregate takes as its argument |
| // two candidates for the highest value. Bookkeeping about |
| // indexes (return column/row index with highest value, breaking |
| // ties in favor of higher indexes) is handled in this function. |
| // Note that both versions of incrementalAggregate() contain |
| // very similar blocks of special-case code. If one block is |
| // modified, the other needs to be changed to match. |
| for(int r=0; r<rlen; r++){ |
| double currMaxValue = cor.quickGetValue(r, 0); |
| long newMaxIndex = (long)newWithCor.quickGetValue(r, 0); |
| double newMaxValue = newWithCor.quickGetValue(r, 1); |
| double update = aggOp.increOp.fn.execute(newMaxValue, currMaxValue); |
| |
| if (2.0 == update) { |
| // Return value of 2 ==> both values the same, break ties |
| // in favor of higher index. |
| long curMaxIndex = (long) quickGetValue(r,0); |
| quickSetValue(r, 0, Math.max(curMaxIndex, newMaxIndex)); |
| } else if(1.0 == update){ |
| // Return value of 1 ==> new value is better; use its index |
| quickSetValue(r, 0, newMaxIndex); |
| cor.quickSetValue(r, 0, newMaxValue); |
| } else { |
| // Other return value ==> current answer is best |
| } |
| } |
| // *** END HACK *** |
| }else{ |
| for(int r=0; r<rlen; r++) |
| for(int c=0; c<clen; c++) |
| { |
| buffer._sum=this.quickGetValue(r, c); |
| buffer._correction=cor.quickGetValue(r, 0); |
| buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newWithCor.quickGetValue(r, c), newWithCor.quickGetValue(r, c+1)); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(r, 0, buffer._correction); |
| } |
| } |
| } |
| else if(aggOp.correction==CorrectionLocationType.NONE) { |
| //e.g., ak+ kahan plus as used in sum, mapmult, mmcj and tsmm |
| if(aggOp.increOp.fn instanceof KahanPlus) { |
| LibMatrixAgg.aggregateBinaryMatrix(newWithCor, this, cor, deep); |
| } |
| else |
| { |
| if( newWithCor.isInSparseFormat() && aggOp.sparseSafe ) //SPARSE |
| { |
| SparseBlock b = newWithCor.getSparseBlock(); |
| if( b==null ) //early abort on empty block |
| return; |
| for( int r=0; r<Math.min(rlen, b.numRows()); r++ ) |
| { |
| if( !b.isEmpty(r) ) |
| { |
| int bpos = b.pos(r); |
| int blen = b.size(r); |
| int[] bix = b.indexes(r); |
| double[] bvals = b.values(r); |
| for( int j=bpos; j<bpos+blen; j++) |
| { |
| int c = bix[j]; |
| buffer._sum = this.quickGetValue(r, c); |
| buffer._correction = cor.quickGetValue(r, c); |
| buffer = (KahanObject) aggOp.increOp.fn.execute(buffer, bvals[j]); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(r, c, buffer._correction); |
| } |
| } |
| } |
| |
| } |
| else //DENSE or SPARSE (!sparsesafe) |
| { |
| for(int r=0; r<rlen; r++) |
| for(int c=0; c<clen; c++) { |
| buffer._sum=this.quickGetValue(r, c); |
| buffer._correction=cor.quickGetValue(r, c); |
| buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newWithCor.quickGetValue(r, c)); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(r, c, buffer._correction); |
| } |
| } |
| |
| //change representation if required |
| //(note since ak+ on blocks is currently only applied in MR, hence no need to account for this in mem estimates) |
| examSparsity(); |
| } |
| } |
| else if(aggOp.correction==CorrectionLocationType.LASTTWOROWS) { |
| double n, n2, mu2; |
| for(int r=0; r<rlen; r++) |
| for(int c=0; c<clen; c++) |
| { |
| buffer._sum=this.quickGetValue(r, c); |
| n=cor.quickGetValue(0, c); |
| buffer._correction=cor.quickGetValue(1, c); |
| mu2=newWithCor.quickGetValue(r, c); |
| n2=newWithCor.quickGetValue(r+1, c); |
| n=n+n2; |
| double toadd=(mu2-buffer._sum)*n2/n; |
| buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, toadd); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(0, c, n); |
| cor.quickSetValue(1, c, buffer._correction); |
| } |
| } |
| else if(aggOp.correction==CorrectionLocationType.LASTTWOCOLUMNS) { |
| double n, n2, mu2; |
| for(int r=0; r<rlen; r++) |
| for(int c=0; c<clen; c++) |
| { |
| buffer._sum=this.quickGetValue(r, c); |
| n=cor.quickGetValue(r, 0); |
| buffer._correction=cor.quickGetValue(r, 1); |
| mu2=newWithCor.quickGetValue(r, c); |
| n2=newWithCor.quickGetValue(r, c+1); |
| n=n+n2; |
| double toadd=(mu2-buffer._sum)*n2/n; |
| buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, toadd); |
| quickSetValue(r, c, buffer._sum); |
| cor.quickSetValue(r, 0, n); |
| cor.quickSetValue(r, 1, buffer._correction); |
| } |
| } |
| else if (aggOp.correction == CorrectionLocationType.LASTFOURROWS |
| && aggOp.increOp.fn instanceof CM |
| && ((CM) aggOp.increOp.fn).getAggOpType() == AggregateOperationTypes.VARIANCE) { |
| // create buffers to store results |
| CM_COV_Object cbuff_curr = new CM_COV_Object(); |
| CM_COV_Object cbuff_part = new CM_COV_Object(); |
| |
| // perform incremental aggregation |
| for (int r=0; r<rlen; r++) |
| for (int c=0; c<clen; c++) { |
| // extract current values: { var | mean, count, m2 correction, mean correction } |
| // note: m2 = var * (n - 1) |
| cbuff_curr.w = cor.quickGetValue(1, c); // count |
| cbuff_curr.m2._sum = quickGetValue(r, c) * (cbuff_curr.w - 1); // m2 |
| cbuff_curr.mean._sum = cor.quickGetValue(0, c); // mean |
| cbuff_curr.m2._correction = cor.quickGetValue(2, c); |
| cbuff_curr.mean._correction = cor.quickGetValue(3, c); |
| |
| // extract partial values: { var | mean, count, m2 correction, mean correction } |
| // note: m2 = var * (n - 1) |
| cbuff_part.w = newWithCor.quickGetValue(r+2, c); // count |
| cbuff_part.m2._sum = newWithCor.quickGetValue(r, c) * (cbuff_part.w - 1); // m2 |
| cbuff_part.mean._sum = newWithCor.quickGetValue(r+1, c); // mean |
| cbuff_part.m2._correction = newWithCor.quickGetValue(r+3, c); |
| cbuff_part.mean._correction = newWithCor.quickGetValue(r+4, c); |
| |
| // calculate incremental aggregated variance |
| cbuff_curr = (CM_COV_Object) aggOp.increOp.fn.execute(cbuff_curr, cbuff_part); |
| |
| // store updated values: { var | mean, count, m2 correction, mean correction } |
| double var = cbuff_curr.getRequiredResult(AggregateOperationTypes.VARIANCE); |
| quickSetValue(r, c, var); |
| cor.quickSetValue(0, c, cbuff_curr.mean._sum); // mean |
| cor.quickSetValue(1, c, cbuff_curr.w); // count |
| cor.quickSetValue(2, c, cbuff_curr.m2._correction); |
| cor.quickSetValue(3, c, cbuff_curr.mean._correction); |
| } |
| } |
| else if (aggOp.correction == CorrectionLocationType.LASTFOURCOLUMNS |
| && aggOp.increOp.fn instanceof CM |
| && ((CM) aggOp.increOp.fn).getAggOpType() == AggregateOperationTypes.VARIANCE) { |
| // create buffers to store results |
| CM_COV_Object cbuff_curr = new CM_COV_Object(); |
| CM_COV_Object cbuff_part = new CM_COV_Object(); |
| |
| // perform incremental aggregation |
| for (int r=0; r<rlen; r++) |
| for (int c=0; c<clen; c++) { |
| // extract current values: { var | mean, count, m2 correction, mean correction } |
| // note: m2 = var * (n - 1) |
| cbuff_curr.w = cor.quickGetValue(r, 1); // count |
| cbuff_curr.m2._sum = quickGetValue(r, c) * (cbuff_curr.w - 1); // m2 |
| cbuff_curr.mean._sum = cor.quickGetValue(r, 0); // mean |
| cbuff_curr.m2._correction = cor.quickGetValue(r, 2); |
| cbuff_curr.mean._correction = cor.quickGetValue(r, 3); |
| |
| // extract partial values: { var | mean, count, m2 correction, mean correction } |
| // note: m2 = var * (n - 1) |
| cbuff_part.w = newWithCor.quickGetValue(r, c+2); // count |
| cbuff_part.m2._sum = newWithCor.quickGetValue(r, c) * (cbuff_part.w - 1); // m2 |
| cbuff_part.mean._sum = newWithCor.quickGetValue(r, c+1); // mean |
| cbuff_part.m2._correction = newWithCor.quickGetValue(r, c+3); |
| cbuff_part.mean._correction = newWithCor.quickGetValue(r, c+4); |
| |
| // calculate incremental aggregated variance |
| cbuff_curr = (CM_COV_Object) aggOp.increOp.fn.execute(cbuff_curr, cbuff_part); |
| |
| // store updated values: { var | mean, count, m2 correction, mean correction } |
| double var = cbuff_curr.getRequiredResult(AggregateOperationTypes.VARIANCE); |
| quickSetValue(r, c, var); |
| cor.quickSetValue(r, 0, cbuff_curr.mean._sum); // mean |
| cor.quickSetValue(r, 1, cbuff_curr.w); // count |
| cor.quickSetValue(r, 2, cbuff_curr.m2._correction); |
| cor.quickSetValue(r, 3, cbuff_curr.mean._correction); |
| } |
| } |
| else |
| throw new DMLRuntimeException("unrecognized correctionLocation: "+aggOp.correction); |
| } |
| |
	/**
	 * Incrementally aggregates a partial result block into this block, in place.
	 * The partial result carries its own correction terms embedded in the block;
	 * their location is given by {@code aggOp.correction}. Supported layouts:
	 * LASTROW / LASTCOLUMN (kahan-style correction), LASTTWOROWS / LASTTWOCOLUMNS
	 * (running count + correction, i.e., incremental mean), and LASTFOURROWS /
	 * LASTFOURCOLUMNS (incremental variance via CM).
	 *
	 * @param aggOp aggregate operator (incremental function and correction location)
	 * @param newWithCorrection partial aggregate including its correction rows/columns
	 */
	@Override
	public void incrementalAggregate(AggregateOperator aggOp, MatrixValue newWithCorrection) {
		//assert(aggOp.correctionExists);
		MatrixBlock newWithCor=checkType(newWithCorrection);
		KahanObject buffer=new KahanObject(0, 0);
		
		if(aggOp.correction==CorrectionLocationType.LASTROW)
		{
			if( aggOp.increOp.fn instanceof KahanPlus )
			{
				//fast path: specialized kahan+ aggregation kernel
				LibMatrixAgg.aggregateBinaryMatrix(newWithCor, this, aggOp);
			}
			else
			{
				//generic path: row r+1 holds the correction for row r
				for(int r=0; r<rlen-1; r++)
					for(int c=0; c<clen; c++)
					{
						buffer._sum=this.quickGetValue(r, c);
						buffer._correction=this.quickGetValue(r+1, c);
						buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newWithCor.quickGetValue(r, c),
								newWithCor.quickGetValue(r+1, c));
						quickSetValue(r, c, buffer._sum);
						quickSetValue(r+1, c, buffer._correction);
					}
			}
		}
		else if(aggOp.correction==CorrectionLocationType.LASTCOLUMN)
		{
			if(aggOp.increOp.fn instanceof Builtin
				&& ( ((Builtin)(aggOp.increOp.fn)).bFunc == Builtin.BuiltinCode.MAXINDEX
					|| ((Builtin)(aggOp.increOp.fn)).bFunc == Builtin.BuiltinCode.MININDEX)
				){
				// *** HACK ALERT *** HACK ALERT *** HACK ALERT ***
				// rowIndexMax() and its siblings don't fit very well into the standard
				// aggregate framework. We (ab)use the "correction factor" argument to
				// hold the maximum value in each row/column.
				
				// The execute() method for this aggregate takes as its argument
				// two candidates for the highest value. Bookkeeping about
				// indexes (return column/row index with highest value, breaking
				// ties in favor of higher indexes) is handled in this function.
				// Note that both versions of incrementalAggregate() contain
				// very similar blocks of special-case code. If one block is
				// modified, the other needs to be changed to match.
				for(int r = 0; r < rlen; r++){
					//column 0 holds the winning index, column 1 the winning value
					double currMaxValue = quickGetValue(r, 1);
					long newMaxIndex = (long)newWithCor.quickGetValue(r, 0);
					double newMaxValue = newWithCor.quickGetValue(r, 1);
					double update = aggOp.increOp.fn.execute(newMaxValue, currMaxValue);
					
					if (2.0 == update) {
						// Return value of 2 ==> both values the same, break ties
						// in favor of higher index.
						long curMaxIndex = (long) quickGetValue(r,0);
						quickSetValue(r, 0, Math.max(curMaxIndex, newMaxIndex));
					} else if(1.0 == update){
						// Return value of 1 ==> new value is better; use its index
						quickSetValue(r, 0, newMaxIndex);
						quickSetValue(r, 1, newMaxValue);
					} else {
						// Other return value ==> current answer is best
					}
				}
				// *** END HACK ***
			}
			else
			{
				if(aggOp.increOp.fn instanceof KahanPlus) {
					//fast path: specialized kahan+ aggregation kernel
					LibMatrixAgg.aggregateBinaryMatrix(newWithCor, this, aggOp);
				}
				else {
					//generic path: column c+1 holds the correction for column c
					for(int r=0; r<rlen; r++)
						for(int c=0; c<clen-1; c++)
						{
							buffer._sum=this.quickGetValue(r, c);
							buffer._correction=this.quickGetValue(r, c+1);
							buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newWithCor.quickGetValue(r, c), newWithCor.quickGetValue(r, c+1));
							quickSetValue(r, c, buffer._sum);
							quickSetValue(r, c+1, buffer._correction);
						}
				}
			}
		}
		else if(aggOp.correction==CorrectionLocationType.LASTTWOROWS)
		{
			//incremental mean: row r+1 holds the running count n,
			//row r+2 the kahan correction; merge via delta update
			double n, n2, mu2;
			for(int r=0; r<rlen-2; r++)
				for(int c=0; c<clen; c++)
				{
					buffer._sum=this.quickGetValue(r, c);
					n=this.quickGetValue(r+1, c);
					buffer._correction=this.quickGetValue(r+2, c);
					mu2=newWithCor.quickGetValue(r, c);
					n2=newWithCor.quickGetValue(r+1, c);
					n=n+n2;
					double toadd=(mu2-buffer._sum)*n2/n;
					buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, toadd);
					quickSetValue(r, c, buffer._sum);
					quickSetValue(r+1, c, n);
					quickSetValue(r+2, c, buffer._correction);
				}
			
		}else if(aggOp.correction==CorrectionLocationType.LASTTWOCOLUMNS)
		{
			//incremental mean: column c+1 holds the running count n,
			//column c+2 the kahan correction; merge via delta update
			double n, n2, mu2;
			for(int r=0; r<rlen; r++)
				for(int c=0; c<clen-2; c++)
				{
					buffer._sum=this.quickGetValue(r, c);
					n=this.quickGetValue(r, c+1);
					buffer._correction=this.quickGetValue(r, c+2);
					mu2=newWithCor.quickGetValue(r, c);
					n2=newWithCor.quickGetValue(r, c+1);
					n=n+n2;
					double toadd=(mu2-buffer._sum)*n2/n;
					buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, toadd);
					quickSetValue(r, c, buffer._sum);
					quickSetValue(r, c+1, n);
					quickSetValue(r, c+2, buffer._correction);
				}
		}
		else if (aggOp.correction == CorrectionLocationType.LASTFOURROWS
			&& aggOp.increOp.fn instanceof CM
			&& ((CM) aggOp.increOp.fn).getAggOpType() == AggregateOperationTypes.VARIANCE) {
			// create buffers to store results
			CM_COV_Object cbuff_curr = new CM_COV_Object();
			CM_COV_Object cbuff_part = new CM_COV_Object();
			
			// perform incremental aggregation
			for (int r=0; r<rlen-4; r++)
				for (int c=0; c<clen; c++) {
					// extract current values: { var | mean, count, m2 correction, mean correction }
					// note: m2 = var * (n - 1)
					cbuff_curr.w = quickGetValue(r+2, c); // count
					cbuff_curr.m2._sum = quickGetValue(r, c) * (cbuff_curr.w - 1); // m2
					cbuff_curr.mean._sum = quickGetValue(r+1, c); // mean
					cbuff_curr.m2._correction = quickGetValue(r+3, c);
					cbuff_curr.mean._correction = quickGetValue(r+4, c);
					
					// extract partial values: { var | mean, count, m2 correction, mean correction }
					// note: m2 = var * (n - 1)
					cbuff_part.w = newWithCor.quickGetValue(r+2, c); // count
					cbuff_part.m2._sum = newWithCor.quickGetValue(r, c) * (cbuff_part.w - 1); // m2
					cbuff_part.mean._sum = newWithCor.quickGetValue(r+1, c); // mean
					cbuff_part.m2._correction = newWithCor.quickGetValue(r+3, c);
					cbuff_part.mean._correction = newWithCor.quickGetValue(r+4, c);
					
					// calculate incremental aggregated variance
					cbuff_curr = (CM_COV_Object) aggOp.increOp.fn.execute(cbuff_curr, cbuff_part);
					
					// store updated values: { var | mean, count, m2 correction, mean correction }
					double var = cbuff_curr.getRequiredResult(AggregateOperationTypes.VARIANCE);
					quickSetValue(r, c, var);
					quickSetValue(r+1, c, cbuff_curr.mean._sum); // mean
					quickSetValue(r+2, c, cbuff_curr.w); // count
					quickSetValue(r+3, c, cbuff_curr.m2._correction);
					quickSetValue(r+4, c, cbuff_curr.mean._correction);
				}
		}
		else if (aggOp.correction == CorrectionLocationType.LASTFOURCOLUMNS
			&& aggOp.increOp.fn instanceof CM
			&& ((CM) aggOp.increOp.fn).getAggOpType() == AggregateOperationTypes.VARIANCE) {
			// create buffers to store results
			CM_COV_Object cbuff_curr = new CM_COV_Object();
			CM_COV_Object cbuff_part = new CM_COV_Object();
			
			// perform incremental aggregation
			for (int r=0; r<rlen; r++)
				for (int c=0; c<clen-4; c++) {
					// extract current values: { var | mean, count, m2 correction, mean correction }
					// note: m2 = var * (n - 1)
					cbuff_curr.w = quickGetValue(r, c+2); // count
					cbuff_curr.m2._sum = quickGetValue(r, c) * (cbuff_curr.w - 1); // m2
					cbuff_curr.mean._sum = quickGetValue(r, c+1); // mean
					cbuff_curr.m2._correction = quickGetValue(r, c+3);
					cbuff_curr.mean._correction = quickGetValue(r, c+4);
					
					// extract partial values: { var | mean, count, m2 correction, mean correction }
					// note: m2 = var * (n - 1)
					cbuff_part.w = newWithCor.quickGetValue(r, c+2); // count
					cbuff_part.m2._sum = newWithCor.quickGetValue(r, c) * (cbuff_part.w - 1); // m2
					cbuff_part.mean._sum = newWithCor.quickGetValue(r, c+1); // mean
					cbuff_part.m2._correction = newWithCor.quickGetValue(r, c+3);
					cbuff_part.mean._correction = newWithCor.quickGetValue(r, c+4);
					
					// calculate incremental aggregated variance
					cbuff_curr = (CM_COV_Object) aggOp.increOp.fn.execute(cbuff_curr, cbuff_part);
					
					// store updated values: { var | mean, count, m2 correction, mean correction }
					double var = cbuff_curr.getRequiredResult(AggregateOperationTypes.VARIANCE);
					quickSetValue(r, c, var);
					quickSetValue(r, c+1, cbuff_curr.mean._sum); // mean
					quickSetValue(r, c+2, cbuff_curr.w); // count
					quickSetValue(r, c+3, cbuff_curr.m2._correction);
					quickSetValue(r, c+4, cbuff_curr.mean._correction);
				}
		}
		else
			throw new DMLRuntimeException("unrecognized correctionLocation: "+aggOp.correction);
	}
| |
	/**
	 * Performs a reorg operation (transpose, diag, sort, or reverse) of this
	 * matrix into the given output block.
	 *
	 * @param op reorg operator; its fn must be SwapIndex, DiagIndex, SortIndex, or RevIndex
	 * @param ret pre-allocated output block, or null
	 * @param startRow unused by this implementation
	 * @param startColumn unused by this implementation
	 * @param length unused by this implementation
	 * @return output matrix block
	 */
	@Override
	public MatrixBlock reorgOperations(ReorgOperator op, MatrixValue ret, int startRow, int startColumn, int length)
	{
		if ( !( op.fn instanceof SwapIndex || op.fn instanceof DiagIndex 
			|| op.fn instanceof SortIndex || op.fn instanceof RevIndex ) )
			throw new DMLRuntimeException("the current reorgOperations cannot support: "+op.fn.getClass()+".");
		
		MatrixBlock result = checkType(ret);

		//compute output dimensions and sparsity, note that for diagM2V, 
		//the input nnz might be much larger than the nnz of the output
		CellIndex tempCellIndex = new CellIndex(-1,-1);
		op.fn.computeDimension( rlen, clen, tempCellIndex );
		long ennz = Math.min(nonZeros, (long)tempCellIndex.row*tempCellIndex.column);
		boolean sps = evalSparseFormatInMemory(tempCellIndex.row, tempCellIndex.column, ennz);

		//prepare output matrix block w/ right meta data
		if( result == null )
			result = new MatrixBlock(tempCellIndex.row, tempCellIndex.column, sps, nonZeros);
		else
			result.reset(tempCellIndex.row, tempCellIndex.column, sps, nonZeros);
		
		if( LibMatrixReorg.isSupportedReorgOperator(op) )
		{
			//SPECIAL case (operators with special performance requirements, 
			//or size-dependent special behavior)
			//currently supported opcodes: r', rdiag, rsort, rev
			LibMatrixReorg.reorg(this, result, op);
		}
		else 
		{
			//GENERIC case (any reorg operator)
			//maps each input cell index through op.fn and appends/sets the value
			CellIndex temp = new CellIndex(0, 0);
			if(sparse && sparseBlock != null) {
				for(int r=0; r<Math.min(rlen, sparseBlock.numRows()); r++) {
					if(sparseBlock.isEmpty(r)) continue;
					int apos = sparseBlock.pos(r);
					int alen = sparseBlock.size(r);
					int[] aix = sparseBlock.indexes(r);
					double[] avals = sparseBlock.values(r);
					for(int i=apos; i<apos+alen; i++) {
						tempCellIndex.set(r, aix[i]);
						op.fn.execute(tempCellIndex, temp);
						result.appendValue(temp.row, temp.column, avals[i]);
					}
				}
			}
			else if( !sparse && denseBlock != null ) {
				if( result.isInSparseFormat() ) { //SPARSE<-DENSE
					DenseBlock a = getDenseBlock();
					for( int i=0; i<rlen; i++ ) {
						double[] avals = a.values(i);
						int aix = a.pos(i);
						for( int j=0; j<clen; j++ ) {
							temp.set(i, j);
							op.fn.execute(temp, temp);
							result.appendValue(temp.row, temp.column, avals[aix+j]);
						}
					}
				}
				else { //DENSE<-DENSE
					result.allocateDenseBlock();
					DenseBlock a = getDenseBlock();
					DenseBlock c = result.getDenseBlock();
					for( int i=0; i<rlen; i++ ) {
						double[] avals = a.values(i);
						int aix = a.pos(i);
						for( int j=0; j<clen; j++ ) {
							temp.set(i, j);
							op.fn.execute(temp, temp);
							c.set(temp.row, temp.column, avals[aix+j]);
						}
					}
					//reorg does not change the number of non-zeros
					result.nonZeros = nonZeros;
				}
			}
		}
		
		return result;
	}
| |
| public MatrixBlock append( MatrixBlock that, MatrixBlock ret ) { |
| return append(that, ret, true); //default cbind |
| } |
| |
| public MatrixBlock append( MatrixBlock that, MatrixBlock ret, boolean cbind ) { |
| return append(new MatrixBlock[]{that}, ret, cbind); |
| } |
| |
	/**
	 * Appends the given matrices (in order) to this matrix either column-wise
	 * (cbind) or row-wise (rbind) and returns the combined result.
	 *
	 * @param that matrices to append
	 * @param ret pre-allocated output block, or null
	 * @param cbind true for column-wise append, false for row-wise
	 * @return the combined output matrix block
	 */
	public MatrixBlock append( MatrixBlock[] that, MatrixBlock ret, boolean cbind ) {
		MatrixBlock result = checkType( ret );
		//output dims: sum of columns (cbind) or rows (rbind) over all inputs
		final int m = cbind ? rlen : rlen+Arrays.stream(that).mapToInt(mb -> mb.rlen).sum();
		final int n = cbind ? clen+Arrays.stream(that).mapToInt(mb -> mb.clen).sum() : clen;
		final long nnz = nonZeros+Arrays.stream(that).mapToLong(mb -> mb.nonZeros).sum();
		//shallow copy is only safe if all appended inputs are empty (nnz unchanged)
		boolean shallowCopy = (nonZeros == nnz);
		boolean sp = evalSparseFormatInMemory(m, n, nnz);
		
		//init result matrix 
		if( result == null ) 
			result = new MatrixBlock(m, n, sp, nnz);
		else
			result.reset(m, n, sp, nnz);
		
		//core append operation
		//copy left and right input into output
		if( !result.sparse && nnz!=0 ) //DENSE
		{
			if( cbind ) {
				DenseBlock resd = result.allocateBlock().getDenseBlock();
				//process this block first, then all appended blocks, with a
				//running column offset per input
				MatrixBlock[] in = ArrayUtils.addAll(new MatrixBlock[]{this}, that);
				
				for( int i=0; i<m; i++ ) {
					for( int k=0, off=0; k<in.length; off+=in[k].clen, k++ ) {
						if( in[k].isEmptyBlock(false) )
							continue;
						if( in[k].sparse ) {
							SparseBlock src = in[k].sparseBlock;
							if( src.isEmpty(i) )
								continue;
							int srcpos = src.pos(i);
							int srclen = src.size(i);
							int[] srcix = src.indexes(i);
							double[] srcval = src.values(i);
							double[] resval = resd.values(i);
							int resix = resd.pos(i, off);
							for (int j=srcpos; j<srcpos+srclen; j++)
								resval[resix+srcix[j]] = srcval[j];
						}
						else {
							DenseBlock src = in[k].getDenseBlock();
							double[] srcval = src.values(i);
							double[] resval = resd.values(i);
							System.arraycopy(srcval, src.pos(i),
								resval, resd.pos(i, off), in[k].clen);
						}
					}
				}
			}
			else { //rbind
				//stack input blocks with a running row offset
				result.copy(0, rlen-1, 0, n-1, this, false);
				for(int i=0, off=rlen; i<that.length; i++) {
					result.copy(off, off+that[i].rlen-1, 0, n-1, that[i], false);
					off += that[i].rlen;
				}
			}
		}
		else if(nnz != 0) //SPARSE
		{
			//adjust sparse rows if required
			result.allocateSparseRowsBlock();
			//allocate sparse rows once for cbind
			if( cbind && nnz > rlen && !shallowCopy && result.getSparseBlock() instanceof SparseBlockMCSR ) {
				SparseBlock sblock = result.getSparseBlock();
				for( int i=0; i<result.rlen; i++ ) {
					final int row = i; //workaround for lambda compile issue
					int lnnz = (int) (this.recomputeNonZeros(i, i, 0, this.clen-1) + Arrays.stream(that)
						.mapToLong(mb -> mb.recomputeNonZeros(row, row, 0, mb.clen-1)).sum());
					sblock.allocate(i, lnnz);
				}
			}
			
			//core append operation
			result.appendToSparse(this, 0, 0, !shallowCopy);
			if( cbind ) {
				for(int i=0, off=clen; i<that.length; i++) {
					result.appendToSparse(that[i], 0, off);
					off += that[i].clen;
				}
			}
			else { //rbind
				for(int i=0, off=rlen; i<that.length; i++) {
					result.appendToSparse(that[i], off, 0);
					off += that[i].rlen;
				}
			}
		}
		
		//update meta data
		result.nonZeros = nnz;
		return result;
	}
| |
	/**
	 * Computes an n-ary cell-wise operation over a list of matrices and scalars.
	 * Currently only min, max, and plus are supported (specialized implementation).
	 * Scalars are folded into the initial value upfront; with no matrix inputs,
	 * a 1x1 result holding that value is returned.
	 *
	 * @param op simple operator whose fn is Plus or a Builtin min/max
	 * @param matrices input matrices (may be empty)
	 * @param scalars scalar inputs, folded into the initial value
	 * @param ret pre-allocated output block, or null
	 * @return output matrix block
	 */
	public static MatrixBlock naryOperations(Operator op, MatrixBlock[] matrices, ScalarObject[] scalars, MatrixBlock ret) {
		//note: currently only min max, plus supported and hence specialized implementation
		//prepare operator
		FunctionObject fn = ((SimpleOperator)op).fn;
		boolean plus = fn instanceof Plus;
		//bfn is null for plus; all later uses are guarded by !plus (short-circuit)
		Builtin bfn = !plus ? (Builtin)((SimpleOperator)op).fn : null;
		
		//process all scalars
		//neutral element: 0 for plus, +/-Infinity for min/max
		double init = plus ? 0 :(bfn.getBuiltinCode() == BuiltinCode.MIN) ?
			Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
		for( ScalarObject so : scalars )
			init = fn.execute(init, so.getDoubleValue());
		
		//compute output dimensions and estimate sparsity
		final int m = matrices.length > 0 ? matrices[0].rlen : 1;
		final int n = matrices.length > 0 ? matrices[0].clen : 1;
		final long mn = (long) m * n;
		//min with negative init (or max with positive init) makes every cell non-zero
		final long nnz = (!plus && bfn.getBuiltinCode()==BuiltinCode.MIN && init < 0)
			|| (!plus && bfn.getBuiltinCode()==BuiltinCode.MAX && init > 0) ? mn :
			Math.min(Arrays.stream(matrices).mapToLong(mb -> mb.nonZeros).sum(), mn);
		boolean sp = evalSparseFormatInMemory(m, n, nnz);
		
		//init result matrix
		if( ret == null )
			ret = new MatrixBlock(m, n, sp, nnz);
		else
			ret.reset(m, n, sp, nnz);
		
		//main processing
		if( matrices.length > 0 ) {
			ret.allocateBlock();
			//per-column count of seen values for sparse-safe min/max corrections;
			//only allocated if all inputs are sparse or empty
			int[] cnt = !plus && Arrays.stream(matrices).allMatch(
				mb -> mb.sparse || mb.isEmpty()) ? new int[n] : null;
			if( ret.isInSparseFormat() ) {
				double[] tmp = new double[n];
				for(int i = 0; i < m; i++) {
					//reset tmp and compute row output
					Arrays.fill(tmp, init);
					if( plus )
						processAddRow(matrices, tmp, 0, n, i);
					else
						processMinMaxRow(bfn, matrices, tmp, 0, n, i, cnt);
					//copy to sparse output
					for(int j = 0; j < n; j++)
						if( tmp[j] != 0 )
							ret.appendValue(i, j, tmp[j]);
				}
			}
			else {
				DenseBlock c = ret.getDenseBlock();
				long lnnz = 0;
				for(int i = 0; i < m; i++) {
					if( init != 0 )
						Arrays.fill(c.values(i), c.pos(i), c.pos(i)+n, init);
					if( plus )
						processAddRow(matrices, c.values(i), c.pos(i), n, i);
					else
						processMinMaxRow(bfn, matrices, c.values(i), c.pos(i), n, i, cnt);
					lnnz += UtilFunctions.countNonZeros(c.values(i), c.pos(i), n);
				}
				ret.setNonZeros(lnnz);
			}
		}
		else {
			//no matrix inputs: scalar-only result
			ret.quickSetValue(0, 0, init);
		}
		
		return ret;
	}
| |
| private static void processAddRow(MatrixBlock[] inputs, double[] c, int cix, int n, int i) { |
| //sparse-safe update over all input matrices |
| for( MatrixBlock in : inputs ) { |
| if( in.isEmptyBlock(false) ) |
| continue; |
| if( in.isInSparseFormat() ) { |
| SparseBlock a = in.getSparseBlock(); |
| if( a.isEmpty(i) ) continue; |
| LibMatrixMult.vectAdd(a.values(i), c, a.indexes(i), a.pos(i), cix, a.size(i)); |
| } |
| else { |
| DenseBlock a = in.getDenseBlock(); |
| LibMatrixMult.vectAdd(in.getDenseBlock().values(i), c, a.pos(i), cix, n); |
| } |
| } |
| } |
| |
| private static void processMinMaxRow(Builtin fn, MatrixBlock[] inputs, double[] c, int cix, int n, int i, int[] cnt) { |
| if( cnt != null ) |
| Arrays.fill(cnt, 0); |
| |
| //sparse-safe update over all input matrices |
| for( MatrixBlock in : inputs ) { |
| if( in.isEmptyBlock(false) ) |
| continue; |
| if( in.isInSparseFormat() ) { |
| SparseBlock a = in.sparseBlock; |
| if( a.isEmpty(i) ) continue; |
| int alen = a.size(i); |
| int apos = a.pos(i); |
| int[] aix = a.indexes(i); |
| double[] avals = a.values(i); |
| for( int k=apos; k<apos+alen; k++ ) { |
| c[aix[k]] = fn.execute(c[aix[k]], avals[k]); |
| if( cnt != null ) //maintain seen values |
| cnt[aix[k]]++; |
| } |
| } |
| else { |
| double[] avals = in.getDenseBlock().values(i); |
| int aix = in.getDenseBlock().pos(i); |
| for( int j=0; j<n; j++ ) |
| c[cix+j] = fn.execute(c[cix+j], avals[aix+j]); |
| } |
| } |
| |
| //corrections for all sparse inputs |
| if( Arrays.stream(inputs).allMatch(m -> m.sparse || m.isEmpty()) ) { |
| for( int j=0; j<n; j++ ) { |
| if( cnt[j]!=inputs.length ) |
| c[cix+j] = fn.execute(c[cix+j], 0); |
| } |
| } |
| } |
| |
| public MatrixBlock transposeSelfMatrixMultOperations( MatrixBlock out, MMTSJType tstype ) { |
| return transposeSelfMatrixMultOperations(out, tstype, 1); |
| } |
| |
| public MatrixBlock transposeSelfMatrixMultOperations( MatrixBlock out, MMTSJType tstype, int k ) { |
| //check for transpose type |
| if( !(tstype == MMTSJType.LEFT || tstype == MMTSJType.RIGHT) ) |
| throw new DMLRuntimeException("Invalid MMTSJ type '"+tstype.toString()+"'."); |
| |
| //setup meta data |
| boolean leftTranspose = ( tstype == MMTSJType.LEFT ); |
| int dim = leftTranspose ? clen : rlen; |
| |
| //create output matrix block |
| if( out == null ) |
| out = new MatrixBlock(dim, dim, false); |
| else |
| out.reset(dim, dim, false); |
| |
| //pre=processing (outside LibMatrixMult for seamless integration |
| //with native BLAS library, e.g., for sparse-dense conversion) |
| MatrixBlock m1 = LibMatrixMult |
| .prepMatrixMultTransposeSelfInput(this, leftTranspose, k > 1); |
| |
| //compute matrix mult |
| if( NativeHelper.isNativeLibraryLoaded() ) |
| LibMatrixNative.tsmm(m1, out, leftTranspose, k); |
| else if( k > 1 ) |
| LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose, k); |
| else |
| LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose); |
| |
| return out; |
| } |
| |
| public MatrixBlock chainMatrixMultOperations( MatrixBlock v, MatrixBlock w, MatrixBlock out, ChainType ctype ) { |
| return chainMatrixMultOperations(v, w, out, ctype, 1); |
| } |
| |
| public MatrixBlock chainMatrixMultOperations( MatrixBlock v, MatrixBlock w, MatrixBlock out, ChainType ctype, int k ) { |
| //check for transpose type |
| if( !(ctype == ChainType.XtXv || ctype == ChainType.XtwXv || ctype == ChainType.XtXvy) ) |
| throw new DMLRuntimeException("Invalid mmchain type '"+ctype.toString()+"'."); |
| |
| //check for matching dimensions |
| if( this.getNumColumns() != v.getNumRows() ) |
| throw new DMLRuntimeException("Dimensions mismatch on mmchain operation ("+this.getNumColumns()+" != "+v.getNumRows()+")"); |
| if( v.getNumColumns() != 1 ) |
| throw new DMLRuntimeException("Invalid input vector (column vector expected, but ncol="+v.getNumColumns()+")"); |
| if( w!=null && w.getNumColumns() != 1 ) |
| throw new DMLRuntimeException("Invalid weight vector (column vector expected, but ncol="+w.getNumColumns()+")"); |
| |
| //prepare result |
| if( out != null ) |
| out.reset(clen, 1, false); |
| else |
| out = new MatrixBlock(clen, 1, false); |
| |
| //compute matrix mult |
| if( k > 1 ) |
| LibMatrixMult.matrixMultChain(this, v, w, out, ctype, k); |
| else |
| LibMatrixMult.matrixMultChain(this, v, w, out, ctype); |
| |
| return out; |
| } |
| |
| public void permutationMatrixMultOperations( MatrixValue m2Val, MatrixValue out1Val, MatrixValue out2Val ) { |
| permutationMatrixMultOperations(m2Val, out1Val, out2Val, 1); |
| } |
| |
| public void permutationMatrixMultOperations( MatrixValue m2Val, MatrixValue out1Val, MatrixValue out2Val, int k ) { |
| //check input types and dimensions |
| MatrixBlock m2 = checkType(m2Val); |
| MatrixBlock ret1 = checkType(out1Val); |
| MatrixBlock ret2 = checkType(out2Val); |
| |
| if(this.rlen!=m2.rlen) |
| throw new RuntimeException("Dimensions do not match for permutation matrix multiplication ("+this.rlen+"!="+m2.rlen+")."); |
| |
| //compute permutation matrix multiplication |
| if (k > 1) |
| LibMatrixMult.matrixMultPermute(this, m2, ret1, ret2, k); |
| else |
| LibMatrixMult.matrixMultPermute(this, m2, ret1, ret2); |
| } |
| |
| public final MatrixBlock leftIndexingOperations(MatrixBlock rhsMatrix, |
| IndexRange ixrange, MatrixBlock ret, UpdateType update) { |
| return leftIndexingOperations(rhsMatrix, (int)ixrange.rowStart, |
| (int)ixrange.rowEnd, (int)ixrange.colStart, (int)ixrange.colEnd, ret, update); |
| } |
| |
	/**
	 * Left-indexing: copies rhsMatrix into the region [rl:ru, cl:cu] of
	 * (a copy of) this matrix. Note: 0-based, inclusive bounds; the dimensions
	 * of rhsMatrix must exactly match the target region.
	 *
	 * @param rhsMatrix source matrix
	 * @param rl row lower (inclusive)
	 * @param ru row upper (inclusive)
	 * @param cl column lower (inclusive)
	 * @param cu column upper (inclusive)
	 * @param ret pre-allocated output block, or null (ignored for in-place updates)
	 * @param update update type; for in-place updates this block is reused as output
	 * @return output matrix block
	 */
	public MatrixBlock leftIndexingOperations(MatrixBlock rhsMatrix,
			int rl, int ru, int cl, int cu, MatrixBlock ret, UpdateType update) {
		// Check the validity of bounds
		if( rl < 0 || rl >= getNumRows() || ru < rl || ru >= getNumRows()
			|| cl < 0 || cl >= getNumColumns() || cu < cl || cu >= getNumColumns() ) {
			throw new DMLRuntimeException("Invalid values for matrix indexing: ["+(rl+1)+":"+(ru+1)+","
				+ (cl+1)+":"+(cu+1)+"] " + "must be within matrix dimensions ["+getNumRows()+","+getNumColumns()+"].");
		}
		if( (ru-rl+1) != rhsMatrix.getNumRows() || (cu-cl+1) != rhsMatrix.getNumColumns() ) {
			throw new DMLRuntimeException("Invalid values for matrix indexing: " +
				"dimensions of the source matrix ["+rhsMatrix.getNumRows()+"x" + rhsMatrix.getNumColumns() + "] " +
				"do not match the shape of the matrix specified by indices [" +
				(rl+1) +":" + (ru+1) + ", " + (cl+1) + ":" + (cu+1) + "] (i.e., ["+(ru-rl+1)+"x"+(cu-cl+1)+"]).");
		}
		
		MatrixBlock result = ret;
		//estimated sparsity of the output after the copy
		boolean sp = estimateSparsityOnLeftIndexing(rlen, clen, nonZeros,
			rhsMatrix.getNumRows(), rhsMatrix.getNumColumns(), rhsMatrix.getNonZeros());
		
		if( !update.isInPlace() ) { //general case
			//full copy of this block into the (new or reset) result
			if(result==null)
				result=new MatrixBlock(rlen, clen, sp);
			else
				result.reset(rlen, clen, sp);
			result.copy(this, sp);
		}
		else { //update in-place
			//use current block as in-place result
			result = this;
			//ensure that the current block adheres to the sparsity estimate
			//and thus implicitly the memory budget used by the compiler
			if( result.sparse && !sp )
				result.sparseToDense();
			else if( !result.sparse && sp )
				result.denseToSparse();
			
			//ensure right sparse block representation to prevent serialization
			if( requiresInplaceSparseBlockOnLeftIndexing(result.sparse, update, result.nonZeros+rhsMatrix.nonZeros) )
				result.sparseBlock = SparseBlockFactory.copySparseBlock(
					DEFAULT_INPLACE_SPARSEBLOCK, result.sparseBlock, false);
		}
		
		//NOTE conceptually we could directly use a zeroout and copy(..., false) but
		//     since this was factors slower, we still use a full copy and subsequently
		//     copy(..., true) - however, this can be changed in the future once we 
		//     improved the performance of zeroout.
		
		MatrixBlock src = rhsMatrix;
		
		if(rl==ru && cl==cu) { //specific case: cell update
			//copy single value and update nnz
			result.quickSetValue(rl, cl, src.quickGetValue(0, 0));
		}
		else { //general case
			//handle csr sparse blocks separately to avoid repeated shifting on column-wise access
			//(note that for sparse inputs this only applies to aligned column indexes)
			if( !result.isEmptyBlock(false) && result.sparse && (!src.sparse || rl==0)
				&& result.sparseBlock instanceof SparseBlockCSR ) {
				SparseBlockCSR sblock = (SparseBlockCSR) result.sparseBlock;
				if( src.sparse || src.isEmptyBlock(false) ) {
					sblock.setIndexRange(rl, ru+1, cl, cu+1, src.getSparseBlock());
				}
				else { //dense
					for(int bi=0; bi<src.denseBlock.numBlocks(); bi++) {
						int rpos = bi * src.denseBlock.blockSize();
						int blen = src.denseBlock.blockSize(bi);
						sblock.setIndexRange(rl+rpos, rl+rpos+blen, cl, cu+1,
							src.denseBlock.valuesAt(bi), 0, src.rlen*src.clen);
					}
				}
				result.nonZeros = sblock.size();
			}
			//copy submatrix into result
			else {
				result.copy(rl, ru, cl, cu, src, true);
			}
		}
		
		return result;
	}
| |
| /** |
| * Explicitly allow left indexing for scalars. Note: This operation is now 0-based. |
| * |
| * * Operations to be performed: |
| * 1) result=this; |
| * 2) result[row,column] = scalar.getDoubleValue(); |
| * |
| * @param scalar scalar object |
| * @param rl row lower |
| * @param cl column lower |
| * @param ret ? |
| * @param update ? |
| * @return matrix block |
| */ |
| public MatrixBlock leftIndexingOperations(ScalarObject scalar, int rl, int cl, MatrixBlock ret, UpdateType update) { |
| double inVal = scalar.getDoubleValue(); |
| boolean sp = estimateSparsityOnLeftIndexing(rlen, clen, nonZeros, 1, 1, (inVal!=0)?1:0); |
| |
| if( !update.isInPlace() ) { //general case |
| if(ret==null) |
| ret=new MatrixBlock(rlen, clen, sp); |
| else |
| ret.reset(rlen, clen, sp); |
| ret.copy(this, sp); |
| } |
| else { //update in-place |
| //use current block as in-place result |
| ret = this; |
| //ensure right sparse block representation to prevent serialization |
| if( requiresInplaceSparseBlockOnLeftIndexing(ret.sparse, update, ret.nonZeros+1) ) |
| ret.sparseBlock = SparseBlockFactory.copySparseBlock( |
| DEFAULT_INPLACE_SPARSEBLOCK, ret.sparseBlock, false); |
| } |
| |
| ret.quickSetValue(rl, cl, inVal); |
| return ret; |
| } |
| |
| |
| |
| public MatrixBlock slice(IndexRange ixrange, MatrixBlock ret) { |
| return slice( |
| (int)ixrange.rowStart, (int)ixrange.rowEnd, |
| (int)ixrange.colStart, (int)ixrange.colEnd, true, ret); |
| } |
| |
| public MatrixBlock slice(int rl, int ru) { |
| return slice(rl, ru, 0, clen-1, true, new MatrixBlock()); |
| } |
| |
| @Override |
| public MatrixBlock slice(int rl, int ru, int cl, int cu, CacheBlock ret) { |
| return slice(rl, ru, cl, cu, true, ret); |
| } |
| |
| /** |
| * Method to perform rightIndex operation for a given lower and upper bounds in row and column dimensions. |
| * Extracted submatrix is returned as "result". Note: This operation is now 0-based. |
| * |
| * @param rl row lower |
| * @param ru row upper |
| * @param cl column lower |
| * @param cu column upper |
| * @param deep should perform deep copy |
| * @param ret output matrix block |
| * @return matrix block output matrix block |
| */ |
| public MatrixBlock slice(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) { |
| // check the validity of bounds |
| if ( rl < 0 || rl >= getNumRows() || ru < rl || ru >= getNumRows() |
| || cl < 0 || cl >= getNumColumns() || cu < cl || cu >= getNumColumns() ) { |
| throw new DMLRuntimeException("Invalid values for matrix indexing: ["+(rl+1)+":"+(ru+1)+"," + (cl+1)+":"+(cu+1)+"] " |
| + "must be within matrix dimensions ["+getNumRows()+","+getNumColumns()+"]"); |
| } |
| |
| // Output matrix will have the same sparsity as that of the input matrix. |
| // (assuming a uniform distribution of non-zeros in the input) |
| MatrixBlock result=checkType((MatrixBlock)ret); |
| long estnnz= (long) ((double)this.nonZeros/rlen/clen*(ru-rl+1)*(cu-cl+1)); |
| boolean result_sparsity = this.sparse && MatrixBlock.evalSparseFormatInMemory(ru-rl+1, cu-cl+1, estnnz); |
| if(result==null) |
| result=new MatrixBlock(ru-rl+1, cu-cl+1, result_sparsity, estnnz); |
| else |
| result.reset(ru-rl+1, cu-cl+1, result_sparsity, estnnz); |
| |
| // actual slice operation |
| if( rl==0 && ru==rlen-1 && cl==0 && cu==clen-1 ) { |
| // copy if entire matrix required |
| if( deep ) |
| result.copy( this ); |
| else |
| result = this; |
| } |
| else //general case |
| { |
| //core slicing operation (nnz maintained internally) |
| if (sparse) |
| sliceSparse(rl, ru, cl, cu, deep, result); |
| else |
| sliceDense(rl, ru, cl, cu, result); |
| } |
| |
| return result; |
| } |
| |
	/**
	 * Core sparse slicing of the index range [rl:ru, cl:cu] (0-based, inclusive)
	 * into dest. The nnz of dest is maintained via the append/set calls below.
	 *
	 * @param rl row lower, inclusive
	 * @param ru row upper, inclusive
	 * @param cl column lower, inclusive
	 * @param cu column upper, inclusive
	 * @param deep indicates if shared row arrays must be copied (MCSR only)
	 * @param dest output block, pre-allocated with the target dimensions
	 */
	private void sliceSparse(int rl, int ru, int cl, int cu, boolean deep, MatrixBlock dest) {
		//check for early abort
		if( isEmptyBlock(false) )
			return;
		
		if( cl==cu ) //COLUMN VECTOR
		{
			//note: always dense single-block dest
			dest.allocateDenseBlock();
			double[] c = dest.getDenseBlockValues();
			for( int i=rl; i<=ru; i++ ) {
				if( !sparseBlock.isEmpty(i) ) {
					double val = sparseBlock.get(i, cl);
					if( val != 0 ) {
						c[i-rl] = val;
						dest.nonZeros++;
					}
				}
			}
		}
		else if( cl==0 && cu==clen-1 ) //ROW batch
		{
			//note: always sparse dest, but also works for dense
			//deep copy only needed for MCSR, where source rows are arrays that
			//could otherwise be shared between source and destination
			boolean ldeep = (deep && sparseBlock instanceof SparseBlockMCSR);
			for(int i = rl; i <= ru; i++)
				dest.appendRow(i-rl, sparseBlock.get(i), ldeep);
		}
		else //general case (sparse/dense dest)
		{
			SparseBlock sblock = sparseBlock;
			for(int i=rl; i <= ru; i++) {
				if( sblock.isEmpty(i) ) continue;
				int apos = sblock.pos(i);
				int alen = sblock.size(i);
				int[] aix = sblock.indexes(i);
				double[] avals = sblock.values(i);
				//find first entry with column index >= cl (relative position)
				int astart = (cl>0)?sblock.posFIndexGTE(i, cl) : 0;
				if( astart != -1 )
					for( int j=apos+astart; j<apos+alen && aix[j] <= cu; j++ )
						dest.appendValue(i-rl, aix[j]-cl, avals[j]);
			}
		}
	}
| |
	/**
	 * Core dense slicing of the index range [rl:ru, cl:cu] (0-based, inclusive)
	 * into dest, including the final nnz maintenance.
	 *
	 * @param rl row lower, inclusive
	 * @param ru row upper, inclusive
	 * @param cl column lower, inclusive
	 * @param cu column upper, inclusive
	 * @param dest output block, pre-allocated with the target dimensions
	 */
	private void sliceDense(int rl, int ru, int cl, int cu, MatrixBlock dest) {
		//ensure allocated input/output blocks
		if( denseBlock == null )
			return;
		dest.allocateDenseBlock();
		
		//indexing operation
		if( cl==cu ) { //COLUMN INDEXING
			//note: output always single block
			if( clen==1 ) { //vector -> vector
				System.arraycopy(getDenseBlockValues(), rl,
					dest.getDenseBlockValues(), 0, ru-rl+1);
			}
			else { //matrix -> vector
				DenseBlock a = getDenseBlock();
				double[] c = dest.getDenseBlockValues();
				for( int i=rl; i<=ru; i++ )
					c[i-rl] = a.get(i, cl);
			}
		}
		else { // GENERAL RANGE INDEXING
			DenseBlock a = getDenseBlock();
			DenseBlock c = dest.getDenseBlock();
			int len = dest.clen;
			//row-wise array copies of the selected column range
			for(int i = rl; i <= ru; i++)
				System.arraycopy(a.values(i), a.pos(i)+cl, c.values(i-rl), c.pos(i-rl), len);
		}
		
		//compute nnz of output (not maintained due to native calls)
		dest.setNonZeros((getNonZeros() == getLength()) ?
			(ru-rl+1) * (cu-cl+1) : dest.recomputeNonZeros());
	}
| |
	/**
	 * Distributes the cells in the given index range over up to four output
	 * blocks (top-left, top-right, bottom-left, bottom-right), determined by
	 * the given row/column cut points; only blocks actually intersecting the
	 * range are consumed from outlist (in TL,TR,BL,BR order).
	 *
	 * @param outlist pre-allocated output blocks, consumed in TL,TR,BL,BR order
	 * @param range global index range to slice (0-based)
	 * @param rowCut global row cut point between top and bottom output blocks
	 * @param colCut global column cut point between left and right output blocks
	 * @param blen block size of normal (non-boundary) blocks
	 * @param boundaryRlen number of rows of boundary (last) blocks
	 * @param boundaryClen number of columns of boundary (last) blocks
	 */
	@Override
	public void slice(ArrayList<IndexedMatrixValue> outlist, IndexRange range,
		int rowCut, int colCut, int blen, int boundaryRlen, int boundaryClen)
	{
		MatrixBlock topleft=null, topright=null, bottomleft=null, bottomright=null;
		Iterator<IndexedMatrixValue> p=outlist.iterator();
		//boundary blocks may be smaller than normal blocks
		int blockRowFactor=blen, blockColFactor=blen;
		if(rowCut>range.rowEnd)
			blockRowFactor=boundaryRlen;
		if(colCut>range.colEnd)
			blockColFactor=boundaryClen;
		
		int minrowcut=(int)Math.min(rowCut,range.rowEnd);
		int mincolcut=(int)Math.min(colCut, range.colEnd);
		int maxrowcut=(int)Math.max(rowCut, range.rowStart);
		int maxcolcut=(int)Math.max(colCut, range.colStart);
		
		//reset the intersecting output blocks with estimated sparsity
		if(range.rowStart<rowCut && range.colStart<colCut)
		{
			topleft=(MatrixBlock) p.next().getValue();
			topleft.reset(blockRowFactor, blockColFactor,
				estimateSparsityOnSlice(minrowcut-(int)range.rowStart, mincolcut-(int)range.colStart, blockRowFactor, blockColFactor));
		}
		if(range.rowStart<rowCut && range.colEnd>=colCut)
		{
			topright=(MatrixBlock) p.next().getValue();
			topright.reset(blockRowFactor, boundaryClen,
				estimateSparsityOnSlice(minrowcut-(int)range.rowStart, (int)range.colEnd-maxcolcut+1, blockRowFactor, boundaryClen));
		}
		if(range.rowEnd>=rowCut && range.colStart<colCut)
		{
			bottomleft=(MatrixBlock) p.next().getValue();
			bottomleft.reset(boundaryRlen, blockColFactor,
				estimateSparsityOnSlice((int)range.rowEnd-maxrowcut+1, mincolcut-(int)range.colStart, boundaryRlen, blockColFactor));
		}
		if(range.rowEnd>=rowCut && range.colEnd>=colCut)
		{
			bottomright=(MatrixBlock) p.next().getValue();
			bottomright.reset(boundaryRlen, boundaryClen,
				estimateSparsityOnSlice((int)range.rowEnd-maxrowcut+1, (int)range.colEnd-maxcolcut+1, boundaryRlen, boundaryClen));
		}
		
		if(sparse)
		{
			if(sparseBlock!=null)
			{
				//top rows go to TL/TR with row offset blen-rowCut, bottom rows
				//to BL/BR with row offset -rowCut (see sliceHelp)
				int r=(int)range.rowStart;
				for(; r<Math.min(Math.min(rowCut, sparseBlock.numRows()), range.rowEnd+1); r++)
					sliceHelp(r, range, colCut, topleft, topright, blen-rowCut, blen, blen);
				
				for(; r<=Math.min(range.rowEnd, sparseBlock.numRows()-1); r++)
					sliceHelp(r, range, colCut, bottomleft, bottomright, -rowCut, blen, blen);
			}
		}
		else {
			if(denseBlock!=null)
			{
				//linearized dense access; i tracks the row offset in a
				double[] a = getDenseBlockValues();
				int i=((int)range.rowStart)*clen;
				int r=(int) range.rowStart;
				for(; r<Math.min(rowCut, range.rowEnd+1); r++)
				{
					int c=(int) range.colStart;
					for(; c<Math.min(colCut, range.colEnd+1); c++)
						topleft.appendValue(r+blen-rowCut, c+blen-colCut, a[i+c]);
					for(; c<=range.colEnd; c++)
						topright.appendValue(r+blen-rowCut, c-colCut, a[i+c]);
					i+=clen;
				}
				
				for(; r<=range.rowEnd; r++)
				{
					int c=(int) range.colStart;
					for(; c<Math.min(colCut, range.colEnd+1); c++)
						bottomleft.appendValue(r-rowCut, c+blen-colCut, a[i+c]);
					for(; c<=range.colEnd; c++)
						bottomright.appendValue(r-rowCut, c-colCut, a[i+c]);
					i+=clen;
				}
			}
		}
	}
| |
	/**
	 * Appends all non-zeros of sparse row r within the given column range
	 * either to the left or right output block, depending on the column cut.
	 *
	 * @param r source row index
	 * @param range global index range restricting the columns
	 * @param colCut global column cut between left and right blocks
	 * @param left output block for columns &lt; colCut
	 * @param right output block for columns &gt;= colCut
	 * @param rowOffset offset added to r to obtain the output row index
	 * @param normalBlockRowFactor block size in rows (unused here)
	 * @param normalBlockColFactor block size in columns (column offset for left block)
	 */
	private void sliceHelp(int r, IndexRange range, int colCut, MatrixBlock left, MatrixBlock right, int rowOffset, int normalBlockRowFactor, int normalBlockColFactor)
	{
		//skip empty source rows
		if(sparseBlock.isEmpty(r))
			return;
		
		int[] cols=sparseBlock.indexes(r);
		double[] values=sparseBlock.values(r);
		//start/end are positions relative to the row start pos(r), see below
		int start=sparseBlock.posFIndexGTE(r, (int)range.colStart);
		if(start<0)
			return;
		int end=sparseBlock.posFIndexLTE(r, (int)range.colEnd);
		if(end<0 || start>end)
			return;
		
		//actual slice operation
		int pos = sparseBlock.pos(r);
		for(int i=start; i<=end; i++) {
			if(cols[pos+i]<colCut)
				left.appendValue(r+rowOffset, cols[pos+i]+normalBlockColFactor-colCut, values[pos+i]);
			else
				right.appendValue(r+rowOffset, cols[pos+i]-colCut, values[pos+i]);
		}
	}
| |
	/**
	 * Append operation for the distributed (MR/Spark) side: appends m2 (v2)
	 * to this block, distributing the result over one or two output blocks
	 * according to the block size blen.
	 *
	 * @param v2 matrix block to append (right of / below this block)
	 * @param outlist output blocks; 1st always used, 2nd only on overflow
	 * @param blen block size in rows/columns
	 * @param cbind true for column-wise append, false for row-wise append
	 * @param m2IsLast indicates if m2 is the last block (unused here)
	 * @param nextNCol the number of columns for the block right of block v2 (unused here)
	 */
	@Override
	//This the append operations for MR side
	//nextNCol is the number columns for the block right of block v2
	public void append(MatrixValue v2, ArrayList<IndexedMatrixValue> outlist, int blen, boolean cbind, boolean m2IsLast, int nextNCol)
	{
		MatrixBlock m2 = (MatrixBlock)v2;
		
		//case 1: copy lhs and rhs to output (this block is already full)
		if( cbind && clen==blen || !cbind && rlen==blen ) {
			((MatrixBlock) outlist.get(0).getValue()).copy(this);
			((MatrixBlock) outlist.get(1).getValue()).copy(m2);
		}
		//case 2: append part of rhs to lhs, append to 2nd output if necessary
		else {
			//single output block (via plain append operation)
			if( cbind && clen + m2.clen < blen
				|| !cbind && rlen + m2.rlen < blen )
			{
				append(m2, (MatrixBlock) outlist.get(0).getValue(), cbind);
			}
			//two output blocks (via slice and append)
			else
			{
				//prepare output block 1
				MatrixBlock ret1 = (MatrixBlock) outlist.get(0).getValue();
				//last 0-based index of m2 that still fits into the first output block
				int lrlen1 = cbind ? rlen-1 : blen-rlen-1;
				int lclen1 = cbind ? blen-clen-1 : clen-1;
				MatrixBlock tmp1 = m2.slice(0, lrlen1, 0, lclen1, new MatrixBlock());
				append(tmp1, ret1, cbind);
				
				//prepare output block 2 (remainder of m2)
				MatrixBlock ret2 = (MatrixBlock) outlist.get(1).getValue();
				if( cbind )
					m2.slice(0, rlen-1, lclen1+1, m2.clen-1, ret2);
				else
					m2.slice(lrlen1+1, m2.rlen-1, 0, clen-1, ret2);
			}
		}
	}
| |
	/**
	 * Zeros out the given index range of this matrix (complementary=false),
	 * or selects exactly that range while zeroing everything else
	 * (complementary=true), writing into the given output block.
	 *
	 * @param result output matrix block (allocated if null, otherwise reset)
	 * @param range index range to zero out or to select
	 * @param complementary selection instead of zero-out
	 * @return output matrix block
	 */
	@Override
	public MatrixBlock zeroOutOperations(MatrixValue result, IndexRange range, boolean complementary) {
		checkType(result);
		//estimate output sparsity from the fraction of cells inside the range
		double currentSparsity=(double)nonZeros/rlen/clen;
		double estimatedSps=currentSparsity*(range.rowEnd-range.rowStart+1)
			*(range.colEnd-range.colStart+1)/rlen/clen;
		if(!complementary)
			estimatedSps=currentSparsity-estimatedSps;
		
		boolean lsparse = evalSparseFormatInMemory(rlen, clen, (long)(estimatedSps*rlen*clen));
		
		if(result==null)
			result=new MatrixBlock(rlen, clen, lsparse, (int)(estimatedSps*rlen*clen));
		else
			result.reset(rlen, clen, lsparse, (int)(estimatedSps*rlen*clen));
		
		
		if(sparse)
		{
			if(sparseBlock!=null)
			{
				if(!complementary)//if zero out
				{
					//copy rows above and below the zeroed row range as-is
					for(int r=0; r<Math.min((int)range.rowStart, sparseBlock.numRows()); r++)
						((MatrixBlock) result).appendRow(r, sparseBlock.get(r));
					for(int r=Math.min((int)range.rowEnd+1, sparseBlock.numRows()); r<Math.min(rlen, sparseBlock.numRows()); r++)
						((MatrixBlock) result).appendRow(r, sparseBlock.get(r));
				}
				for(int r=(int)range.rowStart; r<=Math.min(range.rowEnd, sparseBlock.numRows()-1); r++)
				{
					if(sparseBlock.isEmpty(r))
						continue;
					int[] cols=sparseBlock.indexes(r);
					double[] values=sparseBlock.values(r);
					
					if(complementary)//if selection
					{
						//NOTE(review): posFIndexGTE/GT return positions relative
						//to pos(r), but cols/values are indexed absolutely here;
						//this appears to assume pos(r)==0 (e.g., MCSR blocks) -
						//confirm behavior for CSR/COO inputs
						int start=sparseBlock.posFIndexGTE(r,(int)range.colStart);
						if(start<0) continue;
						int end=sparseBlock.posFIndexGT(r,(int)range.colEnd);
						if(end<0 || start>end)
							continue;
						
						for(int i=start; i<end; i++)
						{
							((MatrixBlock) result).appendValue(r, cols[i], values[i]);
						}
					}else
					{
						int pos = sparseBlock.pos(r);
						int len = sparseBlock.size(r);
						int start=sparseBlock.posFIndexGTE(r,(int)range.colStart);
						if(start<0) start=pos+len;
						int end=sparseBlock.posFIndexGT(r,(int)range.colEnd);
						if(end<0) end=pos+len;
						
						//copy entries before and after the zeroed column range
						for(int i=pos; i<start; i++)
						{
							((MatrixBlock) result).appendValue(r, cols[i], values[i]);
						}
						for(int i=end; i<pos+len; i++)
						{
							((MatrixBlock) result).appendValue(r, cols[i], values[i]);
						}
					}
				}
			}
		}else
		{
			if(denseBlock!=null)
			{
				double[] a = getDenseBlockValues();
				if(complementary)//if selection
				{
					//copy only the selected range
					int offset=((int)range.rowStart)*clen;
					for(int r=(int) range.rowStart; r<=range.rowEnd; r++)
					{
						for(int c=(int) range.colStart; c<=range.colEnd; c++)
							((MatrixBlock) result).appendValue(r, c, a[offset+c]);
						offset+=clen;
					}
				}else
				{
					//copy everything except the selected range
					int offset=0;
					int r=0;
					for(; r<(int)range.rowStart; r++)
						for(int c=0; c<clen; c++, offset++)
							((MatrixBlock) result).appendValue(r, c, a[offset]);
					
					for(; r<=(int)range.rowEnd; r++)
					{
						for(int c=0; c<(int)range.colStart; c++)
							((MatrixBlock) result).appendValue(r, c, a[offset+c]);
						for(int c=(int)range.colEnd+1; c<clen; c++)
							((MatrixBlock) result).appendValue(r, c, a[offset+c]);
						offset+=clen;
					}
					
					for(; r<rlen; r++)
						for(int c=0; c<clen; c++, offset++)
							((MatrixBlock) result).appendValue(r, c, a[offset]);
				}
				
			}
		}
		return (MatrixBlock)result;
	}
| |
| @Override |
| public MatrixBlock aggregateUnaryOperations(AggregateUnaryOperator op, |
| MatrixValue result, int blen, MatrixIndexes indexesIn) { |
| return aggregateUnaryOperations(op, result, blen, indexesIn, false); |
| } |
| |
	/**
	 * Computes a unary aggregate (e.g., sum, min, max) over this matrix.
	 * The output dimensions are derived from the index function; correction
	 * rows/columns are appended per the aggregate operator and dropped again
	 * if executed in the control program (inCP).
	 *
	 * @param op unary aggregate operator
	 * @param result output matrix value (allocated if null, otherwise reset)
	 * @param blen block size (used to recompute global indexes)
	 * @param indexesIn block indexes of this block within the global matrix
	 * @param inCP indicates execution in CP (drop correction rows/columns)
	 * @return output matrix block
	 */
	@Override
	public MatrixBlock aggregateUnaryOperations(AggregateUnaryOperator op, MatrixValue result,
		int blen, MatrixIndexes indexesIn, boolean inCP) {
		//determine output dimensions via the index function
		CellIndex tempCellIndex = new CellIndex(-1,-1);
		op.indexFn.computeDimension(rlen, clen, tempCellIndex);
		if(op.aggOp.existsCorrection())
		{
			//enlarge the output for correction rows/columns
			switch(op.aggOp.correction)
			{
				case LASTROW:
					tempCellIndex.row++;
					break;
				case LASTCOLUMN:
					tempCellIndex.column++;
					break;
				case LASTTWOROWS:
					tempCellIndex.row+=2;
					break;
				case LASTTWOCOLUMNS:
					tempCellIndex.column+=2;
					break;
				case LASTFOURROWS:
					tempCellIndex.row+=4;
					break;
				case LASTFOURCOLUMNS:
					tempCellIndex.column+=4;
					break;
				default:
					throw new DMLRuntimeException("unrecognized correctionLocation: "+op.aggOp.correction);
			}
		}
		
		//prepare result matrix block
		if(result==null)
			result=new MatrixBlock(tempCellIndex.row, tempCellIndex.column, false);
		else
			result.reset(tempCellIndex.row, tempCellIndex.column, false);
		MatrixBlock ret = (MatrixBlock) result;
		
		//fast path via optimized library kernels (single-/multi-threaded)
		if( LibMatrixAgg.isSupportedUnaryAggregateOperator(op) ) {
			if( op.getNumThreads() > 1 )
				LibMatrixAgg.aggregateUnaryMatrix(this, ret, op, op.getNumThreads());
			else
				LibMatrixAgg.aggregateUnaryMatrix(this, ret, op);
			LibMatrixAgg.recomputeIndexes(ret, op, blen, indexesIn);
		}
		//generic fallbacks via incremental per-cell aggregation
		else if(op.sparseSafe)
			sparseAggregateUnaryHelp(op, ret, blen, indexesIn);
		else
			denseAggregateUnaryHelp(op, ret, blen, indexesIn);
		
		//drop correction rows/columns if executed in the control program
		if(op.aggOp.existsCorrection() && inCP)
			((MatrixBlock)result).dropLastRowsOrColumns(op.aggOp.correction);
		
		return ret;
	}
| |
	/**
	 * Generic sparse-safe unary aggregation: for sparse inputs, iterates over
	 * non-zeros only (zeros do not change a sparse-safe aggregate); each cell
	 * is incrementally aggregated at its index-function-mapped position.
	 *
	 * @param op unary aggregate operator
	 * @param result pre-allocated output (incl. correction rows/columns)
	 * @param blen block size (unused in this generic path)
	 * @param indexesIn block indexes (unused in this generic path)
	 */
	private void sparseAggregateUnaryHelp(AggregateUnaryOperator op, MatrixBlock result,
		int blen, MatrixIndexes indexesIn)
	{
		//initialize result
		if(op.aggOp.initialValue!=0)
			result.reset(result.rlen, result.clen, op.aggOp.initialValue);
		CellIndex tempCellIndex = new CellIndex(-1,-1);
		KahanObject buffer = new KahanObject(0,0);
		
		if( sparse && sparseBlock!=null ) {
			//sparse input: aggregate non-zeros row by row
			SparseBlock a = sparseBlock;
			for(int r=0; r<Math.min(rlen, a.numRows()); r++) {
				if(a.isEmpty(r)) continue;
				int apos = a.pos(r);
				int alen = a.size(r);
				int[] aix = a.indexes(r);
				double[] aval = a.values(r);
				for(int i=apos; i<apos+alen; i++) {
					tempCellIndex.set(r, aix[i]);
					op.indexFn.execute(tempCellIndex, tempCellIndex);
					incrementalAggregateUnaryHelp(op.aggOp, result,
						tempCellIndex.row, tempCellIndex.column, aval[i], buffer);
				}
			}
		}
		else if( !sparse && denseBlock!=null ) {
			//dense input: aggregate all cells
			DenseBlock a = getDenseBlock();
			for(int i=0; i<rlen; i++)
				for(int j=0; j<clen; j++) {
					tempCellIndex.set(i, j);
					op.indexFn.execute(tempCellIndex, tempCellIndex);
					incrementalAggregateUnaryHelp(op.aggOp, result,
						tempCellIndex.row, tempCellIndex.column, a.get(i, j), buffer);
				}
		}
	}
| |
| private void denseAggregateUnaryHelp(AggregateUnaryOperator op, MatrixBlock result, |
| int blen, MatrixIndexes indexesIn) |
| { |
| if(op.aggOp.initialValue!=0) |
| result.reset(result.rlen, result.clen, op.aggOp.initialValue); |
| CellIndex tempCellIndex = new CellIndex(-1,-1); |
| KahanObject buffer=new KahanObject(0,0); |
| for(int i=0; i<rlen; i++) |
| for(int j=0; j<clen; j++) { |
| tempCellIndex.set(i, j); |
| op.indexFn.execute(tempCellIndex, tempCellIndex); |
| incrementalAggregateUnaryHelp(op.aggOp, result, tempCellIndex.row, tempCellIndex.column, quickGetValue(i,j), buffer); |
| } |
| } |
| |
	/**
	 * Incrementally aggregates a single value into result cell (row,column),
	 * maintaining correction values (Kahan compensation, and for mean also a
	 * running count) in extra rows/columns per the correction location.
	 *
	 * @param aggOp aggregate operator (function and correction location)
	 * @param result output block holding aggregates and corrections
	 * @param row target row of the aggregate cell
	 * @param column target column of the aggregate cell
	 * @param newvalue value to aggregate
	 * @param buffer reusable Kahan buffer (sum/correction scratch space)
	 */
	private static void incrementalAggregateUnaryHelp(AggregateOperator aggOp, MatrixBlock result, int row, int column,
		double newvalue, KahanObject buffer)
	{
		if(aggOp.existsCorrection())
		{
			if(aggOp.correction==CorrectionLocationType.LASTROW || aggOp.correction==CorrectionLocationType.LASTCOLUMN)
			{
				//Kahan-style aggregation: correction held in the next row/column
				int corRow=row, corCol=column;
				if(aggOp.correction==CorrectionLocationType.LASTROW)//extra row
					corRow++;
				else if(aggOp.correction==CorrectionLocationType.LASTCOLUMN)
					corCol++;
				else
					throw new DMLRuntimeException("unrecognized correctionLocation: "+aggOp.correction);
				
				buffer._sum=result.quickGetValue(row, column);
				buffer._correction=result.quickGetValue(corRow, corCol);
				buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newvalue);
				result.quickSetValue(row, column, buffer._sum);
				result.quickSetValue(corRow, corCol, buffer._correction);
			}else if(aggOp.correction==CorrectionLocationType.NONE)
			{
				throw new DMLRuntimeException("unrecognized correctionLocation: "+aggOp.correction);
			}else// for mean
			{
				//mean: count kept in the next row/column, correction two ahead
				int corRow=row, corCol=column;
				int countRow=row, countCol=column;
				if(aggOp.correction==CorrectionLocationType.LASTTWOROWS)
				{
					countRow++;
					corRow+=2;
				}
				else if(aggOp.correction==CorrectionLocationType.LASTTWOCOLUMNS)
				{
					countCol++;
					corCol+=2;
				}
				else
					throw new DMLRuntimeException("unrecognized correctionLocation: "+aggOp.correction);
				buffer._sum=result.quickGetValue(row, column);
				buffer._correction=result.quickGetValue(corRow, corCol);
				double count=result.quickGetValue(countRow, countCol)+1.0;
				buffer=(KahanObject) aggOp.increOp.fn.execute(buffer, newvalue, count);
				result.quickSetValue(row, column, buffer._sum);
				result.quickSetValue(corRow, corCol, buffer._correction);
				result.quickSetValue(countRow, countCol, count);
			}
			
		}else
		{
			//simple incremental update without correction
			newvalue=aggOp.increOp.fn.execute(result.quickGetValue(row, column), newvalue);
			result.quickSetValue(row, column, newvalue);
		}
	}
| |
	/**
	 * Physically removes the correction rows or columns (as indicated by the
	 * correction location) from this block, adjusting dimensions and nnz.
	 *
	 * @param correctionLocation location (and implied count) of correction
	 *        rows/columns to remove; no-op for locations that remove none
	 */
	public void dropLastRowsOrColumns(CorrectionLocationType correctionLocation)
	{
		//determine number of rows/cols to be removed
		int step = correctionLocation.getNumRemovedRowsColumns();
		if( step <= 0 )
			return;
		
		//e.g., colSums, colMeans, colMaxs, colMeans, colVars
		if( correctionLocation==CorrectionLocationType.LASTROW
			|| correctionLocation==CorrectionLocationType.LASTTWOROWS
			|| correctionLocation==CorrectionLocationType.LASTFOURROWS )
		{
			if( sparse && sparseBlock!=null ) { //SPARSE
				//keep only the first row (the aggregate), drop all other nnz
				nonZeros -= recomputeNonZeros(1, rlen-1, 0, clen-1);
				sparseBlock = SparseBlockFactory
					.createSparseBlock(DEFAULT_SPARSEBLOCK, sparseBlock.get(0));
			}
			else if( !sparse && denseBlock!=null ) { //DENSE
				nonZeros -= recomputeNonZeros(1, rlen-1, 0, clen-1);
				DenseBlock tmp = DenseBlockFactory.createDenseBlock(1, clen);
				tmp.set(0, getDenseBlockValues());
				denseBlock = tmp;
			}
			rlen -= step;
		}
		
		//e.g., rowSums, rowsMeans, rowsMaxs, rowsMeans, rowVars
		else if( correctionLocation==CorrectionLocationType.LASTCOLUMN
			|| correctionLocation==CorrectionLocationType.LASTTWOCOLUMNS
			|| correctionLocation==CorrectionLocationType.LASTFOURCOLUMNS )
		{
			if( sparse && sparseBlock!=null ) { //SPARSE
				//sparse blocks are converted to a dense representation
				//because column vectors are always smaller in dense
				double[] tmp = new double[rlen];
				int lnnz = 0;
				for( int i=0; i<rlen; i++ )
					lnnz += ((tmp[i] = sparseBlock.get(i, 0))!=0)? 1 : 0;
				cleanupBlock(true, true);
				sparse = false;
				denseBlock = DenseBlockFactory.createDenseBlock(tmp, rlen, 1);
				nonZeros = lnnz;
			}
			else if( !sparse && denseBlock!=null ) { //DENSE
				//keep only the first cell of every row
				double[] tmp = new double[rlen];
				double[] a = getDenseBlockValues();
				int lnnz = 0;
				for( int i=0, aix=0; i<rlen; i++, aix+=clen )
					lnnz += ((tmp[i] = a[aix])!=0)? 1 : 0;
				denseBlock = DenseBlockFactory.createDenseBlock(tmp, rlen, 1);
				nonZeros = lnnz;
			}
			clen -= step;
		}
	}
| |
| public CM_COV_Object cmOperations(CMOperator op) { |
| // dimension check for input column vectors |
| if ( this.getNumColumns() != 1) { |
| throw new DMLRuntimeException("Central Moment can not be computed on [" |
| + this.getNumRows() + "," + this.getNumColumns() + "] matrix."); |
| } |
| |
| CM_COV_Object cmobj = new CM_COV_Object(); |
| |
| // empty block handling (important for result corretness, otherwise |
| // we get a NaN due to 0/0 on reading out the required result) |
| if( isEmptyBlock(false) ) { |
| op.fn.execute(cmobj, 0.0, getNumRows()); |
| return cmobj; |
| } |
| |
| int nzcount = 0; |
| if(sparse && sparseBlock!=null) //SPARSE |
| { |
| for(int r=0; r<Math.min(rlen, sparseBlock.numRows()); r++) |
| { |
| if(sparseBlock.isEmpty(r)) |
| continue; |
| int apos = sparseBlock.pos(r); |
| int alen = sparseBlock.size(r); |
| double[] avals = sparseBlock.values(r); |
| for(int i=apos; i<apos+alen; i++) { |
| op.fn.execute(cmobj, avals[i]); |
| nzcount++; |
| } |
| } |
| // account for zeros in the vector |
| op.fn.execute(cmobj, 0.0, this.getNumRows()-nzcount); |
| } |
| else if(denseBlock!=null) //DENSE |
| { |
| //always vector (see check above) |
| double[] a = getDenseBlockValues(); |
| for(int i=0; i<rlen; i++) |
| op.fn.execute(cmobj, a[i]); |
| } |
| |
| return cmobj; |
| } |
| |
| public CM_COV_Object cmOperations(CMOperator op, MatrixBlock weights) { |
| /* this._data must be a 1 dimensional vector */ |
| if ( this.getNumColumns() != 1 || weights.getNumColumns() != 1) { |
| throw new DMLRuntimeException("Central Moment can be computed only on 1-dimensional column matrices."); |
| } |
| if ( this.getNumRows() != weights.getNumRows() || this.getNumColumns() != weights.getNumColumns()) { |
| throw new DMLRuntimeException("Covariance: Mismatching dimensions between input and weight matrices - " + |
| "["+this.getNumRows()+","+this.getNumColumns() +"] != [" |
| + weights.getNumRows() + "," + weights.getNumColumns() +"]"); |
| } |
| |
| CM_COV_Object cmobj = new CM_COV_Object(); |
| if (sparse && sparseBlock!=null) //SPARSE |
| { |
| for(int i=0; i < rlen; i++) |
| op.fn.execute(cmobj, this.quickGetValue(i,0), weights.quickGetValue(i,0)); |
| } |
| else if(denseBlock!=null) //DENSE |
| { |
| //always vectors (see check above) |
| double[] a = getDenseBlockValues(); |
| if( !weights.sparse ) |
| { |
| double[] w = weights.getDenseBlockValues(); |
| if(weights.denseBlock!=null) |
| for( int i=0; i<rlen; i++ ) |
| op.fn.execute(cmobj, a[i], w[i]); |
| } |
| else |
| { |
| for(int i=0; i<rlen; i++) |
| op.fn.execute(cmobj, a[i], weights.quickGetValue(i,0) ); |
| } |
| } |
| |
| return cmobj; |
| } |
| |
| public CM_COV_Object covOperations(COVOperator op, MatrixBlock that) { |
| /* this._data must be a 1 dimensional vector */ |
| if ( this.getNumColumns() != 1 || that.getNumColumns() != 1 ) { |
| throw new DMLRuntimeException("Covariance can be computed only on 1-dimensional column matrices."); |
| } |
| if ( this.getNumRows() != that.getNumRows() || this.getNumColumns() != that.getNumColumns()) { |
| throw new DMLRuntimeException("Covariance: Mismatching input matrix dimensions - " + |
| "["+this.getNumRows()+","+this.getNumColumns() +"] != [" |
| + that.getNumRows() + "," + that.getNumColumns() +"]"); |
| } |
| |
| CM_COV_Object covobj = new CM_COV_Object(); |
| if(sparse && sparseBlock!=null) //SPARSE |
| { |
| for(int i=0; i < rlen; i++ ) |
| op.fn.execute(covobj, this.quickGetValue(i,0), that.quickGetValue(i,0)); |
| } |
| else if(denseBlock!=null) //DENSE |
| { |
| //always vectors (see check above) |
| double[] a = getDenseBlockValues(); |
| if( !that.sparse ) { |
| if(that.denseBlock!=null) { |
| double[] b = that.getDenseBlockValues(); |
| for( int i=0; i<rlen; i++ ) |
| op.fn.execute(covobj, a[i], b[i]); |
| } |
| } |
| else { |
| for(int i=0; i<rlen; i++) |
| op.fn.execute(covobj, a[i], that.quickGetValue(i,0)); |
| } |
| } |
| |
| return covobj; |
| } |
| |
| public CM_COV_Object covOperations(COVOperator op, MatrixBlock that, MatrixBlock weights) { |
| /* this._data must be a 1 dimensional vector */ |
| if ( this.getNumColumns() != 1 || that.getNumColumns() != 1 || weights.getNumColumns() != 1) { |
| throw new DMLRuntimeException("Covariance can be computed only on 1-dimensional column matrices."); |
| } |
| if ( this.getNumRows() != that.getNumRows() || this.getNumColumns() != that.getNumColumns()) { |
| throw new DMLRuntimeException("Covariance: Mismatching input matrix dimensions - " + |
| "["+this.getNumRows()+","+this.getNumColumns() +"] != [" |
| + that.getNumRows() + "," + that.getNumColumns() +"]"); |
| } |
| if ( this.getNumRows() != weights.getNumRows() || this.getNumColumns() != weights.getNumColumns()) { |
| throw new DMLRuntimeException("Covariance: Mismatching dimensions between input and weight matrices - " + |
| "["+this.getNumRows()+","+this.getNumColumns() +"] != [" |
| + weights.getNumRows() + "," + weights.getNumColumns() +"]"); |
| } |
| |
| CM_COV_Object covobj = new CM_COV_Object(); |
| if(sparse && sparseBlock!=null) //SPARSE |
| { |
| for(int i=0; i < rlen; i++ ) |
| op.fn.execute(covobj, this.quickGetValue(i,0), that.quickGetValue(i,0), weights.quickGetValue(i,0)); |
| } |
| else if(denseBlock!=null) //DENSE |
| { |
| //always vectors (see check above) |
| double[] a = getDenseBlockValues(); |
| |
| if( !that.sparse && !weights.sparse ) |
| { |
| double[] w = weights.getDenseBlockValues(); |
| if(that.denseBlock!=null) { |
| double[] b = that.getDenseBlockValues(); |
| for( int i=0; i<rlen; i++ ) |
| op.fn.execute(covobj, a[i], b[i], w[i]); |
| } |
| } |
| else |
| { |
| for(int i=0; i<rlen; i++) |
| op.fn.execute(covobj, a[i], that.quickGetValue(i,0), weights.quickGetValue(i,0)); |
| } |
| } |
| |
| return covobj; |
| } |
| |
	/**
	 * Sorts the (weighted) values of this column vector into a two-column
	 * matrix of values and weights, ascending by value. The first row
	 * aggregates all zeros (value 0 with accumulated weight).
	 *
	 * @param weights optional weight column vector (null implies weight 1 per row)
	 * @param result output matrix block (allocated if null, otherwise reset)
	 * @return sorted (value, weight) matrix of size (1+nnz) x 2
	 */
	public MatrixBlock sortOperations(MatrixValue weights, MatrixBlock result) {
		boolean wtflag = (weights!=null);
		
		MatrixBlock wts= (weights == null ? null : checkType(weights));
		checkType(result);
		
		if ( getNumColumns() != 1 ) {
			throw new DMLRuntimeException("Invalid input dimensions (" + getNumRows() + "x" + getNumColumns() + ") to sort operation.");
		}
		if ( wts != null && wts.getNumColumns() != 1 ) {
			throw new DMLRuntimeException("Invalid weight dimensions (" + wts.getNumRows() + "x" + wts.getNumColumns() + ") to sort operation.");
		}
		
		// prepare result, currently always dense
		// #rows in temp matrix = 1 + #nnz in the input ( 1 is for the "zero" value)
		int dim1 = (int) (1+this.getNonZeros());
		if(result==null)
			result=new MatrixBlock(dim1, 2, false);
		else
			result.reset(dim1, 2, false);
		
		// Copy the input elements into a temporary array for sorting
		// First column is data and second column is weights
		// (since the inputs are vectors, they are likely dense - hence quickget is sufficient)
		MatrixBlock tdw = new MatrixBlock(dim1, 2, false);
		double d, w, zero_wt=0;
		int ind = 1;
		if( wtflag ) // w/ weights
		{
			for ( int i=0; i<rlen; i++ ) {
				d = quickGetValue(i,0);
				w = wts.quickGetValue(i,0);
				if ( d != 0 ) {
					tdw.quickSetValue(ind, 0, d);
					tdw.quickSetValue(ind, 1, w);
					ind++;
				}
				else //accumulate the weight of all zero values
					zero_wt += w;
			}
		}
		else //w/o weights
		{
			//all zeros share the implicit weight 1 per row
			zero_wt = getNumRows() - getNonZeros();
			for( int i=0; i<rlen; i++ ) {
				d = quickGetValue(i,0);
				if( d != 0 ){
					tdw.quickSetValue(ind, 0, d);
					tdw.quickSetValue(ind, 1, 1);
					ind++;
				}
			}
		}
		tdw.quickSetValue(0, 0, 0.0);
		tdw.quickSetValue(0, 1, zero_wt); //num zeros in input
		
		// Sort td and tw based on values inside td (ascending sort), incl copy into result
		SortIndex sfn = new SortIndex(1, false, false);
		ReorgOperator rop = new ReorgOperator(sfn);
		LibMatrixReorg.reorg(tdw, result, rop);
		
		return result;
	}
| |
	/**
	 * Computes the inter-quartile mean (mean of the values between the 25%
	 * and 75% quantiles) over this sorted (value, weight) matrix, as
	 * produced by sortOperations.
	 *
	 * @return inter-quartile mean
	 */
	public double interQuartileMean() {
		//input state: rlen x 2, values and weights, sorted by weight
		//approach: determine q25 and q75 keys by cumsum of weights
		double sum_wt = sumWeightForQuantile();
		double q25d = 0.25*sum_wt;
		double q75d = 0.75*sum_wt;
		int q25i = (int) Math.ceil(q25d);
		int q75i = (int) Math.ceil(q75d);
		
		// find q25 as sum of weights (but excluding from mean)
		double psum = 0; int i = -1;
		while(psum < q25i && i < getNumRows())
			psum += quickGetValue(++i, 1);
		//fractional part of the boundary weight that belongs to the IQM
		double q25Part = psum-q25d;
		double q25Val = quickGetValue(i, 0);
		
		// compute mean and find q75 as sum of weights (including in mean)
		double sum = 0;
		while(psum < q75i && i < getNumRows()) {
			double v1 = quickGetValue(++i, 0);
			double v2 = quickGetValue(i, 1);
			psum += v2;
			sum += v1 * v2;
		}
		double q75Part = psum-q75d;
		double q75Val = quickGetValue(i, 0);
		
		//compute final IQM, incl. correction for q25 and q75 portions
		return computeIQMCorrection(sum, sum_wt, q25Part, q25Val, q75Part, q75Val);
	}
| |
| public static double computeIQMCorrection(double sum, double sum_wt, |
| double q25Part, double q25Val, double q75Part, double q75Val) { |
| return (sum + q25Part*q25Val - q75Part*q75Val) / (sum_wt*0.5); |
| } |
| |
| public MatrixBlock pickValues(MatrixValue quantiles, MatrixValue ret) { |
| |
| MatrixBlock qs=checkType(quantiles); |
| |
| if ( qs.clen != 1 ) { |
| throw new DMLRuntimeException("Multiple quantiles can only be computed on a 1D matrix"); |
| } |
| |
| MatrixBlock output = checkType(ret); |
| |
| if(output==null) |
| output=new MatrixBlock(qs.rlen, qs.clen, false); // resulting matrix is mostly likely be dense |
| else |
| output.reset(qs.rlen, qs.clen, false); |
| |
| for ( int i=0; i < qs.rlen; i++ ) { |
| output.quickSetValue(i, 0, this.pickValue(qs.quickGetValue(i,0)) ); |
| } |
| |
| return output; |
| } |
| |
| public double median() { |
| double sum_wt = sumWeightForQuantile(); |
| return pickValue(0.5, sum_wt%2==0); |
| } |
| |
| public double pickValue(double quantile){ |
| return pickValue(quantile, false); |
| } |
| |
	/**
	 * Picks the value at the given quantile from this rlen x 2 matrix of
	 * values (column 0) and integer weights (column 1), optionally averaging
	 * the two adjacent values when the total weight is even (e.g., median).
	 *
	 * NOTE(review): the i+1/i+2 lookups in the averaging path assume enough
	 * trailing rows exist - TODO confirm bounds for very small inputs.
	 *
	 * @param quantile quantile position, target row found at cumulative
	 *        weight position ceil(quantile*sum_wt)
	 * @param average if true (and sum of weights is even), average the two
	 *        values adjacent to the quantile position
	 * @return picked (or averaged) value
	 */
	public double pickValue(double quantile, boolean average) {
		double sum_wt = sumWeightForQuantile();
		
		// do averaging only if it is asked for; and sum_wt is even
		average = average && (sum_wt%2 == 0);
		
		//1-based target position in the cumulative weight distribution
		int pos = (int) Math.ceil(quantile*sum_wt);
		
		//scan rows until the cumulative weight t reaches pos
		int t = 0, i=-1;
		do {
			i++;
			t += quickGetValue(i,1);
		} while(t<pos && i < getNumRows());
		
		if ( quickGetValue(i,1) != 0 ) {
			// i^th value is present in the data set, simply return it
			if ( average ) {
				if(pos < t) {
					//quantile position falls strictly inside row i's weight span
					return quickGetValue(i,0);
				}
				if(quickGetValue(i+1,1) != 0)
					return (quickGetValue(i,0)+quickGetValue(i+1,0))/2;
				else
					// (i+1)^th value is 0. So, fetch (i+2)^th value
					return (quickGetValue(i,0)+quickGetValue(i+2,0))/2;
			}
			else
				return quickGetValue(i, 0);
		}
		else {
			// i^th value is not present in the data set.
			// It can only happen in the case where i^th value is 0.0; and 0.0 is not present in the data set (but introduced by sort).
			if ( i+1 < getNumRows() )
				// when 0.0 is not the last element in the sorted order
				return quickGetValue(i+1,0);
			else
				// when 0.0 is the last element in the sorted order (input data is all negative)
				return quickGetValue(i-1,0);
		}
	}
| |
| /** |
| * In a given two column matrix, the second column denotes weights. |
| * This function computes the total weight |
| * |
| * @return sum weight for quantile |
| */ |
| public double sumWeightForQuantile() { |
| double sum_wt = 0; |
| for (int i=0; i < getNumRows(); i++ ) { |
| double tmp = quickGetValue(i, 1); |
| sum_wt += tmp; |
| |
| // test all values not just final sum_wt to ensure that non-integer weights |
| // don't cancel each other out; integer weights are required by all quantiles, etc |
| if( Math.floor(tmp) < tmp ) { |
| throw new DMLRuntimeException("Wrong input data, quantile weights " |
| + "are expected to be integers but found '"+tmp+"'."); |
| } |
| } |
| return sum_wt; |
| } |
| |
| public MatrixBlock aggregateBinaryOperations(MatrixIndexes m1Index, MatrixBlock m1, MatrixIndexes m2Index, MatrixBlock m2, |
| MatrixBlock ret, AggregateBinaryOperator op ) { |
| return aggregateBinaryOperations(m1, m2, ret, op); |
| } |
| |
| public MatrixBlock aggregateBinaryOperations(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, AggregateBinaryOperator op) { |
| //check input types, dimensions, configuration |
| if( m1.clen != m2.rlen ) { |
| throw new RuntimeException("Dimensions do not match for matrix multiplication ("+m1.clen+"!="+m2.rlen+")."); |
| } |
| if( !(op.binaryFn instanceof Multiply && op.aggOp.increOp.fn instanceof Plus) ) { |
| throw new DMLRuntimeException("Unsupported binary aggregate operation: ("+op.binaryFn+", "+op.aggOp+")."); |
| } |
| |
| //setup meta data (dimensions, sparsity) |
| int rl = m1.rlen; |
| int cl = m2.clen; |
| SparsityEstimate sp = estimateSparsityOnAggBinary(m1, m2, op); |
| |
| //create output matrix block |
| if( ret==null ) |
| ret = new MatrixBlock(rl, cl, sp.sparse, sp.estimatedNonZeros); |
| else |
| ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros); |
| |
| //compute matrix multiplication (only supported binary aggregate operation) |
| if( NativeHelper.isNativeLibraryLoaded() ) |
| LibMatrixNative.matrixMult(m1, m2, ret, op.getNumThreads()); |
| else if( op.getNumThreads() > 1 ) |
| LibMatrixMult.matrixMult(m1, m2, ret, op.getNumThreads()); |
| else |
| LibMatrixMult.matrixMult(m1, m2, ret); |
| |
| return ret; |
| } |
| |
| public MatrixBlock aggregateTernaryOperations(MatrixBlock m1, MatrixBlock m2, MatrixBlock m3, MatrixBlock ret, AggregateTernaryOperator op, boolean inCP) { |
| //check input dimensions and operators |
| if( m1.rlen!=m2.rlen || m1.clen!=m2.clen || (m3!=null && (m2.rlen!=m3.rlen || m2.clen!=m3.clen)) ) |
| throw new DMLRuntimeException("Invalid dimensions for aggregate ternary ("+m1.rlen+"x"+m1.clen+", "+m2.rlen+"x"+m2.clen+", "+m3.rlen+"x"+m3.clen+")."); |
| if( !( op.aggOp.increOp.fn instanceof KahanPlus && op.binaryFn instanceof Multiply) ) |
| throw new DMLRuntimeException("Unsupported operator for aggregate ternary operations."); |
| |
| //create output matrix block w/ corrections |
| int rl = (op.indexFn instanceof ReduceRow) ? 2 : 1; |
| int cl = (op.indexFn instanceof ReduceRow) ? m1.clen : 2; |
| if( ret == null ) |
| ret = new MatrixBlock(rl, cl, false); |
| else |
| ret.reset(rl, cl, false); |
| |
| //execute ternary aggregate function |
| if( op.getNumThreads() > 1 ) |
| ret = LibMatrixAgg.aggregateTernary(m1, m2, m3, ret, op, op.getNumThreads()); |
| else |
| ret = LibMatrixAgg.aggregateTernary(m1, m2, m3, ret, op); |
| |
| if(op.aggOp.existsCorrection() && inCP) |
| ret.dropLastRowsOrColumns(op.aggOp.correction); |
| return ret; |
| } |
| |
| public MatrixBlock uaggouterchainOperations(MatrixBlock mbLeft, MatrixBlock mbRight, MatrixBlock mbOut, BinaryOperator bOp, AggregateUnaryOperator uaggOp) { |
| double bv[] = DataConverter.convertToDoubleVector(mbRight); |
| int bvi[] = null; |
| |
| //process instruction |
| if (LibMatrixOuterAgg.isSupportedUaggOp(uaggOp, bOp)) |
| { |
| if((LibMatrixOuterAgg.isRowIndexMax(uaggOp)) || (LibMatrixOuterAgg.isRowIndexMin(uaggOp))) |
| { |
| bvi = LibMatrixOuterAgg.prepareRowIndices(bv.length, bv, bOp, uaggOp); |
| } else { |
| Arrays.sort(bv); |
| } |
| |
| int iRows = (uaggOp.indexFn instanceof ReduceCol ? mbLeft.getNumRows(): 2); |
| int iCols = (uaggOp.indexFn instanceof ReduceRow ? mbLeft.getNumColumns(): 2); |
| if(mbOut==null) |
| mbOut=new MatrixBlock(iRows, iCols, false); // Output matrix will be dense matrix most of the time. |
| else |
| mbOut.reset(iRows, iCols, false); |
| |
| LibMatrixOuterAgg.aggregateMatrix(mbLeft, mbOut, bv, bvi, bOp, uaggOp); |
| } else |
| throw new DMLRuntimeException("Unsupported operator for unary aggregate operations."); |
| |
| return mbOut; |
| } |
| |
| |
| /** |
| * Invocation from CP instructions. The aggregate is computed on the groups object |
| * against target and weights. |
| * |
| * Notes: |
| * * The computed number of groups is reused for multiple invocations with different target. |
| * * This implementation supports that the target is passed as column or row vector, |
| * in case of row vectors we also use sparse-safe implementations for sparse safe |
| * aggregation operators. |
| * |
| * @param tgt ? |
| * @param wghts ? |
| * @param ret ? |
| * @param ngroups ? |
| * @param op operator |
| * @return matrix block |
| */ |
| public MatrixBlock groupedAggOperations(MatrixValue tgt, MatrixValue wghts, MatrixValue ret, int ngroups, Operator op) { |
| //single-threaded grouped aggregate |
| return groupedAggOperations(tgt, wghts, ret, ngroups, op, 1); |
| } |
| |
	/**
	 * Grouped aggregate with optional multi-threading: this matrix provides
	 * the 1-based group ids (column vector), aggregated over target with
	 * optional weights into a result of ngroups rows.
	 *
	 * @param tgt target values (column or row vector; matrix for certain ops)
	 * @param wghts optional weights (column vector), may be null
	 * @param ret output matrix value (allocated if null)
	 * @param ngroups number of groups; if &lt;= 0, derived from max of this matrix
	 * @param op aggregation operator
	 * @param k number of threads (single-threaded if &lt;= 1)
	 * @return result matrix block with one row per group
	 */
	public MatrixBlock groupedAggOperations(MatrixValue tgt, MatrixValue wghts, MatrixValue ret, int ngroups, Operator op, int k) {
		//setup input matrices
		MatrixBlock target = checkType(tgt);
		MatrixBlock weights = checkType(wghts);
		
		//check valid dimensions
		//(matrix targets are only allowed without weights and with known ngroups)
		boolean validMatrixOp = (weights == null && ngroups>=1);
		if( this.getNumColumns() != 1 || (weights!=null && weights.getNumColumns()!=1) )
			throw new DMLRuntimeException("groupedAggregate can only operate on 1-dimensional column matrices for groups and weights.");
		if( target.getNumColumns() != 1 && op instanceof CMOperator && !validMatrixOp )
			throw new DMLRuntimeException("groupedAggregate can only operate on 1-dimensional column matrices for target (for this aggregation function).");
		if( target.getNumColumns() != 1 && target.getNumRows()!=1 && !validMatrixOp )
			throw new DMLRuntimeException("groupedAggregate can only operate on 1-dimensional column or row matrix for target.");
		if( this.getNumRows() != target.getNumRows() && this.getNumRows() !=Math.max(target.getNumRows(),target.getNumColumns()) || (weights != null && this.getNumRows() != weights.getNumRows()) )
			throw new DMLRuntimeException("groupedAggregate can only operate on matrices with equal dimensions.");
		
		// Determine the number of groups
		if( ngroups <= 0 ) { //reuse if available
			double min = this.min();
			double max = this.max();
			//group ids must be positive integers, so min/max validate the range
			if ( min <= 0 )
				throw new DMLRuntimeException("Invalid value (" + min + ") encountered in 'groups' while computing groupedAggregate");
			if ( max <= 0 )
				throw new DMLRuntimeException("Invalid value (" + max + ") encountered in 'groups' while computing groupedAggregate.");
			ngroups = (int) max;
		}
		
		// Allocate result matrix
		boolean rowVector = (target.getNumRows()==1 && target.getNumColumns()>1);
		MatrixBlock result = checkType(ret);
		boolean result_sparsity = estimateSparsityOnGroupedAgg(rlen, ngroups);
		if(result==null)
			result=new MatrixBlock(ngroups, rowVector?1:target.getNumColumns(), result_sparsity);
		else
			result.reset(ngroups, rowVector?1:target.getNumColumns(), result_sparsity);
		
		//execute grouped aggregate operation
		if( k > 1 )
			LibMatrixAgg.groupedAggregate(this, target, weights, result, ngroups, op, k);
		else
			LibMatrixAgg.groupedAggregate(this, target, weights, result, ngroups, op);
		
		return result;
	}
| |
| public MatrixBlock removeEmptyOperations( MatrixBlock ret, boolean rows, boolean emptyReturn, MatrixBlock select ) { |
| return LibMatrixReorg.rmempty(this, ret, rows, emptyReturn, select); |
| } |
| |
| public MatrixBlock removeEmptyOperations( MatrixBlock ret, boolean rows, boolean emptyReturn) { |
| return removeEmptyOperations(ret, rows, emptyReturn, null); |
| } |
| |
| public MatrixBlock rexpandOperations( MatrixBlock ret, double max, boolean rows, boolean cast, boolean ignore, int k ) { |
| MatrixBlock result = checkType(ret); |
| return LibMatrixReorg.rexpand(this, result, max, rows, cast, ignore, k); |
| } |
| |
| |
	/**
	 * Creates a copy of this matrix in which every occurrence of pattern is
	 * replaced by replacement; a NaN pattern is matched via Double.isNaN
	 * (since NaN != NaN). Returns this matrix itself (no copy) when the
	 * pattern does not occur at all.
	 *
	 * @param result output matrix value (reset and reused)
	 * @param pattern value to replace (may be NaN)
	 * @param replacement value to substitute for the pattern
	 * @return result matrix with pattern replaced
	 */
	@Override
	public MatrixBlock replaceOperations(MatrixValue result, double pattern, double replacement) {
		MatrixBlock ret = checkType(result);
		examSparsity(); //ensure its in the right format
		ret.reset(rlen, clen, sparse);
		//probe early abort conditions
		if( nonZeros == 0 && pattern != 0 )
			return ret;
		if( !containsValue(pattern) )
			return this; //avoid allocation + copy
		
		boolean NaNpattern = Double.isNaN(pattern);
		if( sparse ) //SPARSE
		{
			if( pattern != 0d ) //SPARSE <- SPARSE (sparse-safe)
			{
				//replacing a non-zero pattern preserves sparsity: copy all
				//non-zeros, substituting matches on the fly
				ret.allocateSparseRowsBlock();
				SparseBlock a = sparseBlock;
				SparseBlock c = ret.sparseBlock;
				
				for( int i=0; i<rlen; i++ ) {
					if( !a.isEmpty(i) ) {
						c.allocate(i);
						int apos = a.pos(i);
						int alen = a.size(i);
						int[] aix = a.indexes(i);
						double[] avals = a.values(i);
						for( int j=apos; j<apos+alen; j++ ) {
							double val = avals[j];
							if( val== pattern || (NaNpattern && Double.isNaN(val)) )
								c.append(i, aix[j], replacement);
							else
								c.append(i, aix[j], val);
						}
					}
				}
			}
			else //DENSE <- SPARSE
			{
				//replacing zeros densifies the output
				ret.sparse = false;
				ret.allocateDenseBlock();
				SparseBlock a = sparseBlock;
				double[] c = ret.getDenseBlockValues();
				
				//initialize with replacement (since all 0 values, see SPARSITY_TURN_POINT)
				Arrays.fill(c, replacement);
				
				//overwrite with existing values (via scatter)
				if( a != null ) //check for empty matrix
					for( int i=0, cix=0; i<rlen; i++, cix+=clen ) {
						if( !a.isEmpty(i) ) {
							int apos = a.pos(i);
							int alen = a.size(i);
							int[] aix = a.indexes(i);
							double[] avals = a.values(i);
							for( int j=apos; j<apos+alen; j++ )
								if( avals[ j ] != 0 )
									c[ cix+aix[j] ] = avals[ j ];
						}
					}
			}
		}
		else { //DENSE <- DENSE
			int mn = ret.rlen * ret.clen;
			ret.allocateDenseBlock();
			double[] a = getDenseBlockValues();
			double[] c = ret.getDenseBlockValues();
			for( int i=0; i<mn; i++ ) {
				c[i] = ( a[i]== pattern || (NaNpattern && Double.isNaN(a[i])) ) ?
					replacement : a[i];
			}
		}
		
		ret.recomputeNonZeros();
		ret.examSparsity();
		
		return ret;
	}
| |
	/**
	 * Extracts the lower or upper triangle of this matrix into ret, either
	 * copying the original values (values=true) or writing 1 for every
	 * selected position (values=false, R semantics).
	 *
	 * NOTE(review): in the sparse branch the local nnz stays 0 although the
	 * comment says nnz is maintained internally (via appendValue); the
	 * unconditional setNonZeros(nnz) below appears to clobber it - confirm
	 * whether examSparsity subsequently repairs the count.
	 *
	 * @param ret output matrix block (reset to this block's dimensions)
	 * @param lower true for lower triangle, false for upper
	 * @param diag true to include the main diagonal
	 * @param values true to copy values, false to emit 1s
	 * @return ret containing the extracted triangle
	 */
	public MatrixBlock extractTriangular(MatrixBlock ret, boolean lower, boolean diag, boolean values) {
		ret.reset(rlen, clen, sparse);
		if( isEmptyBlock(false) )
			return ret; //sparse-safe
		ret.allocateBlock();
		
		long nnz = 0;
		if( sparse ) { //SPARSE
			SparseBlock a = sparseBlock;
			SparseBlock c = ret.sparseBlock;
			for( int i=0; i<rlen; i++ ) {
				if( a.isEmpty(i) ) continue;
				//column range [jbeg,jend) of the triangle in row i
				int jbeg = Math.min(lower ? 0 : (diag ? i : i+1), clen);
				int jend = Math.min(lower ? (diag ? i+1 : i) : clen, clen);
				if( values ) {
					//locate the sparse index range covering [jbeg,jend)
					int k1 = a.posFIndexGTE(i, jbeg);
					int k2 = a.posFIndexGTE(i, jend);
					k1 = (k1 >= 0) ? k1 : a.size(i);
					k2 = (k2 >= 0) ? k2 : a.size(i);
					int apos = a.pos(i);
					int[] aix = a.indexes(i);
					double[] avals = a.values(i);
					c.allocate(i, k2-k1);
					for( int k=apos+k1; k<apos+k2; k++ )
						ret.appendValue(i, aix[k], avals[k]);
				}
				else {
					c.allocate(i, jend-jbeg);
					for( int j=jbeg; j<jend; j++ )
						ret.appendValue(i, j, 1);
				}
			}
			//nnz maintained internally
		}
		else { //DENSE <- DENSE
			DenseBlock a = denseBlock;
			DenseBlock c = ret.getDenseBlock();
			for(int i = 0; i < rlen; i++) {
				int jbeg = Math.min(lower ? 0 : (diag ? i : i+1), clen);
				int jend = Math.min(lower ? (diag ? i+1 : i) : clen, clen);
				double[] avals = a.values(i), cvals = c.values(i);
				int aix = a.pos(i,jbeg), cix = c.pos(i,jbeg);
				if( values ) {
					System.arraycopy(avals, aix, cvals, cix, jend-jbeg);
					nnz += UtilFunctions.countNonZeros(avals, aix, jend-jbeg);
				}
				else { //R semantics full reset, not just nnz
					Arrays.fill(cvals, cix, cix+(jend-jbeg), 1);
					nnz += (jend-jbeg);
				}
			}
		}
		ret.setNonZeros(nnz);
		ret.examSparsity();
		return ret;
	}
| |
| /** |
| * D = ctable(A,v2,W) |
| * this <- A; scalarThat <- v2; that2 <- W; result <- D |
| * |
| * (i1,j1,v1) from input1 (this) |
| * (v2) from sclar_input2 (scalarThat) |
| * (i3,j3,w) from input3 (that2) |
| */ |
| @Override |
| public void ctableOperations(Operator op, double scalarThat, |
| MatrixValue that2Val, CTableMap resultMap, MatrixBlock resultBlock) { |
| MatrixBlock that2 = checkType(that2Val); |
| CTable ctable = CTable.getCTableFnObject(); |
| double v2 = scalarThat; |
| |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) { |
| double v1 = this.quickGetValue(i, j); |
| double w = that2.quickGetValue(i, j); |
| ctable.execute(v1, v2, w, false, resultMap, resultBlock); |
| } |
| |
| //maintain nnz (if necessary) |
| if( resultBlock!=null ) |
| resultBlock.recomputeNonZeros(); |
| } |
| |
| /** |
| * D = ctable(A,v2,w) |
| * this <- A; scalar_that <- v2; scalar_that2 <- w; result <- D |
| * |
| * (i1,j1,v1) from input1 (this) |
| * (v2) from sclar_input2 (scalarThat) |
| * (w) from scalar_input3 (scalarThat2) |
| */ |
| @Override |
| public void ctableOperations(Operator op, double scalarThat, |
| double scalarThat2, CTableMap resultMap, MatrixBlock resultBlock) |
| { |
| CTable ctable = CTable.getCTableFnObject(); |
| double v2 = scalarThat; |
| double w = scalarThat2; |
| |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) { |
| double v1 = this.quickGetValue(i, j); |
| ctable.execute(v1, v2, w, false, resultMap, resultBlock); |
| } |
| |
| //maintain nnz (if necessary) |
| if( resultBlock!=null ) |
| resultBlock.recomputeNonZeros(); |
| } |
| |
| /** |
| * Specific ctable case of ctable(seq(...),X), where X is the only |
| * matrix input. The 'left' input parameter specifies if the seq appeared |
| * on the left, otherwise it appeared on the right. |
| * |
| */ |
| @Override |
| public void ctableOperations(Operator op, MatrixIndexes ix1, double scalarThat, |
| boolean left, int blen, CTableMap resultMap, MatrixBlock resultBlock) |
| { |
| CTable ctable = CTable.getCTableFnObject(); |
| double w = scalarThat; |
| int offset = (int) ((ix1.getRowIndex()-1)*blen); |
| |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) { |
| double v1 = this.quickGetValue(i, j); |
| if( left ) |
| ctable.execute(offset+i+1, v1, w, false, resultMap, resultBlock); |
| else |
| ctable.execute(v1, offset+i+1, w, false, resultMap, resultBlock); |
| } |
| |
| //maintain nnz (if necessary) |
| if( resultBlock!=null ) |
| resultBlock.recomputeNonZeros(); |
| } |
| |
| /** |
| * D = ctable(A,B,w) |
| * this <- A; that <- B; scalar_that2 <- w; result <- D |
| * |
| * (i1,j1,v1) from input1 (this) |
| * (i1,j1,v2) from input2 (that) |
| * (w) from scalar_input3 (scalarThat2) |
| * |
| * NOTE: This method supports both vectors and matrices. In case of matrices and ignoreZeros=true |
| * we can also use a sparse-safe implementation |
| */ |
| @Override |
| public void ctableOperations(Operator op, MatrixValue thatVal, double scalarThat2, boolean ignoreZeros, |
| CTableMap resultMap, MatrixBlock resultBlock) |
| { |
| //setup ctable computation |
| MatrixBlock that = checkType(thatVal); |
| CTable ctable = CTable.getCTableFnObject(); |
| double w = scalarThat2; |
| |
| if( ignoreZeros //SPARSE-SAFE & SPARSE INPUTS |
| && this.sparse && that.sparse ) |
| { |
| //note: only used if both inputs have aligned zeros, which |
| //allows us to infer that the nnz both inputs are equivalent |
| |
| //early abort on empty blocks possible |
| if( this.isEmptyBlock(false) && that.isEmptyBlock(false) ) |
| return; |
| |
| SparseBlock a = this.sparseBlock; |
| SparseBlock b = that.sparseBlock; |
| for( int i=0; i<rlen; i++ ) { |
| if( a.isEmpty(i) ) continue; |
| int alen = a.size(i); |
| int apos = a.pos(i); |
| double[] avals = a.values(i); |
| int bpos = b.pos(i); |
| double[] bvals = b.values(i); |
| for( int j=0; j<alen; j++ ) |
| ctable.execute(avals[apos+j], bvals[bpos+j], |
| w, ignoreZeros, resultMap, resultBlock); |
| } |
| } |
| else //SPARSE-UNSAFE | GENERIC INPUTS |
| { |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) { |
| double v1 = this.quickGetValue(i, j); |
| double v2 = that.quickGetValue(i, j); |
| ctable.execute(v1, v2, w, ignoreZeros, resultMap, resultBlock); |
| } |
| } |
| |
| //maintain nnz (if necessary) |
| if( resultBlock!=null ) |
| resultBlock.recomputeNonZeros(); |
| } |
| |
| /** |
| * @param thatMatrix matrix value |
| * @param thatScalar scalar double |
| * @param resultBlock result matrix block |
| * @param updateClen when this matrix already has the desired number of columns updateClen can be set to false |
| * @return resultBlock |
| */ |
| public MatrixBlock ctableSeqOperations(MatrixValue thatMatrix, double thatScalar, MatrixBlock resultBlock, boolean updateClen) { |
| MatrixBlock that = checkType(thatMatrix); |
| CTable ctable = CTable.getCTableFnObject(); |
| double w = thatScalar; |
| |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| //resultBlock guaranteed to be allocated for ctableexpand |
| //each row in resultBlock will be allocated and will contain exactly one value |
| int maxCol = 0; |
| for( int i=0; i<rlen; i++ ) { |
| double v2 = that.quickGetValue(i, 0); |
| maxCol = ctable.execute(i+1, v2, w, maxCol, resultBlock); |
| } |
| |
| //update meta data (initially unknown number of columns) |
| //note: nnz maintained in ctable (via quickset) |
| if(updateClen) { |
| resultBlock.clen = maxCol; |
| } |
| return resultBlock; |
| } |
| |
| /** |
| * D = ctable(seq,A,w) |
| * this <- seq; thatMatrix <- A; thatScalar <- w; result <- D |
| * |
| * (i1,j1,v1) from input1 (this) |
| * (i1,j1,v2) from input2 (that) |
| * (w) from scalar_input3 (scalarThat2) |
| * |
| * @param thatMatrix matrix value |
| * @param thatScalar scalar double |
| * @param resultBlock result matrix block |
| * @return resultBlock |
| */ |
| public MatrixBlock ctableSeqOperations(MatrixValue thatMatrix, double thatScalar, MatrixBlock resultBlock) { |
| return ctableSeqOperations(thatMatrix, thatScalar, resultBlock, true); |
| } |
| |
| /** |
| * D = ctable(A,B,W) |
| * this <- A; that <- B; that2 <- W; result <- D |
| * |
| * (i1,j1,v1) from input1 (this) |
| * (i1,j1,v2) from input2 (that) |
| * (i1,j1,w) from input3 (that2) |
| * |
| * @param op operator |
| * @param thatVal matrix value 1 |
| * @param that2Val matrix value 2 |
| * @param resultMap table map |
| */ |
| public void ctableOperations(Operator op, MatrixValue thatVal, MatrixValue that2Val, CTableMap resultMap) { |
| ctableOperations(op, thatVal, that2Val, resultMap, null); |
| } |
| |
| @Override |
| public void ctableOperations(Operator op, MatrixValue thatVal, MatrixValue that2Val, CTableMap resultMap, MatrixBlock resultBlock) { |
| MatrixBlock that = checkType(thatVal); |
| MatrixBlock that2 = checkType(that2Val); |
| CTable ctable = CTable.getCTableFnObject(); |
| |
| //sparse-unsafe ctable execution |
| //(because input values of 0 are invalid and have to result in errors) |
| if(resultBlock == null) |
| { |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) |
| { |
| double v1 = this.quickGetValue(i, j); |
| double v2 = that.quickGetValue(i, j); |
| double w = that2.quickGetValue(i, j); |
| ctable.execute(v1, v2, w, false, resultMap); |
| } |
| } |
| else |
| { |
| for( int i=0; i<rlen; i++ ) |
| for( int j=0; j<clen; j++ ) |
| { |
| double v1 = this.quickGetValue(i, j); |
| double v2 = that.quickGetValue(i, j); |
| double w = that2.quickGetValue(i, j); |
| ctable.execute(v1, v2, w, false, resultBlock); |
| } |
| resultBlock.recomputeNonZeros(); |
| } |
| } |
| |
| public MatrixBlock quaternaryOperations(QuaternaryOperator qop, MatrixBlock um, MatrixBlock vm, MatrixBlock wm, MatrixBlock out) { |
| return quaternaryOperations(qop, um, vm, wm, out, 1); |
| } |
| |
	/**
	 * Core quaternary operations (wsloss, wsigmoid, wdivmm, wcemm, wumm)
	 * over X=this, factor matrices U and V, and an optional fourth input wm,
	 * dispatched on which weight type of the operator is set.
	 *
	 * @param qop quaternary operator (exactly one wtype expected to be set)
	 * @param U left factor matrix (rows match X rows)
	 * @param V right factor matrix (rows match X columns)
	 * @param wm optional fourth input (weights/eps), used when the wtype has four inputs
	 * @param out output matrix block (reset according to wtype)
	 * @param k number of threads (single-threaded if &lt;= 1)
	 * @return result matrix block R
	 */
	public MatrixBlock quaternaryOperations(QuaternaryOperator qop, MatrixBlock U, MatrixBlock V, MatrixBlock wm, MatrixBlock out, int k) {
		//check input dimensions
		if( getNumRows() != U.getNumRows() )
			throw new DMLRuntimeException("Dimension mismatch rows on quaternary operation: "+getNumRows()+"!="+U.getNumRows());
		if( getNumColumns() != V.getNumRows() )
			throw new DMLRuntimeException("Dimension mismatch columns quaternary operation: "+getNumColumns()+"!="+V.getNumRows());
		
		//check input data types
		MatrixBlock X = this;
		MatrixBlock R = checkType(out);
		
		//prepare intermediates and output
		//wsloss/wcemm produce scalars, wsigmoid/wumm are cell-wise over X,
		//wdivmm derives its output characteristics from the operator
		if( qop.wtype1 != null || qop.wtype4 != null )
			R.reset(1, 1, false);
		else if( qop.wtype2 != null || qop.wtype5 != null )
			R.reset(rlen, clen, sparse);
		else if( qop.wtype3 != null ) {
			DataCharacteristics mc = qop.wtype3.computeOutputCharacteristics(X.rlen, X.clen, U.clen);
			R.reset( (int)mc.getRows(), (int)mc.getCols(), qop.wtype3.isBasic()?X.isInSparseFormat():false);
		}
		
		//core block operation
		if( qop.wtype1 != null ){ //wsloss
			MatrixBlock W = qop.wtype1.hasFourInputs() ? checkType(wm) : null;
			if( k > 1 )
				LibMatrixMult.matrixMultWSLoss(X, U, V, W, R, qop.wtype1, k);
			else
				LibMatrixMult.matrixMultWSLoss(X, U, V, W, R, qop.wtype1);
		}
		else if( qop.wtype2 != null ){ //wsigmoid
			if( k > 1 )
				LibMatrixMult.matrixMultWSigmoid(X, U, V, R, qop.wtype2, k);
			else
				LibMatrixMult.matrixMultWSigmoid(X, U, V, R, qop.wtype2);
		}
		else if( qop.wtype3 != null ){ //wdivmm
			//note: for wdivmm-minus X and W interchanged because W always present
			MatrixBlock W = qop.wtype3.hasFourInputs() ? checkType(wm) : null;
			if( qop.getScalar() != 0 )
				W = new MatrixBlock(qop.getScalar());
			if( k > 1 )
				LibMatrixMult.matrixMultWDivMM(X, U, V, W, R, qop.wtype3, k);
			else
				LibMatrixMult.matrixMultWDivMM(X, U, V, W, R, qop.wtype3);
		}
		else if( qop.wtype4 != null ){ //wcemm
			MatrixBlock W = qop.wtype4.hasFourInputs() ? checkType(wm) : null;
			//eps comes from a 1x1 weight matrix if provided, otherwise from the operator scalar
			double eps = (W != null && W.getNumRows() == 1 && W.getNumColumns() == 1) ? W.quickGetValue(0, 0) : qop.getScalar();
			
			if( k > 1 )
				LibMatrixMult.matrixMultWCeMM(X, U, V, eps, R, qop.wtype4, k);
			else
				LibMatrixMult.matrixMultWCeMM(X, U, V, eps, R, qop.wtype4);
		}
		else if( qop.wtype5 != null ){ //wumm
			if( k > 1 )
				LibMatrixMult.matrixMultWuMM(X, U, V, R, qop.wtype5, qop.fn, k);
			else
				LibMatrixMult.matrixMultWuMM(X, U, V, R, qop.wtype5, qop.fn);
		}
		
		return R;
	}
| |
| //////// |
| // Data Generation Methods |
| // (rand, sequence) |
| |
| /** |
| * Function to generate the random matrix with specified dimensions (block sizes are not specified). |
| * |
| * |
| * @param rows number of rows |
| * @param cols number of columns |
| * @param sparsity sparsity as a percentage |
| * @param min minimum value |
| * @param max maximum value |
| * @param pdf pdf |
| * @param seed random seed |
| * @return matrix block |
| */ |
| public static MatrixBlock randOperations(int rows, int cols, double sparsity, double min, double max, String pdf, long seed) { |
| return randOperations(rows, cols, sparsity, min, max, pdf, seed, 1); |
| } |
| |
| /** |
| * Function to generate the random matrix with specified dimensions (block sizes are not specified). |
| * |
| * @param rows number of rows |
| * @param cols number of columns |
| * @param sparsity sparsity as a percentage |
| * @param min minimum value |
| * @param max maximum value |
| * @param pdf pdf |
| * @param seed random seed |
| * @param k ? |
| * @return matrix block |
| */ |
| public static MatrixBlock randOperations(int rows, int cols, double sparsity, double min, double max, String pdf, long seed, int k) { |
| RandomMatrixGenerator rgen = new RandomMatrixGenerator(pdf, rows, cols, |
| ConfigurationManager.getBlocksize(), sparsity, min, max); |
| |
| if (k > 1) |
| return randOperations(rgen, seed, k); |
| else |
| return randOperations(rgen, seed); |
| } |
| |
| /** |
| * Function to generate the random matrix with specified dimensions and block dimensions. |
| * |
| * @param rgen random matrix generator |
| * @param seed seed value |
| * @return matrix block |
| */ |
| public static MatrixBlock randOperations(RandomMatrixGenerator rgen, long seed) { |
| return randOperations(rgen, seed, 1); |
| } |
| |
| /** |
| * Function to generate the random matrix with specified dimensions and block dimensions. |
| * |
| * @param rgen random matrix generator |
| * @param seed seed value |
| * @param k ? |
| * @return matrix block |
| */ |
| public static MatrixBlock randOperations(RandomMatrixGenerator rgen, long seed, int k) { |
| MatrixBlock out = new MatrixBlock(); |
| Well1024a bigrand = null; |
| |
| //setup seeds and nnz per block |
| if( !LibMatrixDatagen.isShortcutRandOperation(rgen._min, rgen._max, rgen._sparsity, rgen._pdf) ) |
| bigrand = LibMatrixDatagen.setupSeedsForRand(seed); |
| |
| //generate rand data |
| if (k > 1) |
| out.randOperationsInPlace(rgen, bigrand, -1, k); |
| else |
| out.randOperationsInPlace(rgen, bigrand, -1); |
| |
| return out; |
| } |
| |
| /** |
| * Function to generate a matrix of random numbers. This is invoked both |
| * from CP as well as from MR. In case of CP, it generates an entire matrix |
| * block-by-block. A <code>bigrand</code> is passed so that block-level |
| * seeds are generated internally. In case of MR, it generates a single |
| * block for given block-level seed <code>bSeed</code>. |
| * |
| * When pdf="uniform", cell values are drawn from uniform distribution in |
| * range <code>[min,max]</code>. |
| * |
| * When pdf="normal", cell values are drawn from standard normal |
| * distribution N(0,1). The range of generated values will always be |
| * (-Inf,+Inf). |
| * |
| * @param rgen random matrix generator |
| * @param bigrand ? |
| * @param bSeed seed value |
| * @return matrix block |
| */ |
| public MatrixBlock randOperationsInPlace(RandomMatrixGenerator rgen, Well1024a bigrand, long bSeed ) { |
| LibMatrixDatagen.generateRandomMatrix(this, rgen, bigrand, bSeed); |
| return this; |
| } |
| |
| /** |
| * Function to generate a matrix of random numbers. This is invoked both |
| * from CP as well as from MR. In case of CP, it generates an entire matrix |
| * block-by-block. A <code>bigrand</code> is passed so that block-level |
| * seeds are generated internally. In case of MR, it generates a single |
| * block for given block-level seed <code>bSeed</code>. |
| * |
| * When pdf="uniform", cell values are drawn from uniform distribution in |
| * range <code>[min,max]</code>. |
| * |
| * When pdf="normal", cell values are drawn from standard normal |
| * distribution N(0,1). The range of generated values will always be |
| * (-Inf,+Inf). |
| * |
| * @param rgen random matrix generator |
| * @param bigrand ? |
| * @param bSeed seed value |
| * @param k ? |
| * @return matrix block |
| */ |
| public MatrixBlock randOperationsInPlace(RandomMatrixGenerator rgen, |
| Well1024a bigrand, long bSeed, int k) { |
| LibMatrixDatagen.generateRandomMatrix(this, rgen, bigrand, bSeed, k); |
| return this; |
| } |
| |
| /** |
| * Method to generate a sequence according to the given parameters. The |
| * generated sequence is always in dense format. |
| * |
| * Both end points specified <code>from</code> and <code>to</code> must be |
| * included in the generated sequence i.e., [from,to] both inclusive. Note |
| * that, <code>to</code> is included only if (to-from) is perfectly |
| * divisible by <code>incr</code>. |
| * |
| * For example, seq(0,1,0.5) generates (0.0 0.5 1.0) |
| * whereas seq(0,1,0.6) generates (0.0 0.6) but not (0.0 0.6 1.0) |
| * |
| * @param from ? |
| * @param to ? |
| * @param incr ? |
| * @return matrix block |
| */ |
| public static MatrixBlock seqOperations(double from, double to, double incr) { |
| MatrixBlock out = new MatrixBlock(); |
| LibMatrixDatagen.generateSequence( out, from, to, incr ); |
| return out; |
| } |
| |
| public MatrixBlock seqOperationsInPlace(double from, double to, double incr) { |
| LibMatrixDatagen.generateSequence( this, from, to, incr ); |
| return this; |
| } |
| |
| public static MatrixBlock sampleOperations(long range, int size, boolean replace, long seed) { |
| MatrixBlock out = new MatrixBlock(); |
| LibMatrixDatagen.generateSample( out, range, size, replace, seed ); |
| return out; |
| } |
| |
| //////// |
| // Misc methods |
| |
| private static MatrixBlock checkType(MatrixValue block) { |
| if( block!=null && !(block instanceof MatrixBlock)) |
| throw new RuntimeException("Unsupported matrix value: " |
| + block.getClass().getSimpleName()); |
| return (MatrixBlock) block; |
| } |
| |
| /** |
| * Indicates if concurrent modifications of disjoint rows are thread-safe. |
| * |
| * @return true if thread-safe |
| */ |
| public boolean isThreadSafe() { |
| return !sparse || ((sparseBlock != null) ? sparseBlock.isThreadSafe() : |
| DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR); //only MCSR thread-safe |
| } |
| |
| /** |
| * Indicates if concurrent modifications of disjoint rows are thread-safe. |
| * |
| * @param sparse true if sparse |
| * @return true if ? |
| */ |
| public static boolean isThreadSafe(boolean sparse) { |
| return !sparse || DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR; //only MCSR thread-safe |
| } |
| |
| /** |
| * Checks for existing NaN values in the matrix block. |
| * @throws DMLRuntimeException if the blocks contains at least one NaN. |
| */ |
| public void checkNaN() { |
| if( isEmptyBlock(false) ) |
| return; |
| if( sparse ) { |
| SparseBlock sblock = sparseBlock; |
| for(int i=0; i<rlen; i++) { |
| if( sblock.isEmpty(i) ) continue; |
| int alen = sblock.size(i); |
| int apos = sblock.pos(i); |
| int[] aix = sblock.indexes(i); |
| double[] avals = sblock.values(i); |
| for(int k=apos; k<apos+alen; k++) { |
| if( Double.isNaN(avals[k]) ) |
| throw new DMLRuntimeException("NaN encountered at position ["+i+","+aix[k]+"]."); |
| } |
| } |
| } |
| else { |
| DenseBlock dblock = denseBlock; |
| for(int i=0; i<rlen; i++) { |
| int aix = dblock.pos(i); |
| double[] avals = dblock.values(i); |
| for(int j=0; j<clen; j++) |
| if( Double.isNaN(avals[aix+j]) ) |
| throw new DMLRuntimeException("NaN encountered at position ["+i+","+j+"]."); |
| } |
| } |
| } |
| |
| @Override |
| public int compareTo(Object arg0) { |
| throw new RuntimeException("CompareTo should never be called for matrix blocks."); |
| } |
| |
| @Override |
| public boolean equals(Object arg0) { |
| throw new RuntimeException("equals should never be called for matrix blocks."); |
| } |
| |
| @Override |
| public int hashCode() { |
| throw new RuntimeException("HashCode should never be called for matrix blocks."); |
| } |
| |
| @Override |
| public String toString() |
| { |
| StringBuilder sb = new StringBuilder(); |
| |
| sb.append("sparse? = "); |
| sb.append(sparse); |
| sb.append("\n"); |
| |
| sb.append("nonzeros = "); |
| sb.append(nonZeros); |
| sb.append("\n"); |
| |
| sb.append("size: "); |
| sb.append(rlen); |
| sb.append(" X "); |
| sb.append(clen); |
| sb.append("\n"); |
| |
| if( sparse && sparseBlock != null ) { |
| //overloaded implementation in sparse blocks |
| sb.append(sparseBlock.toString()); |
| } |
| else if( !sparse && denseBlock!=null ) { |
| //overloaded implementation in dense blocks |
| sb.append(denseBlock.toString()); |
| } |
| |
| return sb.toString(); |
| } |
| |
| |
| /////////////////////////// |
| // Helper classes |
| |
| public static class SparsityEstimate |
| { |
| public long estimatedNonZeros=0; |
| public boolean sparse=false; |
| public SparsityEstimate(boolean sps, long nnzs) |
| { |
| sparse=sps; |
| estimatedNonZeros=nnzs; |
| } |
| public SparsityEstimate(){} |
| } |
| } |