src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.runtime.matrix.data;

 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseBlockCSR;
 import org.apache.sysds.runtime.data.SparseRowVector;
 import org.apache.sysds.runtime.functionobjects.DiagIndex;
 import org.apache.sysds.runtime.functionobjects.RevIndex;
 import org.apache.sysds.runtime.functionobjects.SortIndex;
 import org.apache.sysds.runtime.functionobjects.SwapIndex;
 import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
 import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysds.runtime.meta.DataCharacteristics;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.runtime.util.SortUtils;
 import org.apache.sysds.runtime.util.UtilFunctions;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.stream.Collectors;

 /**
  * MB:
  * Library for selected matrix reorg operations including special cases
  * and all combinations of dense and sparse representations.
  *
  * Current list of supported operations:
  *  - reshape,
  *  - r' (transpose),
  *  - rdiag (diagV2M/diagM2V),
  *  - rsort (sorting data/indexes)
  *  - rmempty (remove empty)
  *  - rexpand (outer/table-seq expansion)
  */
 public class LibMatrixReorg
 {
 	//minimum number of elements for multi-threaded execution
 	public static final long PAR_NUMCELL_THRESHOLD = 1024*1024; //1M

 	// SORTING threshold
 	public static final int PAR_NUMCELL_THRESHOLD_SORT = 1024;

 	//allow shallow dense/sparse copy for unchanged data (which is
 	//safe due to copy-on-write and safe update-in-place handling)
 	public static final boolean SHALLOW_COPY_REORG = true;

 	//use csr instead of mcsr sparse block for rexpand columns / diag v2m
 	public static final boolean SPARSE_OUTPUTS_IN_CSR = true;

 	private enum ReorgType {
 		TRANSPOSE,
 		REV,
 		DIAG,
 		RESHAPE,
 		SORT,
 		INVALID,
 	}

 	private LibMatrixReorg() {
 		//prevent instantiation via private constructor
 	}

 	/////////////////////////
 	// public interface    //
 	/////////////////////////

 	public static boolean isSupportedReorgOperator( ReorgOperator op )
 	{
 		return (getReorgType(op) != ReorgType.INVALID);
 	}

 	public static MatrixBlock reorg( MatrixBlock in, MatrixBlock out, ReorgOperator op ) {
 		ReorgType type = getReorgType(op);

 		switch( type ) {
 			case TRANSPOSE:
 				if( op.getNumThreads() > 1 )
 					return transpose(in, out, op.getNumThreads());
 				else
 					return transpose(in, out);
 			case REV:
 				return rev(in, out);
 			case DIAG:
 				return diag(in, out);
 			case SORT:
 				SortIndex ix = (SortIndex) op.fn;
 				if (op.getNumThreads() > 1)
 					return sort(in, out, ix.getCols(), ix.getDecreasing(), ix.getIndexReturn(), op.getNumThreads());
 				else
 					return sort(in, out, ix.getCols(), ix.getDecreasing(), ix.getIndexReturn());
 			default:
 				throw new DMLRuntimeException("Unsupported reorg operator: "+op.fn);
 		}
 	}

 	public static MatrixBlock transpose( MatrixBlock in, MatrixBlock out ) {
 		//sparse-safe operation
 		if( in.isEmptyBlock(false) )
 			return out;

 		//set basic meta data
 		out.nonZeros = in.nonZeros;

 		//shallow dense vector transpose (w/o result allocation)
 		//since the physical representation of dense vectors is always the same,
 		//we don't need to create a copy, given our copy on write semantics.
 		//however, note that with update in-place this would be an invalid optimization
 		if( SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen==1 || in.clen==1)  ) {
 			out.denseBlock = DenseBlockFactory.createDenseBlock(in.getDenseBlockValues(), in.clen, in.rlen);
 			return out;
 		}

 		//Timing time = new Timing(true);

 		//allocate output arrays (if required)
 		if( out.sparse )
 			out.allocateSparseRowsBlock(false);
 		else
 			out.allocateDenseBlock(false);

 		//execute transpose operation
 		if( !in.sparse && !out.sparse )
 			transposeDenseToDense( in, out, 0, in.rlen, 0, in.clen );
 		else if( in.sparse && out.sparse )
 			transposeSparseToSparse( in, out, 0, in.rlen, 0, in.clen,
 				countNnzPerColumn(in, 0, in.rlen));
 		else if( in.sparse )
 			transposeSparseToDense( in, out, 0, in.rlen, 0, in.clen );
 		else
 			transposeDenseToSparse( in, out );

 		//System.out.println("r' ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms.");

 		return out;
 	}

 	public static MatrixBlock transpose( MatrixBlock in, MatrixBlock out, int k ) {
 		//redirect small or special cases to sequential execution
 		if( in.isEmptyBlock(false) || (in.rlen * in.clen < PAR_NUMCELL_THRESHOLD) || k == 1
 			|| (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen==1 || in.clen==1) )
 			|| (in.sparse && !out.sparse && in.rlen==1) || (!in.sparse && out.sparse && in.rlen==1)
 			|| (!in.sparse && out.sparse) || !out.isThreadSafe())
 		{
 			return transpose(in, out);
 		}

 		//Timing time = new Timing(true);

 		//set meta data and allocate output arrays (if required)
 		out.nonZeros = in.nonZeros;
 		if( out.sparse )
 			out.allocateSparseRowsBlock(false);
 		else
 			out.allocateDenseBlock(false);

 		//core multi-threaded transpose
 		try {
 			ExecutorService pool = CommonThreadPool.get(k);
 			//pre-processing (compute nnz per column once for sparse)
 			int[] cnt = null;
 			if( in.sparse && out.sparse ) {
 				ArrayList<CountNnzTask> tasks = new ArrayList<>();
 				int blklen = (int)(Math.ceil((double)in.rlen/k));
 				for( int i=0; i<k & i*blklen<in.rlen; i++ )
 					tasks.add(new CountNnzTask(in, i*blklen, Math.min((i+1)*blklen, in.rlen)));
 				List<Future<int[]>> rtasks = pool.invokeAll(tasks);
 				for( Future<int[]> rtask : rtasks )
 					cnt = mergeNnzCounts(cnt, rtask.get());
 			}
 			//compute actual transpose and check for errors
 			ArrayList<TransposeTask> tasks = new ArrayList<>();
 			boolean row = (in.sparse || in.rlen >= in.clen) && !out.sparse;
 			int len = row ? in.rlen : in.clen;
 			int blklen = (int)(Math.ceil((double)len/k));
 			blklen += (!out.sparse && (blklen%8)!=0) ? 8-blklen%8 : 0;
 			for( int i=0; i<k & i*blklen<len; i++ )
 				tasks.add(new TransposeTask(in, out, row, i*blklen, Math.min((i+1)*blklen, len), cnt));
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
 			for( Future<Object> task : taskret )
 				task.get();
 		}
 		catch(Exception ex) {
 			throw new DMLRuntimeException(ex);
 		}

 		//System.out.println("r' k="+k+" ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms.");

 		return out;
 	}

 	public static MatrixBlock rev( MatrixBlock in, MatrixBlock out ) {
 		//Timing time = new Timing(true);

 		//sparse-safe operation
 		if( in.isEmptyBlock(false) )
 			return out;

 		//special case: row vector
 		if( in.rlen == 1 ) {
 			out.copy(in);
 			return out;
 		}

 		if( in.sparse )
 			reverseSparse( in, out );
 		else
 			reverseDense( in, out );

 		//System.out.println("rev ("+in.rlen+", "+in.clen+", "+in.sparse+") in "+time.stop()+" ms.");

 		return out;
 	}

 	public static void rev( IndexedMatrixValue in, long rlen, int blen, ArrayList<IndexedMatrixValue> out ) {
 		//input block reverse
 		MatrixIndexes inix = in.getIndexes();
 		MatrixBlock inblk = (MatrixBlock) in.getValue();
 		MatrixBlock tmpblk = rev(inblk, new MatrixBlock(inblk.getNumRows(), inblk.getNumColumns(), inblk.isInSparseFormat()));

 		//split and expand block if necessary (at most 2 blocks)
 		if( rlen % blen == 0 ) //special case: aligned blocks
 		{
 			int nrblks = (int)Math.ceil((double)rlen/blen);
 			out.add(new IndexedMatrixValue(
 					new MatrixIndexes(nrblks-inix.getRowIndex()+1, inix.getColumnIndex()), tmpblk));
 		}
 		else //general case: unaligned blocks
 		{
 			//compute target positions and sizes
 			long pos1 = rlen - UtilFunctions.computeCellIndex(inix.getRowIndex(), blen, tmpblk.getNumRows()-1) + 1;
 			long pos2 = pos1 + tmpblk.getNumRows() - 1;
 			int ipos1 = UtilFunctions.computeCellInBlock(pos1, blen);
 			int iposCut = tmpblk.getNumRows() - ipos1 - 1;
 			int blkix1 = (int)UtilFunctions.computeBlockIndex(pos1, blen);
 			int blkix2 = (int)UtilFunctions.computeBlockIndex(pos2, blen);
 			int blklen1 = UtilFunctions.computeBlockSize(rlen, blkix1, blen);
 			int blklen2 = UtilFunctions.computeBlockSize(rlen, blkix2, blen);

 			//slice first block
 			MatrixIndexes outix1 = new MatrixIndexes(blkix1, inix.getColumnIndex());
 			MatrixBlock outblk1 = new MatrixBlock(blklen1, inblk.getNumColumns(), inblk.isInSparseFormat());
 			MatrixBlock tmp1 = tmpblk.slice(0, iposCut);
 			outblk1.leftIndexingOperations(tmp1, ipos1, ipos1+tmp1.getNumRows()-1,
 				0, tmpblk.getNumColumns()-1, outblk1, UpdateType.INPLACE_PINNED);
 			out.add(new IndexedMatrixValue(outix1, outblk1));

 			//slice second block (if necessary)
 			if( blkix1 != blkix2 ) {
 				MatrixIndexes outix2 = new MatrixIndexes(blkix2, inix.getColumnIndex());
 				MatrixBlock outblk2 = new MatrixBlock(blklen2, inblk.getNumColumns(), inblk.isInSparseFormat());
 				MatrixBlock tmp2 = tmpblk.slice(iposCut+1, tmpblk.getNumRows()-1);
 				outblk2.leftIndexingOperations(tmp2, 0, tmp2.getNumRows()-1, 0, tmpblk.getNumColumns()-1, outblk2, UpdateType.INPLACE_PINNED);
 				out.add(new IndexedMatrixValue(outix2, outblk2));
 			}
 		}
 	}

 	public static MatrixBlock diag( MatrixBlock in, MatrixBlock out ) {
 		//Timing time = new Timing(true);

 		//sparse-safe operation
 		if( in.isEmptyBlock(false) )
 			return out;

 		int rlen = in.rlen;
 		int clen = in.clen;

 		if( clen == 1 ) //diagV2M
 			diagV2M( in, out );
 		else if ( rlen == clen ) //diagM2V
 			diagM2V( in, out );
 		else
 			throw new DMLRuntimeException("Reorg diagM2V requires squared block input. ("+rlen+", "+clen+")");

 		//System.out.println("rdiag ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms.");

 		return out;
 	}

 	public static MatrixBlock sort(MatrixBlock in, MatrixBlock out, int[] by, boolean desc, boolean ixret) {
 		return sort(in,out,by,desc,ixret,1);
 	}

 	/**
 	 * @param in Input matrix to sort
 	 * @param out Output matrix where the sorted input is inserted to
 	 * @param by The Ordering parameter
 	 * @param desc A boolean, specifying if it should be descending order.
 	 * @param ixret A boolean, specifying if the return should be the sorted indexes.
 	 * @param k Number of parallel threads
 	 * @return The sorted out matrix.
 	 */
 	public static MatrixBlock sort(MatrixBlock in, MatrixBlock out, int[] by, boolean desc, boolean ixret, int k) {
 		//Timing time = new Timing(true);

 		//meta data gathering and preparation
 		boolean sparse = in.isInSparseFormat();
 		int rlen = in.rlen;
 		int clen = in.clen;
 		out.sparse = (in.sparse && !ixret);
 		out.nonZeros = ixret ? rlen : in.nonZeros;

 		//step 1: error handling
 		if( !isValidSortByList(by, clen) )
 			throw new DMLRuntimeException("Sort configuration issue: invalid orderby columns: "
 				+ Arrays.toString(by)+" ("+rlen+"x"+clen+" input).");

 		//step 2: empty block / special case handling
 		if( !ixret ) //SORT DATA
 		{
 			if( in.isEmptyBlock(false) ) //EMPTY INPUT BLOCK
 				return out;

 			if( !sparse && clen == 1 ) { //DENSE COLUMN VECTOR
 				//in-place quicksort, unstable (no indexes needed)
 				out.copy( in ); //dense (always single block)
 				if (k > 1)
 					Arrays.parallelSort(out.getDenseBlockValues());
 				else
 					Arrays.sort(out.getDenseBlockValues());
 				if( desc )
 					sortReverseDense(out);
 				return out;
 			}
 		}
 		else //SORT INDEX
 		{
 			if( in.isEmptyBlock(false) ) { //EMPTY INPUT BLOCK
 				out.allocateDenseBlock(false); //single block
 				double[] c = out.getDenseBlockValues();
 				// create a list containing the indexes of each element in in.
 				for( int i=0; i<rlen; i++ )
 					c[i] = i+1; //seq(1,n)
 				return out;
 			}
 		}

 		//step 3: index vector sorting
 		//create index vector and extract values
 		//TODO perf: reconsider partition sort to avoid unnecessary barriers
 		int[] vix = new int[rlen];
 		double[] values = new double[rlen];
 		for( int i=0; i<rlen; i++ ) {
 			vix[i] = i;
 			values[i] = in.quickGetValue(i, by[0]-1);
 		}

 		// step 4: split the data into number of blocks of PAR_NUMCELL_THRESHOLD_SORT (1024) elements.
 		if (k == 1 || rlen < PAR_NUMCELL_THRESHOLD_SORT){ // There is no parallel
 			//sort index vector on extracted data (unstable)
 			SortUtils.sortByValue(0, rlen, values, vix);
 		} else {
 			try {
 				ExecutorService pool = CommonThreadPool.get(k);
 				ArrayList<SortTask> tasks = new ArrayList<>();

 				// sort smaller blocks.
 				int blklen = (int)(Math.ceil((double)rlen/k));
 				for( int i=0; i*blklen<rlen; i++ ){
 					int start = i*blklen;
 					int stop = Math.min(rlen , i*blklen + blklen);
 					tasks.add(new SortTask(start, stop, vix, values));
 				}
 				CommonThreadPool.invokeAndShutdown(pool, tasks);
 				mergeSortedBlocks(blklen, vix, values, k);
 			}
 			catch(Exception ex) {
 				throw new DMLRuntimeException(ex);
 			}
 		}


 		//sort by secondary columns if required (in-place)
 		if( by.length > 1 )
 			sortBySecondary(0, rlen, values, vix, in, by, 1);

 		//flip order if descending requested (note that this needs to happen
 		//before we ensure stable outputs, hence we also flip values)
 		if(desc) {
 			sortReverseDense(vix);
 			sortReverseDense(values);
 		}

 		//final pass to ensure stable output
 		sortIndexesStable(0, rlen, values, vix, in, by, 1);

 		//step 4: create output matrix (guaranteed non-empty, see step 2)
 		if( !ixret ) {
 			out.allocateBlock();
 			//copy input data in sorted order into result
 			ExecutorService pool = CommonThreadPool.get(k);
 			ArrayList<CopyTask> tasks = new ArrayList<>();
 			ArrayList<Integer> blklen = UtilFunctions
 				.getBalancedBlockSizesDefault(rlen, k, false);
 			for( int i=0, lb=0; i<blklen.size(); lb+=blklen.get(i), i++ )
 				tasks.add( new CopyTask(in, out, vix, lb, lb+blklen.get(i)));
 			CommonThreadPool.invokeAndShutdown(pool, tasks);
 		}
 		else {
 			//copy sorted index vector into result
 			out.allocateDenseBlock(false);
 			DenseBlock c = out.getDenseBlock();
 			for( int i=0; i<rlen; i++ )
 				c.set(i, 0, vix[i]+1);
 		}

 		return out;
 	}

 	/**
 	 * CP reshape operation (single input, single output matrix)
 	 *
 	 * NOTE: In contrast to R, the rowwise parameter specifies both
 	 * the read and write order, with row-wise being the default, while
 	 * R uses always a column-wise read, rowwise specifying the write
 	 * order and column-wise being the default.
 	 *
 	 * @param in input matrix
 	 * @param out output matrix
 	 * @param rows number of rows
 	 * @param cols number of columns
 	 * @param rowwise if true, reshape by row
 	 * @return output matrix
 	 */
 	public static MatrixBlock reshape( MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise ) {
 		int rlen = in.rlen;
 		int clen = in.clen;

 		//check validity
 		if( ((long)rlen)*clen != ((long)rows)*cols )
 			throw new DMLRuntimeException("Reshape matrix requires consistent numbers of input/output cells ("+rlen+":"+clen+", "+rows+":"+cols+").");

 		//check for same dimensions
 		if( rlen==rows && clen == cols ) {
 			//copy incl dims, nnz
 			if( SHALLOW_COPY_REORG )
 				out.copyShallow(in);
 			else
 				out.copy(in);
 			return out;
 		}

 		//determine output representation
 		out.sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, in.nonZeros);

 		//set output dimensions
 		out.rlen = rows;
 		out.clen = cols;
 		out.nonZeros = in.nonZeros;

 		//core reshape (sparse or dense)
 		if(!in.sparse && !out.sparse)
 			reshapeDense(in, out, rows, cols, rowwise);
 		else if(in.sparse && out.sparse)
 			reshapeSparse(in, out, rows, cols, rowwise);
 		else if(in.sparse)
 			reshapeSparseToDense(in, out, rows, cols, rowwise);
 		else
 			reshapeDenseToSparse(in, out, rows, cols, rowwise);

 		return out;
 	}


 	/**
 	 * MR/SPARK reshape interface - for reshape we cannot view blocks independently, and hence,
 	 * there are different CP and MR interfaces.
 	 *
 	 * @param in indexed matrix value
 	 * @param mcIn input matrix characteristics
 	 * @param mcOut output matrix characteristics
 	 * @param rowwise if true, reshape by row
 	 * @param outputEmptyBlocks output blocks with nnz=0
 	 * @return list of indexed matrix values
 	 */
 	public static List<IndexedMatrixValue> reshape(IndexedMatrixValue in, DataCharacteristics mcIn,
 	                                               DataCharacteristics mcOut, boolean rowwise, boolean outputEmptyBlocks ) {
 		//prepare inputs
 		MatrixIndexes ixIn = in.getIndexes();
 		MatrixBlock mbIn = (MatrixBlock) in.getValue();

 		//prepare result blocks (no reuse in order to guarantee mem constraints)
 		Collection<MatrixIndexes> rix = computeAllResultBlockIndexes(ixIn, mcIn, mcOut, mbIn, rowwise, outputEmptyBlocks);
 		Map<MatrixIndexes, MatrixBlock> rblk = createAllResultBlocks(rix, mbIn.nonZeros, mcOut);

 		//basic algorithm
 		long row_offset = (ixIn.getRowIndex()-1)*mcIn.getBlocksize();
 		long col_offset = (ixIn.getColumnIndex()-1)*mcIn.getBlocksize();
 		if( mbIn.sparse )
 			reshapeSparse(mbIn, row_offset, col_offset, rblk, mcIn, mcOut, rowwise);
 		else //dense
 			reshapeDense(mbIn, row_offset, col_offset, rblk, mcIn, mcOut, rowwise);

 		//prepare output (sparsity switch, wrapper)
 		return rblk.entrySet().stream()
 			.filter( e -> outputEmptyBlocks || !e.getValue().isEmptyBlock(false))
 			.map(e -> {e.getValue().examSparsity(); return new IndexedMatrixValue(e.getKey(),e.getValue());})
 			.collect(Collectors.toList());
 	}

 	/**
 	 * CP rmempty operation (single input, single output matrix)
 	 *
 	 * @param in input matrix
 	 * @param ret output matrix
 	 * @param rows ?
 	 * @param emptyReturn return row/column of zeros for empty input
 	 * @param select ?
 	 * @return matrix block
 	 */
 	public static MatrixBlock rmempty(MatrixBlock in, MatrixBlock ret, boolean rows, boolean emptyReturn, MatrixBlock select) {
 		//check for empty inputs
 		//(the semantics of removeEmpty are that for an empty m-by-n matrix, the output
 		//is an empty 1-by-n or m-by-1 matrix because we don't allow matrices with dims 0)
 		if( in.isEmptyBlock(false) && select == null  ) {
 			int n = emptyReturn ? 1 : 0;
 			if( rows )
 				ret.reset(n, in.clen, in.sparse);
 			else //cols
 				ret.reset(in.rlen, n, in.sparse);
 			return ret;
 		}

 		if( rows )
 			return removeEmptyRows(in, ret, select, emptyReturn);
 		else //cols
 			return removeEmptyColumns(in, ret, select, emptyReturn);
 	}

 	/**
 	 * MR rmempty interface - for rmempty we cannot view blocks independently, and hence,
 	 * there are different CP and MR interfaces.
 	 *
 	 * @param data ?
 	 * @param offset ?
 	 * @param rmRows ?
 	 * @param len ?
 	 * @param blen block length
 	 * @param outList list of indexed matrix values
 	 */
 	public static void rmempty(IndexedMatrixValue data, IndexedMatrixValue offset, boolean rmRows, long len, long blen, ArrayList<IndexedMatrixValue> outList) {
 		//sanity check inputs
 		if( !(data.getValue() instanceof MatrixBlock && offset.getValue() instanceof MatrixBlock) )
 			throw new DMLRuntimeException("Unsupported input data: expected "+MatrixBlock.class.getName()+" but got "+data.getValue().getClass().getName()+" and "+offset.getValue().getClass().getName());
 		if(     rmRows && data.getValue().getNumRows()!=offset.getValue().getNumRows()
 			|| !rmRows && data.getValue().getNumColumns()!=offset.getValue().getNumColumns()  ){
 			throw new DMLRuntimeException("Dimension mismatch between input data and offsets: ["
 					+data.getValue().getNumRows()+"x"+data.getValue().getNumColumns()+" vs "+offset.getValue().getNumRows()+"x"+offset.getValue().getNumColumns());
 		}

 		//compute outputs (at most two output blocks)
 		HashMap<MatrixIndexes,IndexedMatrixValue> out = new HashMap<>();
 		MatrixBlock linData = (MatrixBlock) data.getValue();
 		MatrixBlock linOffset = (MatrixBlock) offset.getValue();
 		MatrixIndexes tmpIx = new MatrixIndexes(-1,-1);
 		if( rmRows ) //margin = "rows"
 		{
 			long rlen = len;
 			long clen = linData.getNumColumns();

 			for( int i=0; i<linOffset.getNumRows(); i++ ) {
 				long rix = (long)linOffset.quickGetValue(i, 0);
 				if( rix > 0 ) //otherwise empty row
 				{
 					//get single row from source block
 					MatrixBlock src = linData.slice(i, i, 0, (int)(clen-1), new MatrixBlock());
 					long brix = (rix-1)/blen+1;
 					long lbrix = (rix-1)%blen;
 					tmpIx.setIndexes(brix, data.getIndexes().getColumnIndex());
 					 //create target block if necessary
 					if( !out.containsKey(tmpIx) ) {
 						IndexedMatrixValue tmpIMV = new IndexedMatrixValue(new MatrixIndexes(),new MatrixBlock());
 						tmpIMV.getIndexes().setIndexes(tmpIx);
 						((MatrixBlock)tmpIMV.getValue()).reset((int)Math.min(blen, rlen-((brix-1)*blen)), (int)clen);
 						out.put(tmpIMV.getIndexes(), tmpIMV);
 					}
 					//put single row into target block
 					((MatrixBlock)out.get(tmpIx).getValue()).copy(
 							  (int)lbrix, (int)lbrix, 0, (int)clen-1, src, false);
 				}
 			}
 		}
 		else //margin = "cols"
 		{
 			long rlen = linData.getNumRows();
 			long clen = len;

 			for( int i=0; i<linOffset.getNumColumns(); i++ ) {
 				long cix = (long)linOffset.quickGetValue(0, i);
 				if( cix > 0 ) //otherwise empty row
 				{
 					//get single row from source block
 					MatrixBlock src = linData.slice(0, (int)(rlen-1), i, i, new MatrixBlock());
 					long bcix = (cix-1)/blen+1;
 					long lbcix = (cix-1)%blen;
 					tmpIx.setIndexes(data.getIndexes().getRowIndex(), bcix);
 					 //create target block if necessary
 					if( !out.containsKey(tmpIx) ) {
 						IndexedMatrixValue tmpIMV = new IndexedMatrixValue(new MatrixIndexes(),new MatrixBlock());
 						tmpIMV.getIndexes().setIndexes(tmpIx);
 						((MatrixBlock)tmpIMV.getValue()).reset((int)rlen,(int)Math.min(blen, clen-((bcix-1)*blen)));
 						out.put(tmpIMV.getIndexes(), tmpIMV);
 					}
 					//put single row into target block
 					((MatrixBlock)out.get(tmpIx).getValue()).copy(
 							  0, (int)rlen-1, (int)lbcix, (int)lbcix, src, false);
 				}
 			}
 		}

 		//prepare and return outputs (already in cached values)
 		for( IndexedMatrixValue imv : out.values() ){
 			((MatrixBlock)imv.getValue()).recomputeNonZeros();
 			outList.add(imv);
 		}
 	}

 	/**
 	 * CP rexpand operation (single input, single output)
 	 *
 	 * @param in input matrix
 	 * @param ret output matrix
 	 * @param max ?
 	 * @param rows ?
 	 * @param cast ?
 	 * @param ignore ?
 	 * @param k degree of parallelism
 	 * @return output matrix
 	 */
 	public static MatrixBlock rexpand(MatrixBlock in, MatrixBlock ret, double max, boolean rows, boolean cast, boolean ignore, int k) {
 		//prepare parameters
 		int lmax = (int)UtilFunctions.toLong(max);

 		//sanity check for input nnz (incl implicit handling of empty blocks)
 		if( !ignore && in.getNonZeros()<in.getNumRows() )
 			throw new DMLRuntimeException("Invalid input w/ zeros for rexpand ignore=false "
 					+ "(rlen="+in.getNumRows()+", nnz="+in.getNonZeros()+").");

 		//check for empty inputs (for ignore=true)
 		if( in.isEmptyBlock(false) ) {
 			if( rows )
 				ret.reset(lmax, in.rlen, true);
 			else //cols
 				ret.reset(in.rlen, lmax, true);
 			return ret;
 		}

 		//execute rexpand operations
 		if( rows )
 			return rexpandRows(in, ret, lmax, cast, ignore);
 		else //cols
 			return rexpandColumns(in, ret, lmax, cast, ignore, k);
 	}

 	/**
 	 * MR/Spark rexpand operation (single input, multiple outputs incl empty blocks)
 	 *
 	 * @param data indexed matrix value
 	 * @param max ?
 	 * @param rows ?
 	 * @param cast ?
 	 * @param ignore ?
 	 * @param blen block length
 	 * @param outList list of indexed matrix values
 	 */
 	public static void rexpand(IndexedMatrixValue data, double max, boolean rows, boolean cast, boolean ignore, long blen, ArrayList<IndexedMatrixValue> outList) {
 		//prepare parameters
 		MatrixIndexes ix = data.getIndexes();
 		MatrixBlock in = (MatrixBlock)data.getValue();

 		//execute rexpand operations incl sanity checks
 		//TODO more robust (memory efficient) implementation w/o tmp block
 		MatrixBlock tmp = rexpand(in, new MatrixBlock(), max, rows, cast, ignore, 1);

 		//prepare outputs blocks (slice tmp block into output blocks )
 		if( rows ) //expanded vertically
 		{
 			for( int rl=0; rl<tmp.getNumRows(); rl+=blen ) {
 				MatrixBlock mb = tmp.slice(
 						rl, (int)(Math.min(rl+blen, tmp.getNumRows())-1));
 				outList.add(new IndexedMatrixValue(
 						new MatrixIndexes(rl/blen+1, ix.getRowIndex()), mb));
 			}
 		}
 		else //expanded horizontally
 		{
 			for( int cl=0; cl<tmp.getNumColumns(); cl+=blen ) {
 				MatrixBlock mb = tmp.slice(
 						0, tmp.getNumRows()-1,
 						cl, (int)(Math.min(cl+blen, tmp.getNumColumns())-1), new MatrixBlock());
 				outList.add(new IndexedMatrixValue(
 						new MatrixIndexes(ix.getRowIndex(), cl/blen+1), mb));
 			}
 		}
 	}

 	///////////////////////////////
 	// private CP implementation //
 	///////////////////////////////

 	private static ReorgType getReorgType( ReorgOperator op )
 	{
 		if( op.fn instanceof SwapIndex )  //transpose
 			return ReorgType.TRANSPOSE;

 		else if( op.fn instanceof RevIndex ) //rev
 			return ReorgType.REV;

 		else if( op.fn instanceof DiagIndex ) //diag
 			return ReorgType.DIAG;

 		else if( op.fn instanceof SortIndex ) //sort
 			return ReorgType.SORT;

 		return ReorgType.INVALID;
 	}

 	private static void transposeDenseToDense(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu) {
 		final int m = in.rlen;
 		final int n = in.clen;
 		final int n2 = out.clen;

 		DenseBlock a = in.getDenseBlock();
 		DenseBlock c = out.getDenseBlock();

 		if( m==1 || n==1 ) //VECTOR TRANSPOSE
 		{
 			//plain memcopy, in case shallow dense copy no applied
 			//input/output guaranteed single block
 			int ix = rl+cl; int len = ru+cu-ix-1;
 			System.arraycopy(a.valuesAt(0), ix, c.valuesAt(0), ix, len);
 		}
 		else //MATRIX TRANSPOSE
 		{
 			//blocking according to typical L2 cache sizes
 			final int blocksizeI = 128;
 			final int blocksizeJ = 128;

 			//blocked execution
 			if( a.numBlocks()==1 && c.numBlocks()==1 ) { //<16GB
 				double[] avals = a.valuesAt(0);
 				double[] cvals = c.valuesAt(0);
 				for( int bi = rl; bi<ru; bi+=blocksizeI ) {
 					int bimin = Math.min(bi+blocksizeI, ru);
 					for( int bj = cl; bj<cu; bj+=blocksizeJ ) {
 						int bjmin = Math.min(bj+blocksizeJ, cu);
 						//core transpose operation
 						for( int i=bi; i<bimin; i++ ) {
 							int aix = i * n + bj;
 							int cix = bj * n2 + i;
 							transposeRow(avals, cvals, aix, cix, n2, bjmin-bj);
 						}
 					}
 				}
 			}
 			else { //general case > 16GB (multiple blocks)
 				for( int bi = rl; bi<ru; bi+=blocksizeI ) {
 					int bimin = Math.min(bi+blocksizeI, ru);
 					for( int bj = cl; bj<cu; bj+=blocksizeJ ) {
 						int bjmin = Math.min(bj+blocksizeJ, cu);
 						//core transpose operation
 						for( int i=bi; i<bimin; i++ ) {
 							double[] avals = a.values(i);
 							int aix = a.pos(i);
 							for( int j=bj; j<bjmin; j++ )
 								c.set(j, i, avals[ aix+j ]);
 						}
 					}
 				}
 			}
 		}
 	}

 	private static void transposeDenseToSparse(MatrixBlock in, MatrixBlock out)
 	{
 		//NOTE: called only in sequential execution

 		final int m = in.rlen;
 		final int n = in.clen;
 		final int m2 = out.rlen;
 		final int n2 = out.clen;
 		final int ennz2 = (int) (in.nonZeros/m2);

 		DenseBlock a = in.getDenseBlock();
 		SparseBlock c = out.getSparseBlock();

 		if( out.rlen == 1 ) //VECTOR-VECTOR
 		{
 			//allocate row once by nnz, copy non-zeros
 			c.set(0, new SparseRowVector((int)in.nonZeros, a.valuesAt(0), m), false);
 		}
 		else //general case: MATRIX-MATRIX
 		{
 			//blocking according to typical L2 cache sizes
 			final int blocksizeI = 128;
 			final int blocksizeJ = 128;

 			//blocked execution
 			for( int bi = 0; bi<m; bi+=blocksizeI ) {
 				int bimin = Math.min(bi+blocksizeI, m);
 				for( int bj = 0; bj<n; bj+=blocksizeJ ) {
 					int bjmin = Math.min(bj+blocksizeJ, n);
 					//core transpose operation
 					for( int i=bi; i<bimin; i++ ) {
 						double[] avals = a.values(i);
 						int aix = a.pos(i);
 						for( int j=bj; j<bjmin; j++ ) {
 							c.allocate(j, ennz2, n2);
 							c.append(j, i, avals[aix+j]);
 						}
 					}
 				}
 			}
 		}
 	}

 	private static void transposeSparseToSparse(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt)
 	{
 		//NOTE: called only in sequential or column-wise parallel execution
 		if( rl > 0 || ru < in.rlen )
 			throw new RuntimeException("Unsupported row-parallel transposeSparseToSparse: "+rl+", "+ru);

 		final int m2 = out.rlen;
 		final int n2 = out.clen;
 		final int ennz2 = (int) (in.nonZeros/m2);

 		SparseBlock a = in.getSparseBlock();
 		SparseBlock c = out.getSparseBlock();

 		if( cu-cl == 1 ) { // SINGLE TARGET ROW
 			//number of columns <= num cores, use sequential scan over input
 			//and avoid cache blocking and temporary position maintenance
 			if( cnt[cl] > 0 )
 				c.allocate(cl, cnt[cl]);

 			for( int i=0; i<ru; i++ ) {
 				if( a.isEmpty(i) ) continue;
 				int apos = a.pos(i);
 				int alen = a.size(i);
 				int[] aix = a.indexes(i);
 				double[] avals = a.values(i);
 				for( int j=apos; j<apos+alen && aix[j]<=cl; j++ )
 					if( aix[j] == cl )
 						c.append(cl, i, avals[j]);
 			}
 		}
 		else { //GENERAL CASE
 			//allocate output sparse rows
 			if( cnt != null ) {
 				for( int i=cl; i<cu; i++ )
 					if( cnt[i] > 0 )
 						c.allocate(i, cnt[i]);
 			}

 			//blocking according to typical L2 cache sizes w/ awareness of sparsity
 			final long xsp = (long)in.rlen*in.clen/in.nonZeros;
 			final int blocksizeI = Math.max(128, (int) (8*xsp));
 			final int blocksizeJ = Math.max(128, (int) (8*xsp));

 			//temporary array for block boundaries (for preventing binary search)
 			int[] ix = new int[Math.min(blocksizeI, ru-rl)];

 			//blocked execution
 			for( int bi=rl; bi<ru; bi+=blocksizeI )
 			{
 				Arrays.fill(ix, 0);
 				//find column starting positions
 				int bimin = Math.min(bi+blocksizeI, ru);
 				if( cl > 0 ) {
 					for( int i=bi; i<bimin; i++ ) {
 						if( a.isEmpty(i) ) continue;
 						int j = a.posFIndexGTE(i, cl);
 						ix[i-bi] = (j>=0) ? j : a.size(i);
 					}
 				}

 				for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
 					int bjmin = Math.min(bj+blocksizeJ, cu);
 					//core block transpose operation
 					for( int i=bi; i<bimin; i++ ) {
 						if( a.isEmpty(i) ) continue;
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						double[] avals = a.values(i);
 						int j = ix[i-bi] + apos; //last block boundary
 						for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
 							c.allocate(aix[j], ennz2, n2);
 							c.append(aix[j], i, avals[j]);
 						}
 						ix[i-bi] = j - apos; //keep block boundary
 					}
 				}
 			}
 		}
 	}

 	private static void transposeSparseToDense(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu) {
 		final int m = in.rlen;
 		final int n = in.clen;

 		SparseBlock a = in.getSparseBlock();
 		DenseBlock c = out.getDenseBlock();

 		if( m==1 ) //ROW VECTOR TRANSPOSE
 		{
 			//NOTE: called only in sequential execution
 			int alen = a.size(0); //always pos 0
 			int[] aix = a.indexes(0);
 			double[] avals = a.values(0);
 			double[] cvals = c.valuesAt(0);
 			for( int j=0; j<alen; j++ )
 				cvals[aix[j]] = avals[j];
 		}
 		else //MATRIX TRANSPOSE
 		{
 			//blocking according to typical L2 cache sizes
 			final int blocksizeI = 128;
 			final int blocksizeJ = 128;

 			//temporary array for block boundaries (for preventing binary search)
 			int[] ix = new int[blocksizeI];

 			//blocked execution
 			for( int bi = rl; bi<ru; bi+=blocksizeI ) {
 				Arrays.fill(ix, 0);
 				int bimin = Math.min(bi+blocksizeI, ru);
 				for( int bj = 0; bj<n; bj+=blocksizeJ ) {
 					int bjmin = Math.min(bj+blocksizeJ, n);
 					//core transpose operation
 					for( int i=bi, iix=0; i<bimin; i++, iix++ ) {
 						if( a.isEmpty(i) ) continue;
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						double[] avals = a.values(i);
 						int j = ix[iix]; //last block boundary
 						for( ; j<alen && aix[apos+j]<bjmin; j++ )
 							c.set(aix[apos+j], i, avals[apos+j]);
 						ix[iix] = j; //keep block boundary
 					}
 				}
 			}
 		}
 	}

 	static void transposeRow( double[] a, double[] c, int aix, int cix, int n2, int len ) {
 		final int bn = len%8;
 		//compute rest (not aligned to 8-blocks)
 		for( int j=0; j<bn; j++, aix++, cix+=n2 )
 			c[ cix ] = a[ aix+0 ];
 		//unrolled 8-blocks
 		for( int j=bn; j<len; j+=8, aix+=8, cix+=8*n2 ) {
 			c[ cix + 0*n2 ] = a[ aix+0 ];
 			c[ cix + 1*n2 ] = a[ aix+1 ];
 			c[ cix + 2*n2 ] = a[ aix+2 ];
 			c[ cix + 3*n2 ] = a[ aix+3 ];
 			c[ cix + 4*n2 ] = a[ aix+4 ];
 			c[ cix + 5*n2 ] = a[ aix+5 ];
 			c[ cix + 6*n2 ] = a[ aix+6 ];
 			c[ cix + 7*n2 ] = a[ aix+7 ];
 		}
 	}

 	private static int[] countNnzPerColumn(MatrixBlock in, int rl, int ru) {
 		//initial pass to determine capacity (this helps to prevent
 		//sparse row reallocations and mem inefficiency w/ skew
 		int[] cnt = null;
 		if( in.sparse && in.clen <= 4096 ) { //16KB
 			SparseBlock a = in.sparseBlock;
 			cnt = new int[in.clen];
 			for( int i=rl; i<ru; i++ ) {
 				if( !a.isEmpty(i) )
 					countAgg(cnt, a.indexes(i), a.pos(i), a.size(i));
 			}
 		}
 		return cnt;
 	}

 	private static int[] mergeNnzCounts(int[] cnt, int[] cnt2) {
 		if( cnt == null )
 			return cnt2;
 		for( int i=0; i<cnt.length; i++ )
 			cnt[i] += cnt2[i];
 		return cnt;
 	}

 	private static void reverseDense(MatrixBlock in, MatrixBlock out) {
 		final int m = in.rlen;
 		final int n = in.clen;

 		//set basic meta data and allocate output
 		out.sparse = false;
 		out.nonZeros = in.nonZeros;
 		out.allocateDenseBlock(false);

 		//copy all rows into target positions
 		if( n == 1 ) { //column vector
 			double[] a = in.getDenseBlockValues();
 			double[] c = out.getDenseBlockValues();
 			for( int i=0; i<m; i++ )
 				c[m-1-i] = a[i];
 		}
 		else { //general matrix case
 			DenseBlock a = in.getDenseBlock();
 			DenseBlock c = out.getDenseBlock();
 			for( int i=0; i<m; i++ ) {
 				final int ri = m - 1 - i;
 				System.arraycopy(a.values(i), a.pos(i), c.values(ri), c.pos(ri), n);
 			}
 		}
 	}

 	private static void reverseSparse(MatrixBlock in, MatrixBlock out) {
 		final int m = in.rlen;

 		//set basic meta data and allocate output
 		out.sparse = true;
 		out.nonZeros = in.nonZeros;

 		out.allocateSparseRowsBlock(false);

 		//copy all rows into target positions
 		SparseBlock a = in.getSparseBlock();
 		SparseBlock c = out.getSparseBlock();
 		for( int i=0; i<m; i++ )
 			if( !a.isEmpty(i) )
 				c.set(m-1-i, a.get(i), true);
 	}

 	/**
 	 * Generic implementation diagV2M
 	 * (in most-likely DENSE, out most likely SPARSE)
 	 *
 	 * @param in input matrix
 	 * @param out output matrix
 	 */
 	private static void diagV2M( MatrixBlock in, MatrixBlock out )
 	{
 		final int rlen = in.rlen;

 		//CASE column vector
 		if( out.sparse  ) { //SPARSE
 			if( SPARSE_OUTPUTS_IN_CSR ) {
 				int[] rptr = new int[in.rlen+1];
 				int[] cix = null;
 				double[] vals = null;
 				//case a: fully dense vector
 				if( rlen == in.nonZeros && !in.sparse ) {
 					//reuse single seq for rptr and cix (cix truncated by 1)
 					rptr = cix = UtilFunctions.getSeqArray(0, rlen, 1);
 					vals = in.getDenseBlockValues(); //shallow copy
 				}
 				//case b: more general input
 				else {
 					cix = new int[(int)in.nonZeros];
 					vals = new double[(int)in.nonZeros];
 					for( int i=0, pos=0; i<rlen; i++ ) {
 						double val = in.quickGetValue(i, 0);
 						if( val != 0 ) {
 							cix[pos] = i;
 							vals[pos] = val;
 							pos++;
 						}
 						rptr[i+1]=pos;
 					}
 				}
 				out.sparseBlock = new SparseBlockCSR(
 					rptr, cix, vals, (int)in.nonZeros);
 			}
 			else {
 				out.allocateBlock();
 				SparseBlock sblock = out.sparseBlock;
 				for(int i=0; i<rlen; i++) {
 					double val = in.quickGetValue(i, 0);
 					if( val != 0 ) {
 						sblock.allocate(i, 1);
 						sblock.append(i, i, val);
 					}
 				}
 			}
 		}
 		else { //DENSE
 			for( int i=0; i<rlen; i++ ) {
 				double val = in.quickGetValue(i, 0);
 				if( val != 0 )
 					out.appendValue(i, i, val);
 			}
 		}

 		out.setNonZeros(in.nonZeros);
 	}

 	/**
 	 * Generic implementation diagM2V (non-performance critical)
 	 * (in most-likely SPARSE, out most likely DENSE)
 	 *
 	 * NOTE: squared block assumption (checked on entry diag)
 	 *
 	 * @param in input matrix
 	 * @param out output matrix
 	 */
 	private static void diagM2V( MatrixBlock in, MatrixBlock out )
 	{
 		DenseBlock c = out.allocateBlock().getDenseBlock();
 		int rlen = in.rlen;
 		int nnz = 0;
 		for( int i=0; i<rlen; i++ ) {
 			double val = in.quickGetValue(i, i);
 			if( val != 0 ) {
 				c.set(i, 0, val);
 				nnz++;
 			}
 		}
 		out.setNonZeros(nnz);
 	}

 	private static void reshapeDense( MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise ) {
 		int rlen = in.rlen;
 		int clen = in.clen;

 		//reshape empty block
 		if( in.denseBlock == null )
 			return;

 		//shallow dense by-row reshape (w/o result allocation)
 		if( SHALLOW_COPY_REORG && rowwise && in.denseBlock.numBlocks()==1 ) {
 			//since the physical representation of dense matrices is always the same,
 			//we don't need to create a copy, given our copy on write semantics.
 			//however, note that with update in-place this would be an invalid optimization
 			out.denseBlock = DenseBlockFactory.createDenseBlock(in.getDenseBlockValues(), rows, cols);
 			return;
 		}

 		//allocate block if necessary
 		out.allocateDenseBlock(false);

 		//dense reshape
 		DenseBlock a = in.getDenseBlock();
 		DenseBlock c = out.getDenseBlock();

 		if( rowwise ) {
 			//VECTOR-MATRIX, MATRIX-VECTOR, GENERAL CASE
 			//pure copy of rowwise internal representation
 			c.set(a);
 		}
 		else { //colwise
 			if( rlen==1 || clen==1 ) { //VECTOR->MATRIX
 				//note: cache-friendly on a but not on c
 				double[] avals = a.valuesAt(0);
 				double[] cvals = c.valuesAt(0);
 				for( int j=0, aix=0; j<cols; j++ )
 					for( int i=0, cix=0; i<rows; i++, cix+=cols )
 						cvals[ cix + j ] = avals[ aix++ ];
 			}
 			else if( rows==1 || cols==1 ) //MATRIX->VECTOR
 			{
 				//note: cache-friendly on c but not on a
 				double[] avals = a.valuesAt(0);
 				double[] cvals = c.valuesAt(0);
 				for( int j=0, cix=0; j<clen; j++ )
 					for( int i=0, aix=0; i<rlen; i++, aix+=clen )
 						cvals[ cix++ ] = avals[ aix + j ];
 			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on c but not an a
 				for( int i=0; i<rows; i++ ) {
 					double[] cvals = c.values(i);
 					int cix = c.pos(i);
 					for( int j=0, aix2=i; j<cols; j++, aix2+=rows ) {
 						int ai = aix2%rlen;
 						int aj = aix2/rlen;
 						cvals[cix+j] = a.get(ai,aj);
 					}
 				}
 				//index conversion c[i,j]<- a[k,l]:
 				// k = (rows*j+i)%rlen
 				// l = (rows*j+i)/rlen
 			}
 		}
 	}

 	private static void reshapeSparse( MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise )
 	{
 		int rlen = in.rlen;
 		int clen = in.clen;

 		//reshape empty block
 		if( in.isEmptyBlock(false) )
 			return;

 		//allocate block if necessary
 		out.allocateSparseRowsBlock(false);
 		int estnnz = (int) (in.nonZeros/rows);

 		//sparse reshape
 		SparseBlock a = in.sparseBlock;
 		SparseBlock c = out.sparseBlock;

 		if( rowwise )
 		{
 			//NOTES on special cases
 			// * vector-matrix not really different from general
 			// * clen=1 and cols=1 will never be sparse.

 			if( rows==1 ) //MATRIX->VECTOR
 			{
 				//note: cache-friendly on a and c; append-only
 				c.allocate(0, estnnz, cols);
 				for( int i=0, cix=0; i<rlen; i++, cix+=clen ) {
 					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
 					for( int j=apos; j<apos+alen; j++ )
 						c.append(0, cix+aix[j], avals[j]);
 				}
 			}
 			else if( cols%clen==0 //SPECIAL CSR N:1 MATRIX->MATRIX
 				&& SHALLOW_COPY_REORG && SPARSE_OUTPUTS_IN_CSR
 				&& a instanceof SparseBlockCSR ) { //int nnz
 				int[] aix = ((SparseBlockCSR)a).indexes();
 				int n = cols/clen, pos = 0;
 				int[] rptr = new int[rows+1];
 				int[] indexes = new int[(int)a.size()];
 				rptr[0] = 0;
 				for(int bi=0, ci=0; bi<rlen; bi+=n, ci++) {
 					for( int i=bi, cix=0; i<bi+n; i++, cix+=clen ) {
 						if(a.isEmpty(i)) continue;
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						for( int j=apos; j<apos+alen; j++ )
 							indexes[pos++] = cix+aix[j];
 					}
 					rptr[ci+1] = pos;
 				}
 				//create CSR block with shallow copy of values
 				out.sparseBlock = new SparseBlockCSR(rptr, indexes,
 					((SparseBlockCSR)a).values(), pos);
 			}
 			else if( cols%clen==0 ) { //SPECIAL N:1 MATRIX->MATRIX
 				int n = cols/clen;
 				for(int bi=0, ci=0; bi<rlen; bi+=n, ci++) {
 					//allocate output row once (w/o re-allocations)
 					long lnnz = a.size(bi, bi+n);
 					c.allocate(ci, (int)lnnz);
 					//copy N input rows into output row
 					for( int i=bi, cix=0; i<bi+n; i++, cix+=clen ) {
 						if(a.isEmpty(i)) continue;
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						double[] avals = a.values(i);
 						for( int j=apos; j<apos+alen; j++ )
 							c.append(ci, cix+aix[j], avals[j]);
 					}
 				}
 			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on a but not c; append-only
 				//long cix because total cells in sparse can be larger than int
 				long cix = 0;
 				for( int i=0; i<rlen; i++ ) {
 					if( !a.isEmpty(i) ){
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						double[] avals = a.values(i);
 						for( int j=apos; j<apos+alen; j++ ) {
 							int ci = (int)((cix+aix[j])/cols);
 							int cj = (int)((cix+aix[j])%cols);
 							c.allocate(ci, estnnz, cols);
 							c.append(ci, cj, avals[j]);
 						}
 					}

 					cix += clen;
 				}
 			}
 		}
 		else //colwise
 		{
 			//NOTES on special cases
 			// * matrix-vector not really different from general
 			// * clen=1 and cols=1 will never be sparse.

 			if( rlen==1 ) //VECTOR->MATRIX
 			{
 				//note: cache-friendly on a but not c; append-only
 				if( !a.isEmpty(0) ){
 					int alen = a.size(0); //always pos 0
 					int[] aix = a.indexes(0);
 					double[] avals = a.values(0);
 					for( int j=0; j<alen; j++ ) {
 						int ci = aix[j]%rows;
 						int cj = aix[j]/rows;
 						c.allocate(ci, estnnz, cols);
 						c.append(ci, cj, avals[j]);
 					}
 				}
 			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on a but not c; append&sort, in-place w/o shifts
 				for( int i=0; i<rlen; i++ ) {
 					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
 					for( int j=apos; j<apos+alen; j++ ) {
 						//long tmpix because total cells in sparse can be larger than int
 						long tmpix = (long)aix[j]*rlen+i;
 						int ci = (int)(tmpix%rows);
 						int cj = (int)(tmpix/rows);
 						c.allocate(ci, estnnz, cols);
 						c.append(ci, cj, avals[j]);
 					}
 				}
 				out.sortSparseRows();
 			}
 		}
 	}

 	private static void reshapeDenseToSparse( MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise )
 	{
 		int rlen = in.rlen;
 		int clen = in.clen;

 		//reshape empty block
 		if( in.denseBlock == null )
 			return;

 		//allocate block if necessary
 		out.allocateSparseRowsBlock(false);
 		int estnnz = (int) (in.nonZeros/rows);

 		//sparse reshape
 		DenseBlock a = in.getDenseBlock();
 		SparseBlock c = out.sparseBlock;

 		if( rowwise ) {
 			//NOTES on special cases
 			// * vector-matrix, matrix-vector not really different from general

 			//GENERAL CASE: MATRIX->MATRIX
 			//note: cache-friendly on a and c; append-only
 			for( int i=0; i<rlen; i++ ) {
 				double[] avals = a.values(i);
 				int aix = a.pos(i);
 				for( int j=0; j<clen; j++ ) {
 					double val = avals[aix+j];
 					if( val != 0 ) {
 						long cix = (long) i*clen+j;
 						int ci = (int)(cix / cols);
 						int cj = (int)(cix % cols);
 						c.allocate(ci, estnnz, cols);
 						c.append(ci, cj, val);
 					}
 				}
 			}
 		}
 		else //colwise
 		{
 			//NOTES on special cases
 			// * matrix-vector not really different from general

 			if( rlen==1 ) //VECTOR->MATRIX
 			{
 				//note: cache-friendly on a but not c; append-only
 				double[] avals = a.valuesAt(0);
 				for( int j=0, aix=0; j<cols; j++ )
 					for( int i=0; i<rows; i++ ) {
 						double val = avals[aix++];
 						if( val != 0 ) {
 							c.allocate(i, estnnz, cols);
 							c.append(i, j, val);
 						}
 					}
 			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on c but not a; append-only
 				for( int i=0; i<rows; i++ )
 					for( int j=0, aix2=i; j<cols; j++, aix2+=rows ) {
 						int ai = aix2%rlen;
 						int aj = aix2/rlen;
 						double val = a.get(ai, aj);
 						if( val != 0 ) {
 							c.allocate(i, estnnz, cols);
 							c.append(i, j, val);
 						}
 					}
 			}
 		}
 	}

 	private static void reshapeSparseToDense( MatrixBlock in, MatrixBlock out, int rows, int cols, boolean rowwise ) {
 		int rlen = in.rlen;
 		int clen = in.clen;

 		//reshape empty block
 		if( in.sparseBlock == null )
 			return;

 		//allocate block if necessary
 		out.allocateDenseBlock(false);

 		//sparse/dense reshape
 		SparseBlock a = in.sparseBlock;
 		DenseBlock c = out.getDenseBlock();

 		if( rowwise )
 		{
 			//NOTES on special cases
 			// * vector-matrix, matrix-vector not really different from general

 			//GENERAL CASE: MATRIX->MATRIX
 			//note: cache-friendly on a and c
 			for( int i=0, cix=0; i<rlen; i++, cix+=clen ) {
 				if( !a.isEmpty(i) ) {
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
 					for( int j=apos; j<apos+alen; j++ ) {
 						int ci = (cix+aix[j]) / cols;
 						int cj = (cix+aix[j]) % cols;
 						c.set(ci, cj, avals[j]);
 					}
 				}
 			}
 		}
 		else //colwise
 		{
 			//NOTES on special cases
 			// * matrix-vector not really different from general

 			if( rlen==1 ) //VECTOR->MATRIX
 			{
 				//note: cache-friendly on a but not c
 				double[] cvals = c.valuesAt(0);
 				if( !a.isEmpty(0) ) {
 					int apos = a.pos(0);
 					int alen = a.size(0);
 					int[] aix = a.indexes(0);
 					double[] avals = a.values(0);
 					for( int j=apos; j<apos+alen; j++ ) {
 						int ci = aix[j]%rows;
 						int cj = aix[j]/rows;
 						cvals[ci*cols+cj] = avals[j];
 					}
 				}
 			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on a but not c
 				for( int i=0; i<rlen; i++ ) {
 					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
 					for( int j=apos; j<apos+alen; j++ ) {
 						int tmpix = aix[j]*rlen+i;
 						int ci = tmpix%rows;
 						int cj = tmpix/rows;
 						c.set(ci, cj, avals[j]);
 					}
 				}
 			}
 		}
 	}

 	///////////////////////////////
 	// private MR implementation //
 	///////////////////////////////

 	private static Collection<MatrixIndexes> computeAllResultBlockIndexes(MatrixIndexes ixin,
 		DataCharacteristics mcIn, DataCharacteristics mcOut, MatrixBlock in, boolean rowwise, boolean outputEmpty )
 	{
 		HashSet<MatrixIndexes> ret = new HashSet<>();

 		long row_offset = (ixin.getRowIndex()-1)*mcOut.getBlocksize();
 		long col_offset = (ixin.getColumnIndex()-1)*mcOut.getBlocksize();
 		long max_row_offset = Math.min(mcIn.getRows(),row_offset+mcIn.getBlocksize())-1;
 		long max_col_offset = Math.min(mcIn.getCols(),col_offset+mcIn.getBlocksize())-1;

 		if( rowwise ) {
 			if( mcIn.getCols() == 1 ) {
 				MatrixIndexes first = computeResultBlockIndex(new MatrixIndexes(), row_offset, 0, mcIn, mcOut, rowwise);
 				MatrixIndexes last = computeResultBlockIndex(new MatrixIndexes(), max_row_offset, 0, mcIn, mcOut, rowwise);
 				createRowwiseIndexes(first, last, mcOut.getNumColBlocks(), ret);
 			}
 			else if( in.getNonZeros()<in.getNumRows() && !outputEmpty ) {
 				createNonZeroIndexes(mcIn, mcOut, in, row_offset, col_offset, rowwise, ret);
 			}
 			else {
 				for( long i=row_offset; i<max_row_offset+1; i++ ) {
 					MatrixIndexes first = computeResultBlockIndex(new MatrixIndexes(), i, col_offset, mcIn, mcOut, rowwise);
 					MatrixIndexes last = computeResultBlockIndex(new MatrixIndexes(), i, max_col_offset, mcIn, mcOut, rowwise);
 					createRowwiseIndexes(first, last, mcOut.getNumColBlocks(), ret);
 				}
 			}
 		}
 		else{ //colwise
 			if( mcIn.getRows() == 1 ) {
 				MatrixIndexes first = computeResultBlockIndex(new MatrixIndexes(), 0, col_offset, mcIn, mcOut, rowwise);
 				MatrixIndexes last = computeResultBlockIndex(new MatrixIndexes(), 0, max_col_offset, mcIn, mcOut, rowwise);
 				createColwiseIndexes(first, last, mcOut.getNumRowBlocks(), ret);
 			}
 			else if( in.getNonZeros()<in.getNumColumns() && !outputEmpty ) {
 				createNonZeroIndexes(mcIn, mcOut, in, row_offset, col_offset, rowwise, ret);
 			}
 			else {
 				for( long j=col_offset; j<max_col_offset+1; j++ ) {
 					MatrixIndexes first = computeResultBlockIndex(new MatrixIndexes(), row_offset, j, mcIn, mcOut, rowwise);
 					MatrixIndexes last = computeResultBlockIndex(new MatrixIndexes(), max_row_offset, j, mcIn, mcOut, rowwise);
 					createColwiseIndexes(first, last, mcOut.getNumRowBlocks(), ret);
 				}
 			}
 		}
 		return ret;
 	}

 	private static void createRowwiseIndexes(MatrixIndexes first, MatrixIndexes last, long ncblks, HashSet<MatrixIndexes> ret) {
 		if( first.getRowIndex()<=0 || first.getColumnIndex()<=0 )
 			throw new RuntimeException("Invalid computed first index: "+first.toString());
 		if( last.getRowIndex()<=0 || last.getColumnIndex()<=0 )
 			throw new RuntimeException("Invalid computed last index: "+last.toString());

 		//add first row block
 		ret.add(first);

 		//add blocks in between first and last
 		if( !first.equals(last) ) {
 			boolean fill = first.getRowIndex()==last.getRowIndex()
 				&& first.getColumnIndex() > last.getColumnIndex();
 			for( long k1=first.getRowIndex(); k1<=last.getRowIndex(); k1++ ) {
 				long k2_start = (k1==first.getRowIndex() ? first.getColumnIndex()+1 : 1);
 				long k2_end = (k1==last.getRowIndex() && !fill) ? last.getColumnIndex()-1 : ncblks;
 				for( long k2=k2_start; k2<=k2_end; k2++ )
 					ret.add(new MatrixIndexes(k1,k2));
 			}
 			ret.add(last);
 		}
 	}

 	private static void createColwiseIndexes(MatrixIndexes first, MatrixIndexes last, long nrblks, HashSet<MatrixIndexes> ret) {
 		if( first.getRowIndex()<=0 || first.getColumnIndex()<=0 )
 			throw new RuntimeException("Invalid computed first index: "+first.toString());
 		if( last.getRowIndex()<=0 || last.getColumnIndex()<=0 )
 			throw new RuntimeException("Invalid computed last index: "+last.toString());

 		//add first row block
 		ret.add(first);

 		//add blocks in between first and last
 		if( !first.equals(last) ) {
 			boolean fill = first.getColumnIndex()==last.getColumnIndex()
 					&& first.getRowIndex() > last.getRowIndex();
 			for( long k1=first.getColumnIndex(); k1<=last.getColumnIndex(); k1++ ) {
 				long k2_start = ((k1==first.getColumnIndex()) ? first.getRowIndex()+1 : 1);
 				long k2_end = ((k1==last.getColumnIndex() && !fill) ? last.getRowIndex()-1 : nrblks);
 				for( long k2=k2_start; k2<=k2_end; k2++ )
 					ret.add(new MatrixIndexes(k2,k1));
 			}
 			ret.add(last);
 		}
 	}

 	private static void createNonZeroIndexes(DataCharacteristics mcIn, DataCharacteristics mcOut,
 	                                         MatrixBlock in, long row_offset, long col_offset, boolean rowwise, HashSet<MatrixIndexes> ret) {
 		Iterator<IJV> iter = in.getSparseBlockIterator();
 		while( iter.hasNext() ) {
 			IJV cell = iter.next();
 			ret.add(computeResultBlockIndex(new MatrixIndexes(),
 				row_offset+cell.getI(), col_offset+cell.getJ(), mcIn, mcOut, rowwise));
 		}
 	}

 	private static Map<MatrixIndexes, MatrixBlock> createAllResultBlocks(Collection<MatrixIndexes> rix, long nnz, DataCharacteristics mcOut) {
 		return rix.stream().collect(Collectors.toMap(ix -> ix, ix -> createResultBlock(ix, nnz, rix.size(), mcOut)));
 	}

 	private static MatrixBlock createResultBlock(MatrixIndexes ix, long nnz, int nBlocks, DataCharacteristics mcOut) {
 		//compute indexes
 		long bi = ix.getRowIndex();
 		long bj = ix.getColumnIndex();
 		int lbrlen = UtilFunctions.computeBlockSize(mcOut.getRows(), bi, mcOut.getBlocksize());
 		int lbclen = UtilFunctions.computeBlockSize(mcOut.getCols(), bj, mcOut.getBlocksize());
 		if( lbrlen<1 || lbclen<1 )
 			throw new DMLRuntimeException("Computed block dimensions ("+bi+","+bj+" -> "+lbrlen+","+lbclen+") are invalid!");
 		//create result block
 		int estnnz = (int) (nnz/nBlocks); //force initial capacity per row to 1, for many blocks
 		boolean sparse = MatrixBlock.evalSparseFormatInMemory(lbrlen, lbclen, estnnz);
 		return new MatrixBlock(lbrlen, lbclen, sparse, estnnz);
 	}

 	private static void reshapeDense(MatrixBlock in, long row_offset, long col_offset, Map<MatrixIndexes,MatrixBlock> rix,
 	                                 DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise ) {
 		if( in.isEmptyBlock(false) )
 			return;

 		int rlen = in.rlen;
 		int clen = in.clen;
 		double[] a = in.getDenseBlockValues();

 		//append all values to right blocks
 		MatrixIndexes ixtmp = new MatrixIndexes();
 		for( int i=0, aix=0; i<rlen; i++, aix+=clen )
 		{
 			long ai = row_offset+i;
 			for( int j=0; j<clen; j++ )
 			{
 				double val = a[ aix+j ];
 				if( val !=0 ) {
 					long aj = col_offset+j;
 					ixtmp = computeResultBlockIndex(ixtmp, ai, aj, mcIn, mcOut, rowwise);
 					MatrixBlock out = rix.get(ixtmp);
 					if( out == null )
 						throw new DMLRuntimeException("Missing result block: "+ixtmp);
 					ixtmp = computeInBlockIndex(ixtmp, ai, aj, mcIn, mcOut, rowwise);
 					out.appendValue((int)ixtmp.getRowIndex(),(int)ixtmp.getColumnIndex(), val);
 				}
 			}
 		}

 		//cleanup for sparse blocks
 		if( !rowwise && mcIn.getRows() > 1 ) {
 			rix.values().stream().filter(b -> b.sparse)
 				.forEach(b -> b.sortSparseRows());
 		}
 	}

 	private static void reshapeSparse(MatrixBlock in, long row_offset, long col_offset, Map<MatrixIndexes,MatrixBlock> rix,
 	                                  DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise ) {
 		if( in.isEmptyBlock(false) )
 			return;

 		int rlen = in.rlen;
 		SparseBlock a = in.sparseBlock;

 		//append all values to right blocks
 		MatrixIndexes ixtmp = new MatrixIndexes();
 		for( int i=0; i<rlen; i++ ) {
 			if( a.isEmpty(i) ) continue;
 			long ai = row_offset+i;
 			int apos = a.pos(i);
 			int alen = a.size(i);
 			int[] aix = a.indexes(i);
 			double[] avals = a.values(i);
 			for( int j=apos; j<apos+alen; j++ )  {
 				long aj = col_offset+aix[j];
 				ixtmp = computeResultBlockIndex(ixtmp, ai, aj, mcIn, mcOut, rowwise);
 				MatrixBlock out = getAllocatedBlock(rix, ixtmp);
 				ixtmp = computeInBlockIndex(ixtmp, ai, aj, mcIn, mcOut, rowwise);
 				out.appendValue((int)ixtmp.getRowIndex(),(int)ixtmp.getColumnIndex(), avals[j]);
 			}
 		}

 		//cleanup for sparse blocks
 		if( !rowwise && mcIn.getRows() > 1 ) {
 			rix.values().stream().filter(b -> b.sparse)
 				.forEach(b -> b.sortSparseRows());
 		}
 	}

 	private static MatrixBlock getAllocatedBlock(Map<MatrixIndexes,MatrixBlock> rix, MatrixIndexes ix) {
 		MatrixBlock out = rix.get(ix);
 		if( out == null )
 			throw new DMLRuntimeException("Missing result block: "+ix);
 		return out;
 	}

 	/**
 	 * Assumes internal (0-begin) indices ai, aj as input; computes external block indexes (1-begin)
 	 *
 	 * @param ixout matrix indexes for reuse
 	 * @param ai row index
 	 * @param aj column index
 	 * @param mcIn input matrix characteristics
 	 * @param mcOut output matrix characteristics
 	 * @param rowwise row-wise extraction
 	 * @return matrix indexes
 	 */
 	private static MatrixIndexes computeResultBlockIndex(MatrixIndexes ixout, long ai, long aj,
 		DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise )
 	{
 		long tempc = computeGlobalCellIndex(mcIn, ai, aj, rowwise);
 		long ci = rowwise ? tempc/mcOut.getCols() : tempc%mcOut.getRows();
 		long cj = rowwise ? tempc%mcOut.getCols() : tempc/mcOut.getRows();
 		long bci = ci/mcOut.getBlocksize() + 1;
 		long bcj = cj/mcOut.getBlocksize() + 1;
 		return ixout.setIndexes(bci, bcj);
 	}

 	private static MatrixIndexes computeInBlockIndex(MatrixIndexes ixout, long ai, long aj,
 	                                                 DataCharacteristics mcIn, DataCharacteristics mcOut, boolean rowwise )
 	{
 		long tempc = computeGlobalCellIndex(mcIn, ai, aj, rowwise);
 		long ci = rowwise ? (tempc/mcOut.getCols())%mcOut.getBlocksize() :
 			(tempc%mcOut.getRows())%mcOut.getBlocksize();
 		long cj = rowwise ? (tempc%mcOut.getCols())%mcOut.getBlocksize() :
 			(tempc/mcOut.getRows())%mcOut.getBlocksize();
 		return ixout.setIndexes(ci, cj);
 	}

 	private static long computeGlobalCellIndex(DataCharacteristics mcIn, long ai, long aj, boolean rowwise) {
 		return rowwise ? ai*mcIn.getCols()+aj : ai+mcIn.getRows()*aj;
 	}

 	private static MatrixBlock removeEmptyRows(MatrixBlock in, MatrixBlock ret, MatrixBlock select, boolean emptyReturn) {
 		final int m = in.rlen;
 		final int n = in.clen;
 		boolean[] flags = null;
 		int rlen2 = 0;

 		//Step 0: special case handling
 		if( SHALLOW_COPY_REORG && SPARSE_OUTPUTS_IN_CSR
 			&& in.sparse && !in.isEmptyBlock(false)
 			&& select==null && in.sparseBlock instanceof SparseBlockCSR
 			&& in.nonZeros < Integer.MAX_VALUE )
 		{
 			//create the output in csr format with a shallow copy of arrays for column
 			//indexes and values (heuristic: shallow copy better than copy to dense)
 			SparseBlockCSR sblock = (SparseBlockCSR) in.sparseBlock;
 			int lrlen = 0;
 			for( int i=0; i<m; i++ )
 				lrlen += sblock.isEmpty(i) ? 0 : 1;
 			//check for sparse output representation, otherwise
 			//fall back to default pass to allocate in dense format
 			if( MatrixBlock.evalSparseFormatInMemory(lrlen, n, in.nonZeros) ) {
 				int[] rptr = new int[lrlen+1];
 				for( int i=0, j=0, pos=0; i<m; i++ )
 					if( !sblock.isEmpty(i) ) {
 						pos += sblock.size(i);
 						rptr[++j] = pos;
 					}
 				ret.reset(lrlen, in.clen, true);
 				ret.sparseBlock = new SparseBlockCSR(rptr,
 					sblock.indexes(), sblock.values(), (int)in.nonZeros);
 				ret.nonZeros = in.nonZeros;
 				return ret;
 			}
 		}

 		//Step 1: scan block and determine non-empty rows
 		if(select == null)
 		{
 			flags = new boolean[ m ]; //false
 			if( in.sparse ) { //SPARSE
 				SparseBlock a = in.sparseBlock;
 				for ( int i=0; i < m; i++ )
 					rlen2 += (flags[i] = !a.isEmpty(i)) ? 1 : 0;
 			}
 			else { //DENSE
 				DenseBlock a = in.getDenseBlock();
 				for( int i=0; i<m; i++ ) {
 					double[] avals = a.values(i);
 					int aix = a.pos(i);
 					for(int j=0; j<n; j++)
 						if( avals[aix+j] != 0 ) {
 							flags[i] = true;
 							rlen2++;
 							//early abort for current row
 							break;
 						}
 				}
 			}
 		}
 		else {
 			flags = DataConverter.convertToBooleanVector(select);
 			rlen2 = (int)select.getNonZeros();
 		}

 		//Step 2: reset result and copy rows
 		//dense stays dense if correct input representation (but robust for any input),
 		//sparse might be dense/sparse
 		rlen2 = Math.max(rlen2, emptyReturn ? 1 : 0); //ensure valid output
 		boolean sp = MatrixBlock.evalSparseFormatInMemory(rlen2, n, in.nonZeros);
 		ret.reset(rlen2, n, sp);
 		if( in.isEmptyBlock(false) )
 			return ret;

 		if( SHALLOW_COPY_REORG && m == rlen2 ) {
 			ret.sparse = in.sparse;
 			if( ret.sparse )
 				ret.sparseBlock = in.sparseBlock;
 			else
 				ret.denseBlock = in.denseBlock;
 		}
 		else if( in.sparse ) //* <- SPARSE
 		{
 			//note: output dense or sparse
 			for( int i=0, cix=0; i<m; i++ )
 				if( flags[i] ) {
 					ret.appendRow(cix++, in.sparseBlock.get(i),
 						!SHALLOW_COPY_REORG);
 				}
 		}
 		else if( !in.sparse && !ret.sparse )  //DENSE <- DENSE
 		{
 			ret.allocateDenseBlock();
 			DenseBlock a = in.getDenseBlock();
 			DenseBlock c = ret.getDenseBlock();
 			for( int i=0, ci=0; i<m; i++ )
 				if( flags[i] ) {
 					System.arraycopy(a.values(i),
 						a.pos(i), c.values(ci), c.pos(ci), n);
 					ci++; //target row index
 				}
 		}
 		else //SPARSE <- DENSE
 		{
 			ret.allocateSparseRowsBlock();
 			DenseBlock a = in.getDenseBlock();
 			for( int i=0, ci=0; i<m; i++ )
 				if( flags[i] ) {
 					double[] avals = a.values(i);
 					int aix = a.pos(i);
 					for( int j=0; j<n; j++ )
 						ret.appendValue(ci, j, avals[aix+j]);
 					ci++;
 				}
 		}

 		//check sparsity
 		ret.nonZeros = (select==null) ?
 			in.nonZeros : ret.recomputeNonZeros();
 		ret.examSparsity();

 		return ret;
 	}

 	private static MatrixBlock removeEmptyColumns(MatrixBlock in, MatrixBlock ret, MatrixBlock select, boolean emptyReturn) {
 		final int m = in.rlen;
 		final int n = in.clen;

 		//Step 1: scan block and determine non-empty columns
 		//(we optimized for cache-friendly behavior and hence don't do early abort)
 		boolean[] flags = null;

 		if (select == null)
 		{
 			flags = new boolean[ n ]; //false
 			if( in.sparse ) { //SPARSE
 				SparseBlock a = in.sparseBlock;
 				for( int i=0; i<m; i++ )
 					if ( !a.isEmpty(i) ) {
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						for( int j=apos; j<apos+alen; j++ )
 							flags[ aix[j] ] = true;
 					}
 			}
 			else { //DENSE
 				DenseBlock a = in.getDenseBlock();
 				for( int i=0; i<m; i++ ) {
 					double[] avals = a.values(i);
 					int aix = a.pos(i);
 					for( int j=0; j<n; j++ )
 						flags[j] |= (avals[aix+j] != 0);
 				}
 			}
 		}
 		else {
 			flags = DataConverter.convertToBooleanVector(select);
 		}

 		//Step 2: determine number of columns
 		int clen2 = 0;
 		for( int j=0; j<n; j++ ) {
 			clen2 += flags[j] ? 1 : 0;
 		}

 		//Step 3: reset result and copy columns
 		//dense stays dense if correct input representation (but robust for any input),
 		// sparse might be dense/sparse
 		clen2 = Math.max(clen2, emptyReturn ? 1 : 0); //ensure valid output
 		boolean sp = MatrixBlock.evalSparseFormatInMemory(m, clen2, in.nonZeros);
 		ret.reset(m, clen2, sp);
 		if( in.isEmptyBlock(false) )
 			return ret;

 		if( SHALLOW_COPY_REORG && n == clen2 ) {
 			//quick path: shallow copy if unmodified
 			ret.sparse = in.sparse;
 			if( ret.sparse )
 				ret.sparseBlock = in.sparseBlock;
 			else
 				ret.denseBlock = in.denseBlock;
 		}
 		else
 		{
 			//create mapping of flags to target indexes
 			int[] cix = new int[n];
 			for( int j=0, pos=0; j<n; j++ ) {
 				if( flags[j] )
 					cix[j] = pos++;
 			}

 			//deep copy of modified outputs
 			if( in.sparse ) //* <- SPARSE
 			{
 				//note: output dense or sparse
 				SparseBlock a = in.sparseBlock;
 				for( int i=0; i<m; i++ )
 					if ( !a.isEmpty(i) ) {
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
 						double[] avals = a.values(i);
 						for( int j=apos; j<apos+alen; j++ )
 							if( flags[aix[j]] )
 								ret.appendValue(i, cix[aix[j]], avals[j]);
 					}
 			}
 			else if( !in.sparse && !ret.sparse ) { //DENSE <- DENSE
 				ret.allocateDenseBlock();
 				DenseBlock a = in.getDenseBlock();
 				DenseBlock c = ret.getDenseBlock();
 				for( int i=0; i<m; i++ ) {
 					double[] avals = a.values(i);
 					double[] cvals = c.values(i);
 					int aix = a.pos(i);
 					int lcix = c.pos(i);
 					for( int j=0; j<n; j++ )
 						if( flags[j] )
 							 cvals[ lcix+cix[j] ] = avals[aix+j];
 				}
 			}
 			else { //SPARSE <- DENSE
 				ret.allocateSparseRowsBlock();
 				DenseBlock a = in.getDenseBlock();
 				for( int i=0; i<m; i++ ) {
 					double[] avals = a.values(i);
 					int aix = a.pos(i);
 					for( int j=0; j<n; j++ ) {
 						double aval = avals[aix+j];
 						if( flags[j] && aval!=0 )
 							 ret.appendValue(i, cix[j], aval);
 					}
 				}
 			}
 		}

 		//check sparsity
 		ret.nonZeros = (select==null) ?
 			in.nonZeros : ret.recomputeNonZeros();
 		ret.examSparsity();

 		return ret;
 	}

 	private static MatrixBlock rexpandRows(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore) {
 		//set meta data
 		final int rlen = max;
 		final int clen = in.rlen;
 		final long nnz = in.nonZeros;
 		boolean sp = MatrixBlock.evalSparseFormatInMemory(rlen, clen, nnz);
 		ret.reset(rlen, clen, sp);

 		//setup temporary array for 'buffered append w/ sorting' in order
 		//to mitigate performance issues due to random row access for large m
 		final int blksize = 1024*1024; //max 12MB
 		int[] tmpi = new int[Math.min(blksize,clen)];
 		double[] tmp = new double[Math.min(blksize,clen)];

 		//expand input vertically  (input vector likely dense
 		//but generic implementation for general case)
 		for( int i=0; i<clen; i+=blksize )
 		{
 			//create sorted block indexes (append buffer)
 			int len = Math.min(blksize, clen-i);
 			copyColVector(in, i, tmp, tmpi, len);
 			SortUtils.sortByValue(0, len, tmp, tmpi);

 			//process current append buffer
 			for( int j=0; j<len; j++ )
 			{
 				//get value and cast if necessary (table)
 				double val = tmp[j];
 				if( cast )
 					val = UtilFunctions.toLong(val);

 				//handle invalid values if not to be ignored
 				if( !ignore && val<=0 )
 					throw new DMLRuntimeException("Invalid input value <= 0 for ignore=false: "+val);

 				//set expanded value if matching
 				//note: tmpi populated with i+j indexes, then sorted
 				if( val == Math.floor(val) && val >= 1 && val <= max )
 					ret.appendValue((int)(val-1), tmpi[j], 1);
 			}
 		}

 		//ensure valid output sparse representation
 		//(necessary due to cache-conscious processing w/ unstable sort)
 		if( ret.isInSparseFormat() )
 			ret.sortSparseRows();

 		return ret;
 	}

 	private static MatrixBlock rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int k) {
 		//set meta data
 		final int rlen = in.rlen;
 		final int clen = max;
 		final long nnz = in.nonZeros;
 		boolean sp = MatrixBlock.evalSparseFormatInMemory(rlen, clen, nnz);
 		ret.reset(rlen, clen, sp);
 		ret.allocateBlock();

 		//execute rexpand columns
 		long rnnz = 0; //real nnz (due to cutoff max)
 		if( k <= 1 || in.getNumRows() <= PAR_NUMCELL_THRESHOLD
 			|| (sp && SPARSE_OUTPUTS_IN_CSR) ) {
 			rnnz = rexpandColumns(in, ret, max, cast, ignore, 0, rlen);
 		}
 		else {
 			try {
 				ExecutorService pool = CommonThreadPool.get(k);
 				ArrayList<RExpandColsTask> tasks = new ArrayList<>();
 				int blklen = (int)(Math.ceil((double)rlen/k/8));
 				for( int i=0; i<8*k & i*blklen<rlen; i++ )
 					tasks.add(new RExpandColsTask(in, ret,
 						max, cast, ignore, i*blklen, Math.min((i+1)*blklen, rlen)));
 				List<Future<Long>> taskret = pool.invokeAll(tasks);
 				pool.shutdown();
 				for( Future<Long> task : taskret )
 					rnnz += task.get();
 			}
 			catch(Exception ex) {
 				throw new DMLRuntimeException(ex);
 			}
 		}

 		//post-processing
 		ret.setNonZeros(rnnz);

 		return ret;
 	}

 	private static long rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int rl, int ru) {
 		//initialize auxiliary data structures
 		int lnnz = 0;
 		int[] cix = null;
 		if( ret.sparse && SPARSE_OUTPUTS_IN_CSR ) {
 			cix = new int[in.rlen];
 			Arrays.fill(cix, -1);
 		}

 		//expand input horizontally (input vector likely dense
 		//but generic implementation for general case)
 		DenseBlock cd = ret.getDenseBlock();
 		SparseBlock cs = ret.getSparseBlock();
 		for( int i=rl; i<ru; i++ )
 		{
 			//get value and cast if necessary (table)
 			double val = in.quickGetValue(i, 0);
 			if( cast )
 				val = UtilFunctions.toLong(val);

 			//handle invalid values if not to be ignored
 			if( !ignore && val<=0 )
 				throw new DMLRuntimeException("Invalid input value <= 0 for ignore=false: "+val);

 			//set expanded value if matching
 			if( val == Math.floor(val) && val >= 1 && val <= max ) {
 				//update target without global nnz maintenance
 				if( cix != null ) {
 					cix[i] = (int)(val-1);
 				}
 				else if( ret.sparse ) {
 					cs.allocate(i, 1);
 					cs.append(i, (int)(val-1), 1);
 				}
 				else
 					cd.set(i, (int)(val-1), 1);
 				lnnz ++;
 			}
 		}

 		//init CSR block once to avoid repeated updates of row pointers on append
 		if( cix != null )
 			ret.sparseBlock = new SparseBlockCSR(in.rlen, lnnz, cix);

 		//recompute nnz of partition
 		return ret.setNonZeros(lnnz);
 	}

 	private static void copyColVector( MatrixBlock in, int ixin, double[] tmp, int[] tmpi, int len)
 	{
 		//copy value array from input matrix
 		if( in.isEmptyBlock(false) ) {
 			Arrays.fill(tmp, 0, len, 0);
 		}
 		else if( in.sparse ){ //SPARSE
 			for( int i=0; i<len; i++ )
 				tmp[i] = in.quickGetValue(ixin+i, 0);
 		}
 		else { //DENSE
 			System.arraycopy(in.getDenseBlockValues(), ixin, tmp, 0, len);
 		}

 		//init index array
 		for( int i=0; i<len; i++ )
 			tmpi[i] = ixin + i;
 	}


 	/**
 	 * Utility method for in-place transformation of an ascending sorted
 	 * order into a descending sorted order. This method assumes dense
 	 * column vectors as input.
 	 *
 	 * @param m1 matrix
 	 */
 	private static void sortReverseDense( MatrixBlock m1 ) {
 		int rlen = m1.rlen;
 		double[] a = m1.getDenseBlockValues();
 		for( int i=0; i<rlen/2; i++ ) {
 			double tmp = a[i];
 			a[i] = a[rlen - i -1];
 			a[rlen - i - 1] = tmp;
 		}
 	}

 	private static void sortReverseDense( int[] a ) {
 		int rlen = a.length;
 		for( int i=0; i<rlen/2; i++ ) {
 			int tmp = a[i];
 			a[i] = a[rlen - i -1];
 			a[rlen - i - 1] = tmp;
 		}
 	}

 	private static void sortReverseDense( double[] a ) {
 		int rlen = a.length;
 		for( int i=0; i<rlen/2; i++ ) {
 			double tmp = a[i];
 			a[i] = a[rlen - i -1];
 			a[rlen - i - 1] = tmp;
 		}
 	}

 	// Method to merge all blocks of a specified length.
 	private static void mergeSortedBlocks(int blockLength, int[] valueIndexes, double[] values, int k){
 		// Check if the blocklength is bigger than the size of the values
 		// if it is smaller then merge the blocks, if not merging is done

 		int vlen = values.length;
 		int mergeBlockSize = blockLength * 2;
 		if (mergeBlockSize <= vlen + blockLength){
 			try {
 				ExecutorService pool = CommonThreadPool.get(k);
 				ArrayList<MergeTask> tasks = new ArrayList<>();
 				for( int i=0; i*mergeBlockSize<vlen; i++ ){
 					int start = i*mergeBlockSize;
 					if (start + blockLength < vlen){
 						int stop = Math.min(vlen , (i+1)*mergeBlockSize);
 						tasks.add(new MergeTask(start, stop, blockLength, valueIndexes, values));
 					}
 				}
 				CommonThreadPool.invokeAndShutdown(pool, tasks);
 				// recursive merge larger blocks.
 				mergeSortedBlocks(mergeBlockSize, valueIndexes, values, k);
 			}
 			catch(Exception ex) {
 				throw new DMLRuntimeException(ex);
 			}
 		}
 	}

 	private static void sortBySecondary(int rl, int ru, double[] values, int[] vix, MatrixBlock in, int[] by, int off) {
 		//find runs of equal values in current offset and index range
 		//replace value by next column, sort, and recurse until single value
 		for( int i=rl; i<ru-1; i++ ) {
 			double tmp = values[i];
 			//determine run of equal values
 			int len = 0;
 			while( i+len+1<ru && tmp==values[i+len+1] )
 				len++;
 			//temp value replacement and recursive sort
 			if( len > 0 ) {
 				double old = values[i];
 				//extract values of next column
 				for(int j=i; j<i+len+1; j++)
 					values[j] = in.quickGetValue(vix[j], by[off]-1);
 				//sort values, incl recursive decent
 				SortUtils.sortByValue(i, i+len+1, values, vix);
 				if( off+1 < by.length )
 					sortBySecondary(i, i+len+1, values, vix, in, by, off+1);
 				//reset values of previous level
 				Arrays.fill(values, i, i+len+1, old);
 				i += len; //skip processed run
 			}
 		}
 	}

 	private static void sortIndexesStable(int rl, int ru, double[] values, int[] vix, MatrixBlock in, int[] by, int off) {
 		for( int i=rl; i<ru-1; i++ ) {
 			double tmp = values[i];
 			//determine run of equal values
 			int len = 0;
 			while( i+len+1<ru && tmp==values[i+len+1] )
 				len++;
 			//temp value replacement and recursive decent
 			if( len > 0 ) {
 				if( off < by.length ) {
 					//extract values of next column
 					for(int j=i; j<i+len+1; j++)
 						values[j] = in.quickGetValue(vix[j], by[off]-1);
 					sortIndexesStable(i, i+len+1, values, vix, in, by, off+1);
 				}
 				else //unstable sort of run indexes (equal value guaranteed)
 					Arrays.sort(vix, i, i+len+1);
 				i += len; //skip processed run
 			}
 		}
 	}

 	private static boolean isValidSortByList(int[] by, int clen) {
 		if( by == null || by.length==0 || by.length>clen )
 			return false;
 		for(int i=0; i<by.length; i++)
 			if( by[i] <= 0 || clen < by[i])
 				return false;
 		return true;
 	}

 	@SuppressWarnings("unused")
 	private static void countAgg( int[] c, int[] ai, final int len )
 	{
 		final int bn = len%8;

 		//compute rest, not aligned to 8-block
 		for( int i=0; i<bn; i++ )
 			c[ ai[i] ]++;

 		//unrolled 8-block (for better instruction level parallelism)
 		for( int i=bn; i<len; i+=8 )
 		{
 			c[ ai[ i+0 ] ] ++;
 			c[ ai[ i+1 ] ] ++;
 			c[ ai[ i+2 ] ] ++;
 			c[ ai[ i+3 ] ] ++;
 			c[ ai[ i+4 ] ] ++;
 			c[ ai[ i+5 ] ] ++;
 			c[ ai[ i+6 ] ] ++;
 			c[ ai[ i+7 ] ] ++;
 		}
 	}

 	private static void countAgg( int[] c, int[] aix, int ai, final int len )
 	{
 		final int bn = len%8;

 		//compute rest, not aligned to 8-block
 		for( int i=ai; i<ai+bn; i++ )
 			c[ aix[i] ]++;

 		//unrolled 8-block (for better instruction level parallelism)
 		for( int i=ai+bn; i<ai+len; i+=8 )
 		{
 			c[ aix[ i+0 ] ] ++;
 			c[ aix[ i+1 ] ] ++;
 			c[ aix[ i+2 ] ] ++;
 			c[ aix[ i+3 ] ] ++;
 			c[ aix[ i+4 ] ] ++;
 			c[ aix[ i+5 ] ] ++;
 			c[ aix[ i+6 ] ] ++;
 			c[ aix[ i+7 ] ] ++;
 		}
 	}

 	@SuppressWarnings("unused")
 	private static class AscRowComparator implements Comparator<Integer>
 	{
 		private MatrixBlock _mb = null;
 		private int _col = -1;

 		public AscRowComparator( MatrixBlock mb, int col )
 		{
 			_mb = mb;
 			_col = col;
 		}

 		@Override
 		public int compare(Integer arg0, Integer arg1)
 		{
 			double val0 = _mb.quickGetValue(arg0, _col);
 			double val1 = _mb.quickGetValue(arg1, _col);
 			return (val0 < val1 ? -1 : (val0 == val1 ? 0 : 1));
 		}
 	}

 	@SuppressWarnings("unused")
 	private static class DescRowComparator implements Comparator<Integer>
 	{
 		private MatrixBlock _mb = null;
 		private int _col = -1;

 		public DescRowComparator( MatrixBlock mb, int col )
 		{
 			_mb = mb;
 			_col = col;
 		}

 		@Override
 		public int compare(Integer arg0, Integer arg1)
 		{
 			double val0 = _mb.quickGetValue(arg0, _col);
 			double val1 = _mb.quickGetValue(arg1, _col);
 			return (val0 > val1 ? -1 : (val0 == val1 ? 0 : 1));
 		}
 	}

 	private static class TransposeTask implements Callable<Object>
 	{
 		private MatrixBlock _in = null;
 		private MatrixBlock _out = null;
 		private boolean _row = false;
 		private int _rl = -1;
 		private int _ru = -1;
 		private int[] _cnt = null;

 		protected TransposeTask(MatrixBlock in, MatrixBlock out, boolean row, int rl, int ru, int[] cnt) {
 			_in = in;
 			_out = out;
 			_row = row;
 			_rl = rl;
 			_ru = ru;
 			_cnt = cnt;
 		}

 		@Override
 		public Object call() {
 			int rl = _row ? _rl : 0;
 			int ru = _row ? _ru : _in.rlen;
 			int cl = _row ? 0 : _rl;
 			int cu = _row ? _in.clen : _ru;

 			//execute transpose operation
 			if( !_in.sparse && !_out.sparse )
 				transposeDenseToDense( _in, _out, rl, ru, cl, cu );
 			else if( _in.sparse && _out.sparse )
 				transposeSparseToSparse( _in, _out, rl, ru, cl, cu, _cnt );
 			else if( _in.sparse )
 				transposeSparseToDense( _in, _out, rl, ru, cl, cu );
 			else
 				throw new DMLRuntimeException("Unsupported multi-threaded dense-sparse transpose.");

 			return null;
 		}
 	}

 	private static class CountNnzTask implements Callable<int[]>
 	{
 		private MatrixBlock _in = null;
 		private int _rl = -1;
 		private int _ru = -1;

 		protected CountNnzTask(MatrixBlock in, int rl, int ru) {
 			_in = in;
 			_rl = rl;
 			_ru = ru;
 		}

 		@Override
 		public int[] call() {
 			return countNnzPerColumn(_in, _rl, _ru);
 		}
 	}

 	private static class RExpandColsTask implements Callable<Long>
 	{
 		private final MatrixBlock _in;
 		private final MatrixBlock _out;
 		private final int _max;
 		private final boolean _cast;
 		private final boolean _ignore;
 		private final int _rl;
 		private final int _ru;

 		protected RExpandColsTask(MatrixBlock in, MatrixBlock out, int max, boolean cast, boolean ignore, int rl, int ru) {
 			_in = in;
 			_out = out;
 			_max = max;
 			_cast = cast;
 			_ignore = ignore;
 			_rl = rl;
 			_ru = ru;
 		}

 		@Override
 		public Long call() {
 			return rexpandColumns(_in, _out, _max, _cast, _ignore, _rl, _ru);
 		}
 	}

 	private static class SortTask implements Callable<Object>
 	{
 		private final int _start;
 		private final int _end;
 		private final int[] _indexes;
 		private final double[] _values;


 		protected SortTask(int start, int end, int[] indexes, double[] values){
 			_start = start;
 			_end = end;
 			_indexes = indexes;
 			_values = values;

 		}

 		@Override
 		public Long call(){
 			SortUtils.sortByValue(_start, _end, _values, _indexes);
 			return 1l;
 		}

 	}

 	private static class MergeTask implements Callable<Object>
 	{
 		private final int _start;
 		private final int _end;
 		private final int _blockSize;
 		private final int[] _indexes;
 		private final double[] _values;

 		protected MergeTask(int start, int end, int blockSize, int[] indexes, double[] values){
 			_start = start;
 			_end = end;
 			_blockSize = blockSize;
 			_indexes = indexes;
 			_values = values;
 		}

 		@Override
 		public Long call(){
 			// Find the middle index of the two merge blocks
 			int middle = _start + _blockSize;
 			// Early return if edge case
 			if (middle == _end) return 1l;

 			// Pointer left side merge.
 			int pointlIndex = middle -1;
 			// Pointer to merge index.
 			int positionToAssign = _end - 1;

 			// Make copy of entire right side so that we use left side directly as output.
 			// Results in worst case (and most cases) allocation of extra array of half input size.
 			int[] rhsCopy = Arrays.copyOfRange(_indexes, middle, _end);
 			double[] rhsCopyV = Arrays.copyOfRange(_values, middle, _end);

 			int pointrIndex = _end - middle - 1;
 			while (positionToAssign >= _start && pointrIndex >= 0) {
 				if( pointrIndex < 0
 					|| ( pointlIndex >= _start && _values[pointlIndex] > rhsCopyV[pointrIndex]) )
 				{
 					_values[positionToAssign] = _values[pointlIndex];
 					_indexes[positionToAssign] = _indexes[pointlIndex];
 					pointlIndex--;
 					positionToAssign--;
 				}
 				else {
 					_values[positionToAssign] = rhsCopyV[pointrIndex];
 					_indexes[positionToAssign] = rhsCopy[pointrIndex];
 					positionToAssign--;
 					pointrIndex--;
 				}
 			}

 			return 1l;
 		}
 	}

 	private static class CopyTask implements Callable<Object>
 	{
 		private final MatrixBlock _in;
 		private final MatrixBlock _out;
 		private final int[] _vix;
 		private final int _rl;
 		private final int _ru;

 		protected CopyTask(MatrixBlock in, MatrixBlock out, int[] vix, int rl, int ru) {
 			_in = in;
 			_out = out;
 			_vix = vix;
 			_rl = rl;
 			_ru = ru;
 		}

 		@Override
 		public Object call() {
 			int clen = _in.clen;
 			if( !_out.sparse ) { //DENSE
 				DenseBlock a = _in.getDenseBlock();
 				DenseBlock c = _out.getDenseBlock();
 				for( int i=_rl; i<_ru; i++ )
 					System.arraycopy(a.values(_vix[i]), a.pos(_vix[i]), c.values(i), c.pos(i), clen);
 			}
 			else { //SPARSE
 				for( int i=_rl; i<_ru; i++ )
 					if( !_in.sparseBlock.isEmpty(_vix[i]) )
 						_out.sparseBlock.set(i, _in.sparseBlock.get(_vix[i]),
 							!SHALLOW_COPY_REORG); //row remains unchanged
 			}
 			return null;
 		}
 	}
 }