blob: e785112d13536876db3986ddac0001ae4bf77574 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix.data;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.util.ConvolutionUtils;
public class LibMatrixDNN {
protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName());
// Global switch enabling multi-threaded execution of the DNN operators in this class.
public static final boolean ALLOW_MULTI_THREADED_OPS = true;
// Using hashmap to avoid any performance impacts of multimap
// Cache of recycled dense buffers keyed by array length; SoftReference lets the GC
// reclaim them under memory pressure. Note: one entry per length (last writer wins).
private static final ConcurrentHashMap<Integer, SoftReference<double[]>> non_zeroed_double_arr = new ConcurrentHashMap<Integer, SoftReference<double[]>>();
// Arrays shorter than this are cheap to reallocate and are not worth caching.
private static final int NON_ZEROED_DOUBLE_ARR_THRESHOLD = 100;
/**
 * Offers a dense double array for later reuse. Arrays below the size
 * threshold are ignored since reallocating them is cheap.
 *
 * @param arr candidate buffer (may be null)
 */
public static void cacheReuseableData(double[] arr) {
	if(arr == null || arr.length < NON_ZEROED_DOUBLE_ARR_THRESHOLD)
		return;
	// Put the last recently removed arrays into the NON_ZEROED_DOUBLE_ARR as
	// it has lower probability of being garbage collected.
	// Autoboxing converts arr.length to the Integer key.
	non_zeroed_double_arr.put(arr.length, new SoftReference<double[]>(arr));
}
// Guard so the single-threaded warning is emitted at most once per JVM.
private static boolean warnedSingleThread = false;
/**
 * Warns (once) that LibMatrixDNN is running single-threaded.
 * Bug fix: the previous version threw a RuntimeException — a debugging
 * leftover that aborted execution instead of merely warning; the intended
 * logging code was present but commented out.
 */
private static void warnSingleThreaded() {
	if(!warnedSingleThread) {
		LOG.warn("WARN: Single thread execution in LibMatrixDNN");
		warnedSingleThread = true;
	}
}
/**
 * Retrieves (and removes) a previously cached dense array of the given length,
 * or null if none is cached, the length is below the caching threshold, or the
 * soft reference was already cleared by the GC.
 *
 * @param length requested array length
 * @return cached array of exactly this length, or null
 */
public static double[] getReuseableData(long length) {
	if(length >= NON_ZEROED_DOUBLE_ARR_THRESHOLD) {
		// Autoboxing produces an Integer key that is equal to the one used in
		// cacheReuseableData; the deprecated "new Integer(...)" wrapper the old
		// code used (and its claim of being required for remove) was unnecessary.
		SoftReference<double[]> ref = non_zeroed_double_arr.remove((int) length);
		if(ref != null) {
			return ref.get();
		}
	}
	return null;
}
// Work-item kinds dispatched by ConvTask.call(): max-pooling forward/backward and
// the looped im2col-based conv2d variants (forward, filter-gradient, data-gradient).
enum TaskType {
MaxPooling_Forward, MaxPooling_Backward,
LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData
}
// Scratch space holding precomputed per-position index bounds for convolution
// loops. NOTE(review): the fields are only populated/consumed outside this chunk —
// exact semantics of the R/S min/max arrays should be confirmed against callers.
public static class TemporaryConvolutionData {
public int [] minIndexArrR;
public int [] minIndexArrS;
public int [] maxIndexArrR;
public int [] maxIndexArrS;
int minCommonIndexS;
int maxCommonIndexS;
}
// Invocation counters per operator, split by sparse vs. dense inputs;
// reported by appendStatistics() and cleared by resetStatistics().
private static AtomicLong conv2dSparseCount = new AtomicLong(0);
private static AtomicLong conv2dDenseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdFilterSparseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdFilterDenseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdDataSparseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdDataDenseCount = new AtomicLong(0);
private static AtomicLong im2colSparseCount = new AtomicLong(0);
private static AtomicLong im2colDenseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0);
// Cumulative timers (nanoseconds) for the phases of the looped im2col convolutions.
private static AtomicLong loopedConvMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
/**
 * Appends the LibMatrixDNN operator counters and phase timings to the given
 * buffer. A no-op unless statistics are enabled and at least one conv2d call
 * was recorded. The emitted text is identical to previous releases.
 *
 * @param sb destination buffer
 */
public static void appendStatistics(StringBuilder sb) {
	if(!DMLScript.STATISTICS || (conv2dDenseCount.get() == 0 && conv2dSparseCount.get() == 0))
		return;
	sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t")
		.append(conv2dDenseCount.get()).append("/")
		.append(conv2dBwdFilterDenseCount.get()).append("/")
		.append(conv2dBwdDataDenseCount.get()).append("/")
		.append(im2colDenseCount.get()).append("/")
		.append(maxPoolBwdDenseCount.get()).append(".\n");
	sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t")
		.append(conv2dSparseCount.get()).append("/")
		.append(conv2dBwdFilterSparseCount.get()).append("/")
		.append(conv2dBwdDataSparseCount.get()).append("/")
		.append(im2colSparseCount.get()).append("/")
		.append(maxPoolBwdSparseCount.get()).append(".\n");
	// Phase timings in the fixed order: conv im2col/matmult, bwdF im2col/matmult,
	// bwdD col2im/matmult (nanoseconds converted to seconds).
	long[] times = {
		loopedConvIm2ColTime.get(), loopedConvMatMultTime.get(),
		loopedConvBwdFilterIm2ColTime.get(), loopedConvBwdFilterMatMultTime.get(),
		loopedConvBwdDataCol2ImTime.get(), loopedConvBwdDataMatMultTime.get() };
	if(times[0] != 0 || times[1] != 0) {
		sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t");
		for(int i = 0; i < times.length; i++) {
			sb.append(String.format("%.3f", times[i]*1e-9));
			sb.append(i == times.length - 1 ? " sec.\n" : "/");
		}
	}
}
/**
 * Resets every LibMatrixDNN statistics counter and phase timer to zero.
 */
public static void resetStatistics() {
	AtomicLong[] counters = {
		conv2dDenseCount, conv2dBwdFilterDenseCount, conv2dBwdDataDenseCount,
		im2colDenseCount, maxPoolBwdDenseCount,
		conv2dSparseCount, conv2dBwdFilterSparseCount, conv2dBwdDataSparseCount,
		im2colSparseCount, maxPoolBwdSparseCount,
		loopedConvIm2ColTime, loopedConvMatMultTime,
		loopedConvBwdFilterMatMultTime, loopedConvBwdFilterIm2ColTime,
		loopedConvBwdDataMatMultTime, loopedConvBwdDataCol2ImTime };
	for(AtomicLong c : counters)
		c.set(0);
}
/**
 * Parameter bag describing a conv2d/pooling invocation: input shape (N,C,H,W),
 * filter shape (K,R,S), strides, padding, derived output shape (P,Q), the
 * operand/result blocks, and the requested degree of parallelism.
 * Fixes: {@code toString} now carries {@code @Override}; {@code compare}
 * returns the boolean expression directly instead of if/return-true/return-false.
 */
public static class ConvolutionParameters {
	// Input tensor: N = batch size, C = channels, H/W = spatial height/width
	public int N; public int C; public int H; public int W;
	// Filter: K filters of size C x R x S; stride and zero-padding per axis
	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
	// Output spatial dims (P x Q) and degree of parallelism
	public int P; public int Q; public int numThreads;
	// Non-zero count of the output; -1 until computed
	public AtomicLong outputNNZ = new AtomicLong(-1);
	MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
	boolean reuseNonZeroedOutput = false;
	public TemporaryConvolutionData tmpData;

	/** Narrows a long to int, rejecting values exceeding Integer.MAX_VALUE. */
	private int convertToInt(long val) throws DMLRuntimeException {
		if( val > Integer.MAX_VALUE ) {
			throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
		}
		return (int) val;
	}

	/** Returns true iff all shape, stride, padding, and thread settings match. */
	public boolean compare(ConvolutionParameters that) {
		return this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
			&& this.K == that.K && this.R == that.R && this.S == that.S && this.stride_h == that.stride_h
			&& this.stride_w == that.stride_w && this.pad_h == that.pad_h
			&& this.pad_w == that.pad_w && this.numThreads == that.numThreads;
	}

	@Override
	public String toString() {
		return "(" + N + " " + C + " " + H + " " + W + " " + K + " " + R + " " + S + ")";
	}

	/**
	 * Long-argument constructor; validates that every dimension fits in an int.
	 * P and Q are computed inline (instead of via ConvolutionUtils) so that
	 * unknown (negative) dimensions yield P/Q = -1 rather than an error.
	 */
	public ConvolutionParameters(long N, long C, long H, long W,
			long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
		this.N = convertToInt(N);
		this.C = convertToInt(C);
		this.H = convertToInt(H);
		this.W = convertToInt(W);
		this.K = convertToInt(K);
		this.R = convertToInt(R);
		this.S = convertToInt(S);
		this.stride_h = convertToInt(stride_h);
		this.stride_w = convertToInt(stride_w);
		this.pad_h = convertToInt(pad_h);
		this.pad_w = convertToInt(pad_w);
		if(H >= 0 && pad_h >= 0 && R >= 0 && stride_h >= 0)
			P = (int) ((H + 2 * pad_h - R) / stride_h + 1);
		else
			P = -1; // unknown height dimension
		if(W >= 0 && pad_w >= 0 && S >= 0 && stride_w >= 0)
			Q = (int) ((W + 2 * pad_w - S) / stride_w + 1);
		else
			Q = -1; // unknown width dimension
		this.numThreads = numThreads;
	}

	/** Int-argument constructor; delegates P/Q computation to ConvolutionUtils. */
	public ConvolutionParameters(int N, int C, int H, int W,
			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int numThreads) {
		this.N = N;
		this.C = C;
		this.H = H;
		this.W = W;
		this.K = K;
		this.R = R;
		this.S = S;
		this.stride_h = stride_h;
		this.stride_w = stride_w;
		this.pad_h = pad_h;
		this.pad_w = pad_w;
		P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
		Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
		this.numThreads = numThreads;
	}

	public void setReuseNonZeroedOutput(boolean reuseNonZeroedOutput) {
		this.reuseNonZeroedOutput = reuseNonZeroedOutput;
	}

	/** Delegates to the output block's thread-safety flag. */
	public boolean isOutputThreadSafe() {
		return output.isThreadSafe();
	}
}
/**
 * Computes the conv2d data gradient: dX = conv2d_backward_data(filter, dout).
 * Validates operand shapes, densifies the (relatively small) filter, then runs
 * the looped im2col-based backward-data kernel single- or multi-threaded.
 * Bug fix: the shape-mismatch message previously said "conv2d_backward_filter".
 *
 * @param filter K x (C*R*S) filter matrix
 * @param dout N x (K*P*Q) output gradient
 * @param outputBlock result block receiving dX
 * @param params convolution shape/stride/pad parameters
 * @throws DMLRuntimeException on shape mismatch or non-positive strides
 */
public static void conv2d_backward_data(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = filter;
	params.input2 = dout;
	params.output = outputBlock;
	if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S ||
		dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
		throw new DMLRuntimeException("Incorrect input to conv2d_backward_data");
	}
	if(params.stride_h <= 0 || params.stride_w <= 0) {
		throw new DMLRuntimeException("Only positive strides supported");
	}
	// Convert filter (which is relatively small matrix) to dense
	if(params.input1.isInSparseFormat()) {
		params.input1.sparseToDense();
	}
	if(DMLScript.STATISTICS) {
		if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
			conv2dBwdDataSparseCount.addAndGet(1);
		}
		else {
			conv2dBwdDataDenseCount.addAndGet(1);
		}
	}
	params.reuseNonZeroedOutput = true;
	int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
	if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
		warnSingleThreaded();
		// Single reusable (P*Q) x K buffer for the rotated dout of each image
		MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
		dout_reshaped.allocateDenseBlock(true);
		for (int n = 0; n < params.N; n++) {
			doLoopedIm2ColConv2dBwdData(n, dout_reshaped, params);
		}
	}
	else {
		runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2dBwdData, params);
	}
}
/**
 * Computes the conv2d filter gradient: dW = conv2d_backward_filter(input, dout).
 * Validates operand shapes and dispatches to the looped im2col-based
 * backward-filter kernel, either single- or multi-threaded.
 *
 * @param input N x (C*H*W) input images
 * @param dout N x (K*P*Q) output gradient
 * @param outputBlock result block accumulating dW
 * @param params convolution shape/stride/pad parameters
 * @throws DMLRuntimeException on shape mismatch or non-positive strides
 */
public static void conv2d_backward_filter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = dout;
	params.output = outputBlock;
	boolean okInput = input.getNumRows() == params.N && input.getNumColumns() == params.C*params.H*params.W;
	boolean okDout = dout.getNumRows() == params.N && dout.getNumColumns() == params.K*params.P*params.Q;
	if(!okInput || !okDout) {
		throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
	}
	if(params.stride_h <= 0 || params.stride_w <= 0) {
		throw new DMLRuntimeException("Only positive strides supported");
	}
	if(DMLScript.STATISTICS) {
		AtomicLong counter = (input.isInSparseFormat() || dout.isInSparseFormat())
			? conv2dBwdFilterSparseCount : conv2dBwdFilterDenseCount;
		counter.addAndGet(1);
	}
	params.reuseNonZeroedOutput = true;
	int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
	if(ALLOW_MULTI_THREADED_OPS && constrainedNumThreads > 1) {
		runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2dBwdFilter, params);
		return;
	}
	warnSingleThreaded();
	// Reusable per-image buffers: im2col output and rotated dout
	MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
	im2ColOutBlock.allocateDenseBlock(true);
	MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
	dout_reshaped.allocateDenseBlock(true);
	for (int n = 0; n < params.N; n++) {
		params.output = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, dout_reshaped, params.output, params);
	}
}
/**
 * In-place element-wise addition: ret += elem.
 * Only a dense ret is supported; elem may be dense or sparse.
 *
 * @throws DMLRuntimeException on dimension mismatch or sparse ret
 */
private static void elementWiseInPlaceAddition(MatrixBlock ret, MatrixBlock elem) throws DMLRuntimeException {
	if(ret.getNumRows() != elem.getNumRows() || ret.getNumColumns() != elem.getNumColumns()) {
		throw new DMLRuntimeException("Incorrect dimensions");
	}
	if(ret.isInSparseFormat()) {
		throw new DMLRuntimeException("Sparse return format not supported");
	}
	if(!elem.isInSparseFormat()) {
		// Both dense: flat array addition
		final int len = ret.getNumRows() * ret.getNumColumns();
		for(int i = 0; i < len; i++)
			ret.denseBlock[i] += elem.denseBlock[i];
	}
	else if(!elem.isEmptyBlock()) {
		// Sparse elem: scatter non-zeros into the dense ret
		final int nCols = ret.getNumColumns();
		Iterator<IJV> it = elem.sparseBlock.getIterator();
		while(it.hasNext()) {
			IJV cell = it.next();
			ret.denseBlock[cell.getI()*nCols + cell.getJ()] += cell.getV();
		}
	}
}
/**
 * In-place transposed addition: ret += t(elem).
 * Only a dense ret is supported; elem may be dense or sparse.
 *
 * @throws DMLRuntimeException on dimension mismatch or sparse ret
 */
private static void elementWiseInPlaceTransposedAddition(MatrixBlock ret, MatrixBlock elem) throws DMLRuntimeException {
	if(ret.getNumRows() != elem.getNumColumns() || ret.getNumColumns() != elem.getNumRows()) {
		throw new DMLRuntimeException("Incorrect dimensions");
	}
	if(ret.isInSparseFormat()) {
		throw new DMLRuntimeException("Sparse return format not supported");
	}
	// elem(i,j) lands at ret(j,i), i.e. flat index j*retCols + i in row-major ret
	final int retCols = ret.getNumColumns();
	if(!elem.isInSparseFormat()) {
		int src = 0;
		for(int i = 0; i < elem.getNumRows(); i++) {
			for(int j = 0; j < elem.getNumColumns(); j++, src++) {
				ret.denseBlock[j*retCols + i] += elem.denseBlock[src];
			}
		}
	}
	else if(!elem.isEmptyBlock()) {
		Iterator<IJV> it = elem.sparseBlock.getIterator();
		while(it.hasNext()) {
			IJV cell = it.next();
			ret.denseBlock[cell.getJ()*retCols + cell.getI()] += cell.getV();
		}
	}
}
// Backward-data pass for a single image n: reshapes dout[n] into a (P*Q) x K
// block, multiplies it with the filter to get a (P*Q) x (C*R*S) product, then
// scatters that product back into image layout via col2im. Timing of the two
// phases is accumulated when statistics are enabled.
private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params) throws DMLRuntimeException {
MatrixBlock filter = params.input1;
MatrixBlock dout = params.input2;
// zeroOutSparseOutput=true: dout_reshaped is reused across images
doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
doCol2imOverSingleImage(n, temp, params);
long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS) {
loopedConvBwdDataMatMultTime.addAndGet(t2-t1);
loopedConvBwdDataCol2ImTime.addAndGet(t3-t2);
}
}
// Backward-filter pass for a single image n: builds the im2col matrix of
// input[n] channel by channel, reshapes dout[n] into (P*Q) x K, multiplies
// them into a (C*R*S) x K product, and accumulates its transpose into
// partialRetBlock (the running K x (C*R*S) filter gradient).
// Returns partialRetBlock to support the caller's accumulation loop.
private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n,
MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params) throws DMLRuntimeException {
long nnz = 0;
long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
for (int c = 0; c < params.C; c++) {
nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, params);
}
long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
im2ColOutBlock.setNonZeros(nnz);
// zeroOutSparseOutput=true: dout_reshaped is reused across images
doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
long t4 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS) {
loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
loopedConvBwdFilterIm2ColTime.addAndGet(t2-t1);
}
// temp holds (C*R*S) x K; the gradient is accumulated transposed as K x (C*R*S)
if(!temp.isEmptyBlock())
elementWiseInPlaceTransposedAddition(partialRetBlock, temp);
return partialRetBlock;
}
/**
 * Decomposes a flat column index j of an (C x H x W)-shaped row into
 * {channel, row, column} written to ret[0..2].
 *
 * @param j flat index within one matrix row
 * @param ret output array of length >= 3
 * @param H plane height
 * @param W plane width
 */
private static void computeTensorIndexes(int j, int [] ret, int H, int W) throws DMLRuntimeException {
	final int planeSize = H * W;
	ret[0] = j / planeSize;
	final int within = j % planeSize;
	ret[1] = within / W;
	ret[2] = within % W;
}
/**
 * Forward conv2d: output = conv2d(input, filter) via the looped im2col
 * formulation. Validates operand shapes and dispatches single- or
 * multi-threaded execution.
 *
 * @param input N x (C*H*W) input images
 * @param filter K x (C*R*S) filters
 * @param outputBlock result block receiving N x (K*P*Q)
 * @param params convolution shape/stride/pad parameters
 * @throws DMLRuntimeException on shape mismatch
 */
public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = filter;
	params.output = outputBlock;
	boolean okInput = input.getNumRows() == params.N && input.getNumColumns() == params.C*params.H*params.W;
	boolean okFilter = filter.getNumRows() == params.K && filter.getNumColumns() == params.C*params.R*params.S;
	if(!okInput || !okFilter) {
		throw new DMLRuntimeException("Incorrect input to conv2d");
	}
	if(DMLScript.STATISTICS) {
		AtomicLong counter = (input.isInSparseFormat() || filter.isInSparseFormat())
			? conv2dSparseCount : conv2dDenseCount;
		counter.addAndGet(1);
	}
	params.reuseNonZeroedOutput = true;
	int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
	if(ALLOW_MULTI_THREADED_OPS && constrainedNumThreads > 1) {
		runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2d, params);
		return;
	}
	warnSingleThreaded();
	// Single reusable im2col buffer for all images
	MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
	im2ColOutBlock.allocateDenseBlock(true);
	for (int n = 0; n < params.N; n++) {
		doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
	}
}
// Forward conv2d for a single image n: builds the im2col matrix of input[n]
// channel by channel, multiplies filter %*% im2col into a K x (P*Q) block, and
// copies that block into row n of the (dense) output. Handles a sparse or
// empty matmult result, and zeroes the destination when the output buffer is
// being reused (stale data from a previous use).
private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params) throws DMLRuntimeException {
long nnz = 0;
long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
for (int c = 0; c < params.C; c++) {
nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, params);
}
long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0;
im2ColOutBlock.setNonZeros(nnz);
MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if(DMLScript.STATISTICS) {
loopedConvIm2ColTime.addAndGet(t2 - t1);
loopedConvMatMultTime.addAndGet(t3 - t2);
}
// Destination range for image n within the flattened N x (K*P*Q) output
int destPos = n*params.K*params.P*params.Q;
int length = params.K*params.P*params.Q;
if(params.reuseNonZeroedOutput && matMultOutBlock.isEmptyBlock()) {
// Reused buffer may hold stale values; explicitly zero this image's range
Arrays.fill(params.output.denseBlock, destPos, destPos + length, 0);
}
else if(!matMultOutBlock.isEmptyBlock()) {
if(matMultOutBlock.isInSparseFormat()) {
// Scatter sparse (k, p*Q+q) cells into the dense output row
Iterator<IJV> iter = matMultOutBlock.sparseBlock.getIterator();
final int outOffset = n*params.K*params.P*params.Q;
while(iter.hasNext()) {
IJV ijv = iter.next();
int k = ijv.getI();
int p = ijv.getJ() / params.Q;
int q = ijv.getJ() % params.Q;
params.output.denseBlock[outOffset + k*params.P*params.Q + p*params.Q + q] = ijv.getV();
}
}
else
System.arraycopy(matMultOutBlock.denseBlock, 0, params.output.denseBlock, destPos, length);
}
}
/**
 * Max-pooling backward pass: routes each dout gradient cell to the argmax
 * position of the corresponding pooling window in the input.
 * Bug fixes in the error messages: the dout-shape message previously reported
 * the dimensions of {@code input} instead of {@code dout}, and both messages
 * reported K*P*Q as the expected column count instead of the values actually
 * checked (C*H*W for input, C*P*Q for dout).
 *
 * @param input N x (C*H*W) forward-pass input
 * @param dout N x (C*P*Q) gradient w.r.t. the pooled output
 * @param outputBlock dense result block receiving the input gradient
 * @param params pooling shape/stride/pad parameters
 * @throws DMLRuntimeException on shape mismatch or sparse output
 */
public static void maxpooling_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = dout;
	params.output = outputBlock;
	if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
		throw new DMLRuntimeException("Incorrect input dimensions in maxpooling_backward:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
	}
	if(dout.getNumColumns() != params.C*params.P*params.Q || dout.getNumRows() != params.N) {
		throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + dout.getNumRows() + " " + dout.getNumColumns() + " " + params.N + " " + params.C*params.P*params.Q);
	}
	if(DMLScript.STATISTICS) {
		if(input.isInSparseFormat() || dout.isInSparseFormat()) {
			maxPoolBwdSparseCount.addAndGet(1);
		}
		else {
			maxPoolBwdDenseCount.addAndGet(1);
		}
	}
	if (params.output.isInSparseFormat())
		throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");
	int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
	if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
		warnSingleThreaded();
		for (int n = 0; n < params.N; n++) {
			doPoolingBackward(n, params);
		}
	}
	else {
		runConvTask(constrainedNumThreads, 1, TaskType.MaxPooling_Backward, params);
	}
}
/**
 * Max-pooling backward for a single image n: dispatches to the specialized
 * kernel based on the sparse/dense format of input and dout. Only a dense
 * output block is supported.
 *
 * @throws DMLRuntimeException when the output block is sparse
 */
private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
	if (params.output.isInSparseFormat())
		throw new DMLRuntimeException("Only dense output supported for pooling_backward");
	double [] outputArray = params.output.getDenseBlock();
	double [] inputArray = params.input1.isInSparseFormat() ? null : params.input1.getDenseBlock();
	double [] doutArray = params.input2.isInSparseFormat() ? null : params.input2.getDenseBlock();
	// Four-way dispatch on (input, dout) formats
	if(inputArray != null && doutArray != null)
		doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
	else if(inputArray != null)
		doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
	else if(doutArray != null)
		doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
	else
		doPoolingBackwardSparseSparse(n, outputArray, params);
}
/**
 * Pooling backward for sparse input / dense dout: for every non-zero gradient
 * cell, finds the argmax of its pooling window in the sparse input and
 * accumulates the gradient there.
 */
private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	if (!params.input1.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
	final int chanSizeOut = params.P * params.Q;
	final int chanSizeIn = params.H * params.W;
	for (int c = 0; c < params.C; c++) {
		final int doutBase = (n*params.C + c) * chanSizeOut;
		final int inputOffset = (n*params.C + c) * chanSizeIn;
		for (int p = 0; p < params.P; p++) {
			// Vertical window bounds for this output row (clamped to the image)
			int hStart = p * params.stride_h - params.pad_h;
			final int hEnd = Math.min(hStart + params.R, params.H);
			hStart = Math.max(hStart, 0);
			for (int q = 0; q < params.Q; q++) {
				final double grad = doutArray[doutBase + p * params.Q + q];
				if(grad == 0)
					continue; // zero gradient contributes nothing
				int maxIndex = getMaxIndexSparse(hStart, hEnd, q, inputOffset, n, c, params.input1, params);
				outputArray[maxIndex] += grad;
			}
		}
	}
}
/**
 * Pooling backward for sparse input / sparse dout: iterates the non-zero
 * gradient cells of row n and routes each to the argmax of its window.
 */
private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	if (!params.input1.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
	// params.input2.isEmptyBlock() check is done by the caller
	final int [] cpq = new int[3];
	Iterator<IJV> it = params.input2.sparseBlock.getIterator(n, n+1);
	while(it.hasNext()) {
		IJV cell = it.next();
		// Decompose the flat column index into (channel, p, q)
		computeTensorIndexes(cell.getJ(), cpq, params.P, params.Q);
		final int c = cpq[0];
		final int p = cpq[1];
		final int q = cpq[2];
		final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
		int hStart = p * params.stride_h - params.pad_h;
		final int hEnd = Math.min(hStart + params.R, params.H);
		hStart = Math.max(hStart, 0);
		int maxIndex = getMaxIndexSparse(hStart, hEnd, q, inputOffset, n, c, params.input1, params);
		outputArray[maxIndex] += cell.getV();
	}
}
/**
 * Pooling backward for dense input / sparse dout: iterates the non-zero
 * gradient cells of row n and routes each to the argmax of its window.
 */
private static void doPoolingBackwardDenseSparse(int n, double [] inputArray,
		MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	// dout.isEmptyBlock() check is done by the caller
	final int [] cpq = new int[3];
	Iterator<IJV> it = dout.sparseBlock.getIterator(n, n+1);
	while(it.hasNext()) {
		IJV cell = it.next();
		// Decompose the flat column index into (channel, p, q)
		computeTensorIndexes(cell.getJ(), cpq, params.P, params.Q);
		final int c = cpq[0];
		final int p = cpq[1];
		final int q = cpq[2];
		final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
		int hStart = p * params.stride_h - params.pad_h;
		final int hEnd = Math.min(hStart + params.R, params.H);
		hStart = Math.max(hStart, 0);
		int maxIndex = getMaxIndex(hStart, hEnd, q, inputOffset, inputArray, params);
		outputArray[maxIndex] += cell.getV();
	}
}
/**
 * Pooling backward for dense input / dense dout: every gradient cell is
 * accumulated at the argmax position of its pooling window.
 */
private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray,
		double [] outputArray, ConvolutionParameters params) {
	final int chanSizeIn = params.H * params.W;
	final int chanSizeOut = params.P * params.Q;
	for (int c = 0; c < params.C; c++) {
		final int inputOffset = (n*params.C + c) * chanSizeIn;
		final int outputOffset = (n*params.C + c) * chanSizeOut;
		for (int p = 0; p < params.P; p++) {
			// Vertical window bounds for this output row (clamped to the image)
			int hStart = p * params.stride_h - params.pad_h;
			final int hEnd = Math.min(hStart + params.R, params.H);
			hStart = Math.max(hStart, 0);
			for (int q = 0; q < params.Q; q++) {
				int maxIndex = getMaxIndex(hStart, hEnd, q, inputOffset, inputArray, params);
				outputArray[maxIndex] += doutArray[outputOffset + p * params.Q + q];
			}
		}
	}
}
/**
 * Returns the flat input index of the maximum value within one pooling
 * window of a sparse input, scanning the non-zeros of row n.
 * Bug fix: the horizontal window end was previously derived from the
 * already-clamped start ({@code max(q*stride_w - pad_w, 0) + S}), which
 * widened the window whenever left padding pushed the start below zero —
 * inconsistent with the dense {@link #getMaxIndex} path. The end is now
 * computed from the unclamped start, exactly as in the dense version.
 *
 * NOTE(review): like the original, only stored non-zeros are inspected, so a
 * window of all-negative values falls back to the window's first index —
 * behavior inherited from the existing implementation.
 */
private static int getMaxIndexSparse(int start_index_h, int end_index_h,
		int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
	if(!input.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
	// input.isEmptyBlock() check is done by the caller
	Iterator<IJV> iter = input.sparseBlock.getIterator(n, n+1);
	int [] tensorIndexes = new int[3];
	int start_index_w = q * params.stride_w - params.pad_w;
	final int end_index_w = Math.min(start_index_w + params.S, params.W);
	start_index_w = Math.max(start_index_w, 0);
	// Default to the top-left corner of the window
	int maxIndex = inputOffset + start_index_h*params.W + start_index_w;
	double maxVal = -Double.MAX_VALUE;
	while(iter.hasNext()) {
		IJV ijv = iter.next();
		computeTensorIndexes(ijv.getJ(), tensorIndexes, params.H, params.W);
		if(c != tensorIndexes[0])
			continue; // entry belongs to a different channel
		int h = tensorIndexes[1];
		int w = tensorIndexes[2];
		if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
			double val = ijv.getV();
			if(maxVal < val) {
				maxIndex = inputOffset + h*params.W + w;
				maxVal = val;
			}
		}
	}
	return maxIndex;
}
/**
 * Returns the flat input index of the maximum value within one pooling
 * window of a dense input. Defaults to the window's top-left corner.
 */
private static int getMaxIndex(int start_index_h, int end_index_h,
		int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
	int wStart = q * params.stride_w - params.pad_w;
	final int wEnd = Math.min(wStart + params.S, params.W);
	wStart = Math.max(wStart, 0);
	int bestIndex = inputOffset + start_index_h*params.W + wStart;
	double bestVal = -Double.MAX_VALUE;
	for (int h = start_index_h; h < end_index_h; h++) {
		final int rowBase = inputOffset + h*params.W;
		for (int w = wStart; w < wEnd; w++) {
			final double val = inputArray[rowBase + w];
			if(val > bestVal) {
				bestIndex = rowBase + w;
				bestVal = val;
			}
		}
	}
	return bestIndex;
}
/**
 * Max-pooling forward pass over all images and channels; records the output
 * non-zero count on the result block.
 *
 * @param input N x (C*H*W) input images
 * @param outputBlock result block receiving N x (C*P*Q)
 * @param params pooling shape/stride/pad parameters
 * @throws DMLRuntimeException on shape mismatch
 */
public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.output = outputBlock;
	boolean badShape = input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N;
	if(badShape) {
		throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q);
	}
	params.outputNNZ.set(0);
	int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
	if(ALLOW_MULTI_THREADED_OPS && constrainedNumThreads > 1) {
		// One task unit per channel
		runConvTask(constrainedNumThreads, params.C, TaskType.MaxPooling_Forward, params);
	}
	else {
		warnSingleThreaded();
		for (int n = 0; n < params.N; n++)
			for (int c = 0; c < params.C; c++)
				doPooling(n, c, params);
	}
	outputBlock.setNonZeros(params.outputNNZ.get());
}
/**
 * Max-pooling forward for a single (image n, channel c) plane: each output
 * cell gets the maximum of its pooling window; the per-task non-zero count
 * is accumulated into params.outputNNZ.
 * Bug fix: the non-zero counter was previously incremented inside the window
 * scan whenever the running maximum was non-zero, overcounting each output
 * cell by up to R*S; it is now incremented once per output cell.
 */
private static void doPooling(int n, int c, ConvolutionParameters params) {
	double [] inputArray = null;
	if (!params.input1.isInSparseFormat())
		inputArray = params.input1.getDenseBlock();
	double [] outputArray = null;
	if (!params.output.isInSparseFormat())
		outputArray = params.output.getDenseBlock();
	long tmpNNZ = 0;
	for (int p = 0; p < params.P; p++) {
		for (int q = 0; q < params.Q; q++) {
			// Window bounds, clamped to the image
			int start_index_h = p * params.stride_h - params.pad_h;
			int start_index_w = q * params.stride_w - params.pad_w;
			int end_index_h = Math.min(start_index_h + params.R, params.H);
			int end_index_w = Math.min(start_index_w + params.S, params.W);
			start_index_h = Math.max(start_index_h, 0);
			start_index_w = Math.max(start_index_w, 0);
			int out_index = n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q;
			outputArray[out_index] = -Double.MAX_VALUE;
			for (int h = start_index_h; h < end_index_h; h++) {
				for (int w = start_index_w; w < end_index_w; w++) {
					double inVal;
					if(inputArray != null)
						inVal = inputArray[n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w];
					else
						inVal = params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w);
					outputArray[out_index] = Math.max(outputArray[out_index], inVal);
				}
			}
			// Count each non-zero output cell exactly once
			if(outputArray[out_index] != 0)
				tmpNNZ++;
		}
	}
	params.outputNNZ.addAndGet(tmpNNZ);
}
// Copies image inputN of 'input' (row layout K x P x Q) into outputArray at image
// slot outputN, reordered to (P*Q) x K layout: out[p,q,k] = in[k,p,q]. Output must
// be dense. For sparse input, optionally zeroes the destination first (needed when
// the buffer is reused across images) and then scatters only the non-zeros.
// NOTE(review): the name suggests a 180-degree rotation, but the visible code is a
// pure reshape/transpose of the dout row — confirm the naming against the backward
// convolution formulation in the callers.
private static void doRotate180(int inputN, int outputN, MatrixBlock input,
double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) throws DMLRuntimeException {
double [] inputArray = null;
if (!input.isInSparseFormat())
inputArray = input.getDenseBlock();
if(outputArray == null)
throw new DMLRuntimeException("Sparse output is not supported for rotate180");
int outputOffset = outputN*params.K*params.P*params.Q;
if(inputArray != null) {
for (int k = 0; k < params.K; k++) {
for (int p = 0; p < params.P; p++) {
for (int q = 0; q < params.Q; q++) {
outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q];
}
}
}
}
else {
if(zeroOutSparseOutput)
Arrays.fill(outputArray, 0);
if(!input.isEmptyBlock()) {
Iterator<IJV> iter = input.sparseBlock.getIterator(inputN, inputN+1);
int [] tensorIndexes = new int[3];
while(iter.hasNext()) {
IJV ijv = iter.next();
// Decompose the flat column index into (k, p, q)
computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
int k = tensorIndexes[0];
int p = tensorIndexes[1];
int q = tensorIndexes[2];
outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = ijv.getV();
}
}
}
}
/**
 * Chooses chunk sizes for a 2-D task space of maxNumTaskSize1 x maxNumTaskSize2
 * units so that roughly 3 tasks per thread are created, amortizing
 * thread-creation overhead for small tasks. The second dimension is grown
 * (by doubling) first, then the first.
 *
 * @return {taskSize1, taskSize2}
 */
private static int [] getTaskSize(int constrainedNumThreads, int maxNumTaskSize1, int maxNumTaskSize2) {
	final int approxNumTasksToCreate = 3 * constrainedNumThreads;
	int taskSize1 = 1;
	int taskSize2 = 1;
	// Grow the second dimension's chunk until the task count target is met
	// or the chunk covers the whole dimension (same int arithmetic as before).
	while((maxNumTaskSize1*maxNumTaskSize2)/(taskSize1*taskSize2) > approxNumTasksToCreate) {
		taskSize2 = Math.min(taskSize2 * 2, maxNumTaskSize2);
		if(taskSize2 == maxNumTaskSize2)
			break;
	}
	// Then grow the first dimension's chunk the same way.
	while((maxNumTaskSize1*maxNumTaskSize2)/(taskSize1*taskSize2) > approxNumTasksToCreate) {
		taskSize1 = Math.min(taskSize1 * 2, maxNumTaskSize1);
		if(taskSize1 == maxNumTaskSize1)
			break;
	}
	return new int[] { taskSize1, taskSize2 };
}
/**
 * Executes a single ConvTask covering the full [0,NSize) x [0,Z) range on the
 * calling thread, wrapping any failure in a DMLRuntimeException.
 */
private static void runSequentialConvTask(int NSize, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
	warnSingleThreaded();
	try {
		new ConvTask(0, NSize, 0, Z, type, params).call();
	} catch (Exception e) {
		throw new DMLRuntimeException("Error while executing single-threaded " + type.name(), e);
	}
}
/**
 * Dispatches a convolution task either in parallel (when the output block is
 * thread-safe and more than one thread is available) or sequentially.
 */
private static void runConvTask(int constrainedNumThreads, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
	final boolean parallel = params.isOutputThreadSafe() && constrainedNumThreads > 1;
	if (parallel)
		runParallelConvTask(constrainedNumThreads, params.N, Z, type, params);
	else
		runSequentialConvTask(params.N, Z, type, params);
}
/**
 * Partitions the [0,NSize) x [0,Z) task space into ConvTasks and executes them
 * on a fixed thread pool, merging per-task results for the backward-filter case.
 * Bug fixes: the pool is now shut down in a finally block (previously it leaked
 * worker threads if a task failed), and the interrupt status is restored when
 * an InterruptedException is translated to a DMLRuntimeException.
 *
 * @throws DMLRuntimeException wrapping any interruption or task failure
 */
private static void runParallelConvTask(int constrainedNumThreads, int NSize, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
	ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
	if(NSize >= constrainedNumThreads || Z == 1) {
		// Enough images (or a trivial second dimension): chunk over N only
		int numNTasks = (int) Math.ceil(((double) NSize) / constrainedNumThreads);
		for (int n = 0; n < NSize; n += numNTasks) {
			tasks.add(new ConvTask(n, Math.min(NSize, n+numNTasks), 0, Z, type, params));
		}
	}
	else {
		// Few images: chunk over both dimensions to keep all threads busy
		int [] taskSizes = getTaskSize(constrainedNumThreads, NSize, Z);
		for (int n = 0; n < NSize; n += taskSizes[0]) {
			for (int z = 0; z < Z; z += taskSizes[1]) {
				tasks.add(new ConvTask(n, Math.min(NSize, n+taskSizes[0]), z, Math.min(Z, z+taskSizes[1]), type, params));
			}
		}
		LOG.debug("Reduce number of tasks from " + (NSize*Z) + "(" + NSize + "," + Z + ") to " + tasks.size());
	}
	ExecutorService pool = Executors.newFixedThreadPool( Math.min(constrainedNumThreads, tasks.size()) );
	try {
		List<Future<Object>> taskret = pool.invokeAll(tasks);
		for( Future<Object> task : taskret ) {
			switch(type) {
				case LoopedIm2ColConv2dBwdFilter:
					// Each task returns a partial filter gradient to accumulate
					elementWiseInPlaceAddition(params.output, (MatrixBlock) task.get());
					break;
				default:
					task.get();
			}
		}
	} catch (InterruptedException e) {
		Thread.currentThread().interrupt(); // restore interrupt status
		throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
	} catch (ExecutionException e) {
		throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
	} finally {
		pool.shutdown(); // release worker threads even on failure
	}
}
private static class ConvTask implements Callable<Object> {
int n1; int n2; int z1; int z2;
ConvolutionParameters params;
TaskType type;
public ConvTask(int n1, int n2, int z1, int z2, TaskType type, ConvolutionParameters params) {
this.n1 = n1;
this.n2 = n2;
this.z1 = z1;
this.z2 = z2;
this.type = type;
this.params = params;
}
@Override
public Object call() throws DMLRuntimeException {
switch(type) {
case MaxPooling_Forward:
for (int n = n1; n < n2; n++) {
for (int z = z1; z < z2; z++) {
doPooling(n, z, params);
}
}
break;
case MaxPooling_Backward:
for (int n = n1; n < n2; n++) {
doPoolingBackward(n, params);
}
break;
case LoopedIm2ColConv2d:
MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
im2ColOutBlock.allocateDenseBlock(true);
for (int n = n1; n < n2; n++) {
doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
}
break;
case LoopedIm2ColConv2dBwdFilter:
{
MatrixBlock im2ColOutBlock1 = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
im2ColOutBlock1.allocateDenseBlock(true);
MatrixBlock partialRetBlock = new MatrixBlock(params.K, params.C*params.R*params.S, false);
partialRetBlock.allocateDenseBlock(true);
MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
dout_reshaped.allocateDenseBlock(true);
for (int n = n1; n < n2; n++) {
partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock1, dout_reshaped, partialRetBlock, params);
}
return partialRetBlock;
}
case LoopedIm2ColConv2dBwdData:
{
MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
dout_reshaped.allocateDenseBlock(true);
for (int n = n1; n < n2; n++) {
doLoopedIm2ColConv2dBwdData(n, dout_reshaped, params);
}
break;
}
default:
throw new DMLRuntimeException("Unsupported ConvTask:" + type.name());
}
return null;
}
}
// Converts input: PQ X CRS matrix and writes to 1 X CHW
private static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
throw new DMLRuntimeException("Incorrect input dimensions");
}
double [] outputArray = null;
if (!params.output.isInSparseFormat())
outputArray = params.output.getDenseBlock();
else {
throw new DMLRuntimeException("Only dense output is implemented");
}
if(!input.isInSparseFormat()) {
double [] inputArray = input.getDenseBlock();
doCol2IMDenseInput(0, outputN, inputArray, outputArray, params);
}
else {
if(!input.isEmptyBlock())
doCol2IMSparseInput(0, outputN, input.getSparseBlockIterator(), outputArray, params);
}
}
private static void doCol2IMSparseInput(int inputN, int outputN, Iterator<IJV> inputIter, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
int [] tensorIndexes = new int[3];
while(inputIter.hasNext()) {
IJV ijv = inputIter.next();
computeTensorIndexes(ijv.getJ(), tensorIndexes, params.R, params.S);
int c = tensorIndexes[0];
int r = tensorIndexes[1];
int s = tensorIndexes[2];
computeTensorIndexes(ijv.getI(), tensorIndexes, params.P, params.Q);
int p = tensorIndexes[1];
int q = tensorIndexes[2];
if(inputN != tensorIndexes[0]) {
throw new DMLRuntimeException("Incorrect tensor indexes: " + inputN + " != " + tensorIndexes[0] + " <" + p + " " + q + " " + ijv.getI() + params.P + " " + params.Q + ">");
}
int h = p*params.stride_h + r - params.pad_h;
int w = q*params.stride_w + s - params.pad_w;
if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
int outIndex = outputN*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w;
outputArray[outIndex] += ijv.getV();
}
}
}
	// Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0
	// Or converts input: NPQ X CRS matrix and writes to N X CHW
	// col2im accumulation over a dense input: for every patch position (p, q)
	// and channel c, adds the R x S patch values back into the image at
	// (hOffset + r, wOffset + s). The r/s ranges are pre-clamped so that only
	// in-image positions (never padding) are touched, which removes per-element
	// bounds checks from the inner loops. Overlapping patches accumulate via +=.
	private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
		// base offset of image outputN in the N x CHW output
		final int outputNOffset = outputN*params.C*params.H*params.W;
		for (int p = 0; p < params.P; p++) {
			// h = p*params.stride_h + r - params.pad_h
			// = r + hOffset
			// Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get
			// max(0, - hOffset) <= r < min(params.R, params.H - hOffset)
			final int hOffset = p*params.stride_h - params.pad_h;
			final int rStart = Math.max(0, - hOffset);
			final int rEnd = Math.min(params.R, params.H - hOffset);
			for (int q = 0; q < params.Q; q++) {
				// Using the same logic as above on following:
				// w = q*params.stride_w + s - params.pad_w
				final int wOffset = q*params.stride_w - params.pad_w;
				final int sStart = Math.max(0, - wOffset);
				final int sEnd = Math.min(params.S, params.W - wOffset);
				// offset of input row (inputN, p, q) in the (N)PQ x CRS layout
				final int tempOffset = (inputN*params.P*params.Q + p*params.Q + q)*params.C*params.R*params.S;
				for (int c = 0; c < params.C; c++) {
					// start of channel c in the output image and in the input row
					final int outOffset = outputNOffset + c*params.H*params.W;
					final int inputOffset = tempOffset + c*params.R*params.S;
					for (int r = rStart; r < rEnd; r++) {
						for (int s = sStart; s < sEnd; s++) {
							int inputIndex = inputOffset + r*params.S + s;
							int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s;
							outputArray[outIndex] += inputArray[inputIndex];
						}
					}
				}
			}
		}
	}
private static long doIm2colOverInputPath_NCHW(int n, int c, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException {
double [] inputArray = null;
if (!params.input1.isInSparseFormat())
inputArray = params.input1.getDenseBlock();
double [] outputArray = null;
if(output == null && !params.output.isInSparseFormat())
outputArray = params.output.getDenseBlock();
else if(output != null && !output.isInSparseFormat())
outputArray = output.getDenseBlock();
else {
throw new DMLRuntimeException("Sparse output is not supported for im2col");
}
final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
int outputOffset;
if(output == null)
outputOffset = (c*params.R*params.S*params.N + n)*params.P*params.Q;
else
outputOffset = (c*params.R*params.S)*params.P*params.Q;
long tmpNNZ = 0;
for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S
for (int s = 0; s < params.S; s++) {
int localIndex;
if(output == null)
localIndex = outputOffset + ((r*params.S*params.N + s*params.N)*params.P*params.Q);
else
localIndex = outputOffset + ((r*params.S + s)*params.P*params.Q);
int input_row = r - params.pad_h;
// And copy it to outputArray[i] (taking care of padding & striding)
for (int p = params.P; p > 0; p--) {
if (input_row >= 0 && input_row < params.H) {
int input_col = s - params.pad_w;
for (int q = params.Q; q > 0; q--, localIndex++) {
if (input_col >= 0 && input_col < params.W) {
// Copy from [channel c, height input_row, width input_col]
if(inputArray != null)
outputArray[localIndex] = inputArray[inputOffset + input_row*params.W + input_col];
else
outputArray[localIndex] = params.input1.quickGetValue(n, c*params.H*params.W + input_row*params.W + input_col);
if(outputArray[localIndex] != 0)
tmpNNZ++;
}
else if(params.reuseNonZeroedOutput) {
outputArray[localIndex] = 0;
}
input_col += params.stride_w;
}
} else {
if(params.reuseNonZeroedOutput) {
for(int i = localIndex; i < localIndex + params.Q; i++) {
outputArray[localIndex] = 0;
}
}
localIndex += params.Q;
}
input_row += params.stride_h;
}
}
}
return tmpNNZ;
}
}