src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.runtime.matrix.data;

 import jcuda.Pointer;
 import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
 import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;

 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;

 /**
  * This class is a wrapper that contain necessary data structures to invoke
  * a cudnn convolution* functions (such as cudnnConvolutionForward, etc)
  *
  * It implements autocloseable to simplify the LibMatrixCuDNN code and also avoids potential memory leaks.
  *
  * The caller has to use the factory methods: cudnnGetConvolutionForwardAlgorithm,
  * cudnnGetConvolutionBackwardFilterAlgorithm and cudnnGetConvolutionBackwardDataAlgorithm
  * to get the LibMatrixCuDNNConvolutionAlgorithm object.
  * The naming of this methods is consistent with that of CuDNN library.
  *
  */
 public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseable {
 	// Limit the workspace available to cudnn convolution operation to 1 GB
 	static long MAX_WORKSPACE_LIMIT_BYTES = (long) 1e+9;

 	public int algo = -1;
 	public Pointer workSpace = new Pointer();
 	public long sizeInBytes = 0;
 	cudnnTensorDescriptor nchwTensorDesc = null;
 	cudnnTensorDescriptor nkpqTensorDesc = null;
 	cudnnFilterDescriptor filterDesc = null;
 	cudnnConvolutionDescriptor convDesc = null;
 	GPUContext gCtx = null; String instName = null;

 	private LibMatrixCuDNNConvolutionAlgorithm(GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) {
 		int padding[] = {pad_h, pad_w};
 		int strides[] = {stride_h, stride_w};
 		convDesc = allocateConvolutionDescriptor(padding, strides);
 		this.gCtx = gCtx;
 		this.instName = instName;
 		nchwTensorDesc = allocateTensorDescriptor(N, C, H, W);
 		nkpqTensorDesc = allocateTensorDescriptor(N, K, P, Q);
 		filterDesc = allocateFilterDescriptor(K, C, R, S);
 	}

 	/**
 	 * Deallocates the tensor and filter descriptors as well as allocated workspace
 	 */
 	@Override
 	public void close() {
 		if(nchwTensorDesc != null)
 			cudnnDestroyTensorDescriptor(nchwTensorDesc);
 		if(nkpqTensorDesc != null)
 			cudnnDestroyTensorDescriptor(nkpqTensorDesc);
 		if(filterDesc != null)
 			cudnnDestroyFilterDescriptor(filterDesc);
 		if(convDesc != null)
 			cudnnDestroyConvolutionDescriptor(convDesc);
 		if(sizeInBytes != 0) {
 			try {
 				gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
 			} catch (DMLRuntimeException e) {
 				throw new RuntimeException(e);
 			}
 		}
 	}

 	/**
 	 * Factory method to get the algorithm wrapper for convolution forward
 	 *
 	 * @param gCtx     a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
 	 * @param N        number of input images
 	 * @param C        number of channels
 	 * @param H        height of each image
 	 * @param W        width of each image
 	 * @param K        number of output "channels"
 	 * @param R        height of filter
 	 * @param S        width of filter
 	 * @param pad_h    padding height
 	 * @param pad_w    padding width
 	 * @param stride_h stride height
 	 * @param stride_w string width
 	 * @param P        output height
 	 * @param Q        output width
 	 * @param workspaceLimit maximum intermediate memory to use
 	 * @return algorithm wrapper
 	 */
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		int[] algos = {-1};
 		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx),
 				ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
 				cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 				ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
 		if (sizeInBytesArray[0] != 0)
 			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
 		ret.sizeInBytes = sizeInBytesArray[0];
 		ret.algo = algos[0];
 		return ret;
 	}

 	/**
 	 * Factory method to get the algorithm wrapper for convolution backward filter
 	 *
 	 * @param gCtx     a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
 	 * @param N        number of input images
 	 * @param C        number of channels
 	 * @param H        height of each image
 	 * @param W        width of each image
 	 * @param K        number of output "channels"
 	 * @param R        height of filter
 	 * @param S        width of filter
 	 * @param pad_h    padding height
 	 * @param pad_w    padding width
 	 * @param stride_h stride height
 	 * @param stride_w string width
 	 * @param P        output height
 	 * @param Q        output width
 	 * @param workspaceLimit maximum intermediate memory to use
 	 * @return algorithm wrapper
 	 */
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
 				pad_h, pad_w, stride_h, stride_w, P, Q);

 		int[] algos = {-1};
 		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
 				LibMatrixCuDNN.getCudnnHandle(gCtx),
 				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc,
 				cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
 		if (sizeInBytesArray[0] != 0)
 			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
 		ret.sizeInBytes = sizeInBytesArray[0];
 		ret.algo = algos[0];

 		return ret;
 	}

 	/**
 	 * Factory method to get the algorithm wrapper for convolution backward data
 	 *
 	 * @param gCtx     a valid {@link GPUContext}
 	 * @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
 	 * @param N        number of input images
 	 * @param C        number of channels
 	 * @param H        height of each image
 	 * @param W        width of each image
 	 * @param K        number of output "channels"
 	 * @param R        height of filter
 	 * @param S        width of filter
 	 * @param pad_h    padding height
 	 * @param pad_w    padding width
 	 * @param stride_h stride height
 	 * @param stride_w string width
 	 * @param P        output height
 	 * @param Q        output width
 	 * @param workspaceLimit maximum intermediate memory to use
 	 * @return algorithm wrapper
 	 */
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardDataAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		if(H == R || W == S) {
 			// CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario
 			// for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1).
 			// This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
 			// To keep things simple for now, we will always prefer to use memory-less operator for conv1d: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
 			ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
 		}
 		else {
 			int[] algos = {-1};
 			long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
 			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
 					LibMatrixCuDNN.getCudnnHandle(gCtx),
 					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
 					jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
 			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
 			if (sizeInBytesArray[0] != 0)
 				ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
 			ret.sizeInBytes = sizeInBytesArray[0];
 			ret.algo = algos[0];
 		}
 		return ret;
 	}

 	/**
 	 * Convenience method to get tensor descriptor
 	 * @param N number of images
 	 * @param C number of channels
 	 * @param H height
 	 * @param W width
 	 * @return cudnn tensor descriptor
 	 */
 	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
 		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 		cudnnCreateTensorDescriptor(tensorDescriptor);
 		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
 		return tensorDescriptor;
 	}

 	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
 		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
 		cudnnCreateFilterDescriptor(filterDesc);
 		cudnnSetFilter4dDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, K, C, R, S);
 		return filterDesc;
 	}

 	private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
 		cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
 		cudnnCreateConvolutionDescriptor(convDesc);
 		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
 		return convDesc;
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.sysds.runtime.matrix.data;

	import jcuda.Pointer;
	import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
	import jcuda.jcudnn.cudnnConvolutionDescriptor;
	import jcuda.jcudnn.cudnnConvolutionFwdPreference;
	import jcuda.jcudnn.cudnnFilterDescriptor;
	import jcuda.jcudnn.cudnnTensorDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
	import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
	import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
	import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;

	import org.apache.sysds.api.DMLScript;
	import org.apache.sysds.runtime.DMLRuntimeException;
	import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;

	/**
	* This class is a wrapper that contain necessary data structures to invoke
	* a cudnn convolution* functions (such as cudnnConvolutionForward, etc)
	*
	* It implements autocloseable to simplify the LibMatrixCuDNN code and also avoids potential memory leaks.
	*
	* The caller has to use the factory methods: cudnnGetConvolutionForwardAlgorithm,
	* cudnnGetConvolutionBackwardFilterAlgorithm and cudnnGetConvolutionBackwardDataAlgorithm
	* to get the LibMatrixCuDNNConvolutionAlgorithm object.
	* The naming of this methods is consistent with that of CuDNN library.
	*
	*/
	public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseable {
	// Limit the workspace available to cudnn convolution operation to 1 GB
	static long MAX_WORKSPACE_LIMIT_BYTES = (long) 1e+9;

	public int algo = -1;
	public Pointer workSpace = new Pointer();
	public long sizeInBytes = 0;
	cudnnTensorDescriptor nchwTensorDesc = null;
	cudnnTensorDescriptor nkpqTensorDesc = null;
	cudnnFilterDescriptor filterDesc = null;
	cudnnConvolutionDescriptor convDesc = null;
	GPUContext gCtx = null; String instName = null;

	private LibMatrixCuDNNConvolutionAlgorithm(GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
	int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) {
	int padding[] = {pad_h, pad_w};
	int strides[] = {stride_h, stride_w};
	convDesc = allocateConvolutionDescriptor(padding, strides);
	this.gCtx = gCtx;
	this.instName = instName;
	nchwTensorDesc = allocateTensorDescriptor(N, C, H, W);
	nkpqTensorDesc = allocateTensorDescriptor(N, K, P, Q);
	filterDesc = allocateFilterDescriptor(K, C, R, S);
	}

	/**
	* Deallocates the tensor and filter descriptors as well as allocated workspace
	*/
	@Override
	public void close() {
	if(nchwTensorDesc != null)
	cudnnDestroyTensorDescriptor(nchwTensorDesc);
	if(nkpqTensorDesc != null)
	cudnnDestroyTensorDescriptor(nkpqTensorDesc);
	if(filterDesc != null)
	cudnnDestroyFilterDescriptor(filterDesc);
	if(convDesc != null)
	cudnnDestroyConvolutionDescriptor(convDesc);
	if(sizeInBytes != 0) {
	try {
	gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
	} catch (DMLRuntimeException e) {
	throw new RuntimeException(e);
	}
	}
	}

	/**
	* Factory method to get the algorithm wrapper for convolution forward
	*
	* @param gCtx a valid {@link GPUContext}
	* @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
	* @param N number of input images
	* @param C number of channels
	* @param H height of each image
	* @param W width of each image
	* @param K number of output "channels"
	* @param R height of filter
	* @param S width of filter
	* @param pad_h padding height
	* @param pad_w padding width
	* @param stride_h stride height
	* @param stride_w string width
	* @param P output height
	* @param Q output width
	* @param workspaceLimit maximum intermediate memory to use
	* @return algorithm wrapper
	*/
	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgorithm(
	GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
	int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
	LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
	pad_h, pad_w, stride_h, stride_w, P, Q);
	int[] algos = {-1};
	long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
	cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
	if (sizeInBytesArray[0] != 0)
	ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
	ret.sizeInBytes = sizeInBytesArray[0];
	ret.algo = algos[0];
	return ret;
	}

	/**
	* Factory method to get the algorithm wrapper for convolution backward filter
	*
	* @param gCtx a valid {@link GPUContext}
	* @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
	* @param N number of input images
	* @param C number of channels
	* @param H height of each image
	* @param W width of each image
	* @param K number of output "channels"
	* @param R height of filter
	* @param S width of filter
	* @param pad_h padding height
	* @param pad_w padding width
	* @param stride_h stride height
	* @param stride_w string width
	* @param P output height
	* @param Q output width
	* @param workspaceLimit maximum intermediate memory to use
	* @return algorithm wrapper
	*/
	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm(
	GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
	int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
	LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
	pad_h, pad_w, stride_h, stride_w, P, Q);

	int[] algos = {-1};
	long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
	LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc,
	cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
	if (sizeInBytesArray[0] != 0)
	ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
	ret.sizeInBytes = sizeInBytesArray[0];
	ret.algo = algos[0];

	return ret;
	}

	/**
	* Factory method to get the algorithm wrapper for convolution backward data
	*
	* @param gCtx a valid {@link GPUContext}
	* @param instName the invoking instruction's name for record {@link org.apache.sysds.utils.Statistics}.
	* @param N number of input images
	* @param C number of channels
	* @param H height of each image
	* @param W width of each image
	* @param K number of output "channels"
	* @param R height of filter
	* @param S width of filter
	* @param pad_h padding height
	* @param pad_w padding width
	* @param stride_h stride height
	* @param stride_w string width
	* @param P output height
	* @param Q output width
	* @param workspaceLimit maximum intermediate memory to use
	* @return algorithm wrapper
	*/
	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardDataAlgorithm(
	GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S,
	int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
	LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
	pad_h, pad_w, stride_h, stride_w, P, Q);
	if(H == R \|\| W == S) {
	// CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario
	// for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1).
	// This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
	// To keep things simple for now, we will always prefer to use memory-less operator for conv1d: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
	ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
	}
	else {
	int[] algos = {-1};
	long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
	LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
	jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
	jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
	ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
	if (sizeInBytesArray[0] != 0)
	ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
	ret.sizeInBytes = sizeInBytesArray[0];
	ret.algo = algos[0];
	}
	return ret;
	}

	/**
	* Convenience method to get tensor descriptor
	* @param N number of images
	* @param C number of channels
	* @param H height
	* @param W width
	* @return cudnn tensor descriptor
	*/
	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
	cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
	cudnnCreateTensorDescriptor(tensorDescriptor);
	cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
	return tensorDescriptor;
	}

	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
	cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
	cudnnCreateFilterDescriptor(filterDesc);
	cudnnSetFilter4dDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, K, C, R, S);
	return filterDesc;
	}

	private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
	cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
	cudnnCreateConvolutionDescriptor(convDesc);
	cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
	return convDesc;
	}
	}