blob: dae2f82b9143252319e231045008ede56644a9b1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.hops;
import java.util.ArrayList;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.Hop.MultiThreadedHop;
import org.apache.sysml.lops.ConvolutionTransform;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.LopsException;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters;
/**
 * Hop for deep-learning convolution and pooling operations: conv2d,
 * conv2d backward data/filter, max pooling, and max pooling backward.
 * <p>
 * Input layout: input 0 is the image (respectively dout) matrix; for the
 * two-data-input ops (conv2d and the backward variants) input 1 is the
 * second data matrix (filter or dout). The remaining inputs are scalar
 * shape parameters in this fixed order:
 * stride1, stride2, padding1, padding2,
 * input_shape1..4 (N, C, H, W), filter_shape1..4 (K, C, R, S).
 * <p>
 * Currently only CP and GPU lops are constructed; SPARK/MR are rejected
 * (see constructLops).
 */
public class ConvolutionOp extends Hop implements MultiThreadedHop
{
	/** The concrete convolution/pooling operation this hop represents. */
	private Hop.ConvOp op;

	private int _maxNumThreads = -1; //-1 for unlimited

	private ConvolutionOp() {
		//default constructor for clone
	}

	/**
	 * Creates a convolution hop with a single input and wires up the
	 * parent/child links; also eagerly refreshes the output dimensions.
	 *
	 * @param l hop name
	 * @param dt output data type
	 * @param vt output value type
	 * @param o convolution operation type
	 * @param inp single input hop
	 */
	public ConvolutionOp(String l, DataType dt, ValueType vt, ConvOp o, Hop inp)
	{
		super(l, dt, vt);
		op = o;
		getInput().add(0, inp);
		inp.getParent().add(this);

		//compute unknown dims and nnz
		refreshSizeInformation();
	}

	/**
	 * Creates a convolution hop with the full ordered input list (data
	 * input(s) followed by the stride/padding/shape scalars; see class doc).
	 *
	 * @param l hop name
	 * @param dt output data type
	 * @param vt output value type
	 * @param o convolution operation type
	 * @param inp ordered list of input hops
	 */
	public ConvolutionOp(String l, DataType dt, ValueType vt, ConvOp o, ArrayList<Hop> inp)
	{
		super(l, dt, vt);
		op = o;
		for( int i=0; i<inp.size(); i++ ) {
			Hop in = inp.get(i);
			getInput().add(i, in);
			in.getParent().add(this);
		}

		//compute unknown dims and nnz
		refreshSizeInformation();
	}

	public ConvOp getOp() {
		return op;
	}

	@Override
	public String getOpString() {
		return "" + HopsConv2Lops.get(op);
	}

	@Override
	public Lop constructLops()
		throws HopsException, LopsException
	{
		//return already created lops
		if( getLops() != null )
			return getLops();

		ExecType et = optFindExecType();
		ArrayList<Hop> inputs = getInput();
		switch( op )
		{
			case MAX_POOLING:
			case MAX_POOLING_BACKWARD:
			case DIRECT_CONV2D:
			case DIRECT_CONV2D_BACKWARD_DATA:
			case DIRECT_CONV2D_BACKWARD_FILTER:
			{
				//TODO: Fix me. Currently forcing the instruction to GPU if gpu flag is set
				if(DMLScript.USE_ACCELERATOR) {
					et = ExecType.GPU;
					setLops(constructConvolutionLops(et, inputs));
					break;
				}
				else if(et == ExecType.CP) {
					setLops(constructConvolutionLops(et, inputs));
					break;
				}
				else {
					// TODO: Add support for SPARK/MR backends once we are happy with the performance of
					// single node Lenet script.
					throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name());
				}
			}
			default:
				throw new HopsException("Unsupported lops construction for operation type '"+op+"'.");
		}

		//add reblock/checkpoint lops if necessary
		constructAndSetLopsDataFlowProperties();

		return getLops();
	}

	public void setOp(ConvOp op) {
		this.op = op;
	}

	/**
	 * Expected number of inputs for the given operation: 14 for the ops
	 * that carry two data inputs (conv2d, its backward variants, and
	 * max pooling backward), 13 otherwise (1 data input + 12 shape scalars).
	 * Centralized so lop construction and parseInput stay consistent.
	 */
	private static int getExpectedNumInputs(ConvOp op) {
		return (op == ConvOp.MAX_POOLING_BACKWARD
				|| op == ConvOp.DIRECT_CONV2D
				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) ? 14 : 13;
	}

	/**
	 * Constructs a ConvolutionTransform lop for a fused operation, taking
	 * dimensions and metadata from the given primary hop instead of 'this'
	 * (static because the fused lop does not correspond to a single hop).
	 *
	 * @param et execution type of the lop
	 * @param inputs ordered inputs (data input(s) + shape scalars)
	 * @param op convolution operation type
	 * @param primaryOp hop providing data/value type, block sizes and line numbers
	 * @param rlen number of rows of the output
	 * @param clen number of columns of the output
	 * @return the constructed lop
	 * @throws HopsException if the number of inputs does not match the op
	 * @throws LopsException if lop construction fails
	 */
	public static Lop constructFusedConvolutionLops(ExecType et,
			ArrayList<Hop> inputs,
			ConvOp op, ConvolutionOp primaryOp,
			long rlen, long clen) throws HopsException, LopsException {
		if(inputs.size() != getExpectedNumInputs(op)) {
			throw new HopsException("Incorrect number of inputs for " + op.name());
		}

		Lop in = inputs.get(0).constructLops();
		//multi-threading only applies to CP; a single thread otherwise
		int numThreads = et == ExecType.CP ? OptimizerUtils.getConstrainedNumThreads(primaryOp.getMaxNumThreads()) : 1;
		ConvolutionTransform transform1 = new ConvolutionTransform( in,
				HopsConv2Lops.get(op), primaryOp.getDataType(), primaryOp.getValueType(), et, numThreads);

		//set output dimensions from the caller-provided rlen/clen
		transform1.getOutputParameters().setDimensions(
				rlen, clen, primaryOp.getRowsInBlock(), primaryOp.getColsInBlock(), -1, primaryOp.getUpdateType());

		//set line numbers from the primary hop
		transform1.setAllPositions(primaryOp.getBeginLine(), primaryOp.getBeginColumn(), primaryOp.getEndLine(), primaryOp.getEndColumn());
		in.addOutput(transform1);

		// stride1, stride2, padding1, padding2
		// input_shape1, input_shape2, input_shape3, input_shape4,
		// filter_shape1, filter_shape2, filter_shape3, filter_shape4
		for( int i=1; i < inputs.size(); i++ )
		{
			Lop ltmp = inputs.get(i).constructLops();
			transform1.addInput(ltmp);
			ltmp.addOutput(transform1);
		}
		transform1.setLevel(); //force order of added lops
		return transform1;
	}

	/**
	 * Constructs the ConvolutionTransform lop for this hop.
	 *
	 * @param et execution type of the lop
	 * @param inputs ordered inputs (data input(s) + shape scalars)
	 * @return the constructed lop
	 * @throws HopsException if the number of inputs does not match the op
	 * @throws LopsException if lop construction fails
	 */
	public Lop constructConvolutionLops(ExecType et, ArrayList<Hop> inputs) throws HopsException, LopsException {
		if(inputs.size() != getExpectedNumInputs(op)) {
			throw new HopsException("Incorrect number of inputs for " + op.name());
		}

		Lop in = inputs.get(0).constructLops();
		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
		ConvolutionTransform transform1 = new ConvolutionTransform( in,
				HopsConv2Lops.get(op), getDataType(), getValueType(), et, k);
		setOutputDimensions(transform1);
		setLineNumbers(transform1);
		in.addOutput(transform1);

		// stride1, stride2, padding1, padding2
		// input_shape1, input_shape2, input_shape3, input_shape4,
		// filter_shape1, filter_shape2, filter_shape3, filter_shape4
		for( int i=1; i < inputs.size(); i++ )
		{
			Lop ltmp = inputs.get(i).constructLops();
			transform1.addInput(ltmp);
			ltmp.addOutput(transform1);
		}
		transform1.setLevel(); //force order of added lops
		return transform1;
	}

	@Override
	protected double computeOutputMemEstimate( long dim1, long dim2, long nnz )
	{
		//conservatively assume a fully dense output
		double sparsity = 1.0;
		return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity);
	}

	@Override
	protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz )
	{
		//default: no intermediate memory requirements
		return 0;
	}

	@Override
	protected long[] inferOutputCharacteristics( MemoTable memo )
	{
		// [numRows, numCols, NNZ]
		long[] ret = null;

		ConvolutionParameters params;
		try {
			params = parseInput();
		} catch (DMLRuntimeException e) {
			throw new RuntimeException(e);
		}

		switch(op)
		{
			case MAX_POOLING:
			{
				// pooling output: N x (C*P*Q)
				ret = new long[3];
				ret[0] = getInput().get(0)._dim1;
				ret[1] = getExtractedVal(params.C, params.P, params.Q);
				ret[2] = -1;
				break;
			}
			case MAX_POOLING_BACKWARD:
			{
				// gradient has the shape of the pooling input
				ret = new long[3];
				ret[0] = getInput().get(0)._dim1;
				ret[1] = getInput().get(0)._dim2;
				ret[2] = -1;
				break;
			}
			case DIRECT_CONV2D:
			{
				// conv output: N x (K*P*Q), with K taken from the filter input
				ret = new long[3];
				ret[0] = getInput().get(0)._dim1;
				ret[1] = getExtractedVal(getInput().get(1)._dim1, params.P, params.Q);
				ret[2] = -1;
				break;
			}
			case DIRECT_CONV2D_BACKWARD_FILTER:
			{
				// filter gradient has the shape of the filter (input 1)
				ret = new long[3];
				ret[0] = getInput().get(1)._dim1;
				ret[1] = getInput().get(1)._dim2;
				ret[2] = -1;
				break;
			}
			case DIRECT_CONV2D_BACKWARD_DATA:
			{
				// data gradient has the shape of the image (input 0)
				ret = new long[3];
				ret[0] = getInput().get(0)._dim1;
				ret[1] = getInput().get(0)._dim2;
				ret[2] = -1;
				break;
			}
			default:
				throw new RuntimeException("Unsupported op:" + op.name());
		}

		if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) {
			// note: filter_dim is R x S (filter height/width), not H x W
			LOG.debug("Unknown dimensions for ConvolutionOp in inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] +
					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
					" filter_dim=[" + params.K + " " + params.C + " " + params.R + " " + params.S + "]" +
					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
					" pad=[" + params.pad_h + " " + params.pad_w + "]");
		}

		return ret;
	}

	@Override
	public boolean allowsAllExecTypes()
	{
		return true;
	}

	@Override
	protected ExecType optFindExecType() throws HopsException {
		checkAndSetForcedPlatform();

		//TODO: Remove this once memEstimate is fixed for these instructions
		if((op == ConvOp.MAX_POOLING || op == ConvOp.MAX_POOLING_BACKWARD) && DMLScript.USE_ACCELERATOR) {
			return ExecType.GPU;
		}

		ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

		if( _etypeForced != null )
		{
			_etype = _etypeForced;
		}
		else
		{
			// TODO: After adding Spark backend, uncomment this
			if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
				_etype = findExecTypeByMemEstimate();
			}
			else
			{
				_etype = REMOTE;
			}

			//check for valid CP dimensions and matrix size
			checkAndSetInvalidCPDimsAndSize();
		}

		//mark for recompile (forever)
		if( ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype==REMOTE )
			setRequiresRecompile();

		//NOTE: intentionally overrides the decision above — only CP lops are
		//implemented so far (the GPU case returned early); see constructLops
		_etype = ExecType.CP;

		return _etype;
	}

	// stride1, stride2, padding1, padding2
	// input_shape1, input_shape2, input_shape3, input_shape4,
	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
	/**
	 * Extracts the scalar shape inputs into a ConvolutionParameters object
	 * (N, C, H, W, K, R, S, strides, pads). For 14-input ops all scalar
	 * indices are shifted by one because input 1 is a second data matrix;
	 * filter_shape2 (=C) is skipped as it duplicates input_shape2.
	 *
	 * @return the parsed convolution parameters
	 * @throws DMLRuntimeException if a size expression cannot be evaluated
	 */
	ConvolutionParameters parseInput() throws DMLRuntimeException {
		ConvolutionParameters params = null;
		if( getExpectedNumInputs(op) == 14 ) {
			params = new ConvolutionParameters(
					computeSizeInformation(getInput().get(6)),  // N
					computeSizeInformation(getInput().get(7)),  // C
					computeSizeInformation(getInput().get(8)),  // H
					computeSizeInformation(getInput().get(9)),  // W
					computeSizeInformation(getInput().get(10)), // K
					computeSizeInformation(getInput().get(12)), // R
					computeSizeInformation(getInput().get(13)), // S
					computeSizeInformation(getInput().get(2)),  // stride_h
					computeSizeInformation(getInput().get(3)),  // stride_w
					computeSizeInformation(getInput().get(4)),  // pad_h
					computeSizeInformation(getInput().get(5)), _maxNumThreads);
		}
		else {
			params = new ConvolutionParameters(
					computeSizeInformation(getInput().get(5)),  // N
					computeSizeInformation(getInput().get(6)),  // C
					computeSizeInformation(getInput().get(7)),  // H
					computeSizeInformation(getInput().get(8)),  // W
					computeSizeInformation(getInput().get(9)),  // K
					computeSizeInformation(getInput().get(11)), // R
					computeSizeInformation(getInput().get(12)), // S
					computeSizeInformation(getInput().get(1)),  // stride_h
					computeSizeInformation(getInput().get(2)),  // stride_w
					computeSizeInformation(getInput().get(3)),  // pad_h
					computeSizeInformation(getInput().get(4)), _maxNumThreads);
		}
		return params;
	}

	/** Product of two sizes, propagating -1 (unknown) if either is unknown. */
	long getExtractedVal(long val1, long val2) {
		if(val1 == -1 || val2 == -1) {
			return -1;
		}
		return val1*val2;
	}

	/** Product of three sizes, propagating -1 (unknown) if any is unknown. */
	public static long getExtractedVal(long val1, long val2, long val3) {
		if(val1 == -1 || val2 == -1 || val3 == -1) {
			return -1;
		}
		return val1*val2*val3;
	}

	@Override
	public void refreshSizeInformation()
	{
		ConvolutionParameters params;
		try {
			params = parseInput();
		} catch (DMLRuntimeException e) {
			throw new RuntimeException(e);
		}

		switch(op)
		{
			case MAX_POOLING:
			{
				// pooling output: N x (C*P*Q)
				_dim1 = getInput().get(0)._dim1;
				_dim2 = getExtractedVal(params.C, params.P, params.Q);
				_nnz = -1; // cannot infer stats
				break;
			}
			case MAX_POOLING_BACKWARD:
			{
				// gradient has the shape of the pooling input
				_dim1 = getInput().get(0)._dim1;
				_dim2 = getInput().get(0)._dim2;
				_nnz = -1;
				break;
			}
			case DIRECT_CONV2D:
			{
				// conv output: N x (K*P*Q), with K taken from the filter input
				_dim1 = getInput().get(0)._dim1;
				_dim2 = getExtractedVal(getInput().get(1)._dim1, params.P, params.Q);
				_nnz = -1; // cannot infer stats
				break;
			}
			case DIRECT_CONV2D_BACKWARD_DATA:
			{
				// data gradient has the shape of the image (input 0)
				_dim1 = getInput().get(0)._dim1;
				_dim2 = getInput().get(0)._dim2;
				_nnz = -1; // cannot infer stats
				break;
			}
			case DIRECT_CONV2D_BACKWARD_FILTER:
			{
				// filter gradient has the shape of the filter (input 1)
				_dim1 = getInput().get(1)._dim1;
				_dim2 = getInput().get(1)._dim2;
				_nnz = -1; // cannot infer stats
				break;
			}
			default:
				throw new RuntimeException("The sizes are not refreshed for " + op.name());
		}

		if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) {
			// note: filter_dim is R x S (filter height/width), not H x W
			LOG.debug("Unknown dimensions for ConvolutionOp in refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 +
					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
					" filter_dim=[" + params.K + " " + params.C + " " + params.R + " " + params.S + "]" +
					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
					" pad=[" + params.pad_h + " " + params.pad_w + "]");
		}
	}

	@Override
	public Object clone() throws CloneNotSupportedException
	{
		ConvolutionOp ret = new ConvolutionOp();

		//copy generic attributes
		ret.clone(this, false);

		//copy specific attributes
		ret.op = op;
		ret._maxNumThreads = _maxNumThreads;

		return ret;
	}

	@Override
	public boolean compare( Hop that )
	{
		if( !(that instanceof ConvolutionOp) )
			return false;

		ConvolutionOp that2 = (ConvolutionOp)that;

		boolean ret = (op == that2.op)
				&& (getInput().size()==that.getInput().size())
				&& _maxNumThreads == that2._maxNumThreads;

		//compare all childs
		if( ret ) //sizes matched
			for( int i=0; i<_input.size(); i++ )
				ret &= getInput().get(i) == that2.getInput().get(i);

		return ret;
	}

	@Override
	public void printMe() throws HopsException
	{
		if (LOG.isDebugEnabled()){
			if (getVisited() != VisitStatus.DONE) {
				super.printMe();
				LOG.debug("  Operation: " + op);
				for (Hop h : getInput()) {
					h.printMe();
				}
			}
			setVisited(VisitStatus.DONE);
		}
	}

	@Override
	public void setMaxNumThreads( int k ) {
		_maxNumThreads = k;
	}

	@Override
	public int getMaxNumThreads() {
		return _maxNumThreads;
	}
}