/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file src/relay/qnn/op/dense.cc
* \brief Property def of qnn dense operator.
*/
#include <tvm/relay/base.h>
#include <tvm/relay/op.h>
#include <tvm/relay/op_attr_types.h>
#include <tvm/relay/qnn/attrs.h>
#include "../../op/nn/nn.h"
#include "../../transforms/pattern_utils.h"
#include "../utils.h"
namespace tvm {
namespace relay {
namespace qnn {
// relay.op.qnn.dense
bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
const TypeReporter& reporter) {
// Expected Types: data, weight, input_zero_point, weight_zero_point, input_scale, weight_scale,
// out_type
ICHECK_EQ(types.size(), 7);
const auto* data = types[0].as<TensorTypeNode>();
const auto* weight = types[1].as<TensorTypeNode>();
if (data == nullptr || weight == nullptr) return false;
const auto* param = attrs.as<DenseAttrs>();
ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr.";
ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
<< "Expected quantized dense input dtype to be int8 or uint8, but got " << data->dtype;
ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
<< "Expected quantized dense weight dtype to be int8 or uint8, but got " << weight->dtype;
ICHECK(param->out_dtype == DataType::Int(32))
<< "Expected quantized dense output dtype to be int32, but got " << param->out_dtype;
// Bail out if the zero point or input scale types are not yet inferred
// (weight_scale is handled by AssignType below).
for (size_t i = 2; i < 5; ++i) {
if (types[i].as<IncompleteTypeNode>()) {
return false;
}
}
ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point
ICHECK(IsScalarType(types[4], DataType::Float(32))); // input_scale
// weight_zero_point (types[3]) can be a scalar or a vector with the same shape as
// weight_scale, so it is deliberately not constrained to a scalar here.
AssignType(types[5], DataType::Float(32), param->units, reporter); // weight_scale
ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
// Collect the data, weight, and output tensor types, without the scales and zero
// points, so the Relay Dense/Matmul type relation can be reused.
Array<Type> tensor_types = {types[0], types[1], types[6]};
return MatmulRel<DenseAttrs>(tensor_types, 3, attrs, reporter);
}
// Positional relay function to create the quantized dense operator, used by the frontend FFI.
Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point,
Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) {
auto attrs = make_object<DenseAttrs>();
attrs->units = std::move(units);
attrs->out_dtype = out_dtype;
static const Op& op = Op::Get("qnn.dense");
return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale},
Attrs(attrs), {});
}
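// Term1: Sigma(k) QA(m, k) * QW(n, k). Computed with the regular Relay dense op,
// accumulating in the int32 out_dtype.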
Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel,
const DenseAttrs* attrs) {
return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype);
}
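// Term2: zp_w * Sigma(k) QA(m, k). The row sums of the quantized data are kept as an
// (m, 1) tensor (keepdims = true) so they broadcast against the (m, n) output. When the
// kernel zero point is not a constant scalar, the sums are repeated out_dim_size times
// along axis 1 so the shapes line up for the multiply.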
Expr DenseSecondTerm(const Expr& quantized_data, const Expr& kernel_zero_point,
const int out_dim_size) {
Array<Integer> axes = {1};
Expr reduced_t2 = Sum(Cast(quantized_data, DataType::Int(32)), axes, true, false);
Expr multiplied_t2;
if (!IsConstScalar(kernel_zero_point)) {
multiplied_t2 = Multiply(kernel_zero_point, MakeRepeat(reduced_t2, out_dim_size, 1));
} else {
multiplied_t2 = Multiply(kernel_zero_point, reduced_t2);
}
return multiplied_t2;
}
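// Term3: zp_a * Sigma(k) QW(n, k). Summing the kernel over its reduction axis (axis 1)
// yields an (n,) vector that broadcasts across the batch dimension of the output.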
Expr DenseThirdTerm(const Expr& quantized_kernel, const Expr& input_zero_point) {
Array<Integer> axes = {1};
return Multiply(input_zero_point,
Sum(Cast(quantized_kernel, DataType::Int(32)), axes, false, false));
}
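// Term4: Sigma(k) zp_a * zp_w = zp_a * zp_w * K, where K is the reduction dimension.
// This overload is used when both zero points are constant integers, so the whole term
// folds to a single int32 constant at compile time.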
Expr DenseFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int reduction_dim_size) {
int32_t scalar_term = input_zero_point_int * kernel_zero_point_int * reduction_dim_size;
return MakeConstantScalar(DataType::Int(32), scalar_term);
}
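// Term4 variant for zero points that are not constant scalars: the same product
// zp_a * zp_w * K is built as a Relay expression instead of a folded constant.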
Expr DenseFourthTerm(const Expr& input_zero_point, const Expr& kernel_zero_point,
int reduction_dim_size) {
auto reduction_dim = MakeConstantScalar(DataType::Int(32), reduction_dim_size);
return Multiply(Multiply(input_zero_point, kernel_zero_point), reduction_dim);
}
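// Combines the four terms as term1 - term2 - term3 + term4.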
Expr DenseCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3, const Expr& term4) {
auto data_term = Subtract(term1, term2);
// Group the compile-time constant terms (term3, term4) together so that constant
// folding can simplify them.
auto const_term = Subtract(term4, term3);
return Add(data_term, const_term);
}
/*
* \brief Forward rewrite the qnn dense op.
* \param attrs The QNN dense attrs.
* \param new_args The new mutated args to the call node.
* \param arg_types The types of input and output.
* \return The sequence of Relay ops for the qnn dense op.
* \note Lowering of the qnn.dense operator
* A quantized tensor is represented in the following manner
* A = scale_a x (QA - zp_a)
* where QA is the quantized tensor, and scale_a and zp_a are the quantization
* params.
*
* Quantized dense multiplies two quantized tensors and returns a quantized
* tensor with a default dtype of int32, a scale equal to the product of the
* scales of the input tensors, and a zero point of zero.
*
* The lowering for asymmetric quantized dense looks as follows. More details at
* https://discuss.tvm.ai/t/tf-lite-quantized-conv2d-operator-conversion/2651/8
* The computation gets unrolled into the following 4 terms
* C(m, n) = Sigma(k) (A(m, k) * W(n, k))
*
* The RHS becomes
* Sigma(k) ([QA(m, k) - zp_a] * [QW(n, k) - zp_w])
*
* Unrolling leads to the following sequence
* Sigma(k) QA(m, k) * QW(n, k) // Term1
* - Sigma(k) zp_w * QA(m, k) // Term2
* - Sigma(k) zp_a * QW(n, k) // Term3
* + Sigma(k) zp_a * zp_w // Term4
*
* Term3 and Term4 can be computed at compile time.
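*
* A quick numeric check with K = 2, QA = [2, 3], zp_a = 1, QW = [4, 5], zp_w = 2,
* i.e. real values A = [1, 2], W = [2, 3], and A.W = 8:
* Term1 = 2*4 + 3*5 = 23
* Term2 = zp_w * (2 + 3) = 10
* Term3 = zp_a * (4 + 5) = 9
* Term4 = K * zp_a * zp_w = 4
* Term1 - Term2 - Term3 + Term4 = 23 - 10 - 9 + 4 = 8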
*/
Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
const Array<tvm::relay::Type>& arg_types) {
ICHECK_EQ(new_args.size(), 6);
Expr quantized_data = new_args[0];
Expr quantized_kernel = new_args[1];
Expr input_zero_point = new_args[2];
Expr kernel_zero_point = new_args[3];
const auto in_shape = get_shape(arg_types[0]);
const auto w_shape = get_shape(arg_types[1]);
const int reduction_dim_size = get_const_int(in_shape[1]);
const int out_dim_size = get_const_int(w_shape[0]);
const auto* qnn_dense_attrs = attrs.as<DenseAttrs>();
auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs);
auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point, out_dim_size);
auto term3 = DenseThirdTerm(quantized_kernel, input_zero_point);
// If either zero point is not a constant scalar, the terms cannot be specialized,
// so emit the generic four-term lowering.
if (!IsConstScalar(input_zero_point) || !IsConstScalar(kernel_zero_point)) {
auto term4 = DenseFourthTerm(input_zero_point, kernel_zero_point, reduction_dim_size);
return DenseCombineTerms(term1, term2, term3, term4);
}
auto kernel_zero_point_int = GetScalarFromConstant<int>(kernel_zero_point);
auto input_zero_point_int = GetScalarFromConstant<int>(input_zero_point);
// Get all the terms as described in the comments.
auto term4 = DenseFourthTerm(input_zero_point_int, kernel_zero_point_int, reduction_dim_size);
// Combine those 4 terms depending on the zero points to get the best lowering.
if (input_zero_point_int == 0 && kernel_zero_point_int == 0) {
// term 2, 3 and 4 become zero.
return term1;
} else if (input_zero_point_int == 0 && kernel_zero_point_int != 0) {
// term 3 and term 4 become zero.
return Subtract(term1, term2);
} else if (input_zero_point_int != 0 && kernel_zero_point_int == 0) {
// term 2 and term 4 become zero.
return Subtract(term1, term3);
} else {
return DenseCombineTerms(term1, term2, term3, term4);
}
}
RELAY_REGISTER_OP("qnn.dense")
.describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
- **data**: quantized(int8, uint8) `(x1, x2, ..., xn, input_dim)`
- **weight**: quantized(int8, uint8) `(units, input_dim)`
- **out**: quantized(int32) `(x1, x2, ..., xn, units)`.
)code" TVM_ADD_FILELINE)
.set_attrs_type<DenseAttrs>()
.set_num_inputs(6)
.add_argument("data", "quantized nD Tensor", "Input data.")
.add_argument("weight", "quantized 2D Tensor", "Weight matrix.")
.add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
.add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
.add_argument("weight_scale", "Tensor", "The quantization scale of the weight tensor.")
.add_argument("weight_zero_point", "Tensor",
"The quantization zero_point of the weight tensor.")
.set_support_level(11)
.add_type_rel("QDense", QnnDenseRel)
.set_attr<TNonComputational>("TNonComputational", true)
.set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnDenseCanonicalize);
TVM_REGISTER_GLOBAL("relay.qnn.op._make.dense").set_body_typed(MakeQuantizedDense);
} // namespace qnn
} // namespace relay
} // namespace tvm