/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*!
 * Copyright (c) 2017 by Contributors
 * \file quantized_fully_connected.cu
 * \brief GPU implementation of the quantized fully connected operator
 * \author Ziheng Jiang, Jun Wu
 */
#include "./quantization_utils.h"
#include "../mxnet_op.h"
#include "../nn/fully_connected-inl.h"
namespace mxnet {
namespace op {
#if CUDA_VERSION >= 8000
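// The quantized GEMM below relies on cublasGemmEx, which was introduced in
// CUDA 8.0, so everything specific to this path is guarded on the version.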
// out[i] holds an int32 value quantized with scale range_out / limit_out, and
// bias holds int8 values quantized with scale range_bias / limit_bias.  The
// kernel adds the bias in real-value space and requantizes to the out scale:
//   out + bias * (range_bias / limit_bias) * (limit_out / range_out)
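// Example with hypothetical ranges: if max(|min_out|, |max_out|) = 1.0e4 and
// limit_out = 2^31 - 1, one output step represents ~4.7e-6; if
// max(|min_bias|, |max_bias|) = 0.5 and limit_bias = 127, one bias step
// represents ~3.9e-3, so each bias unit adds ~845 output units.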
struct QuantizedBiasAddKernel {
  MSHADOW_XINLINE static void Map(int i, size_t k, int32_t *out,
                                  const int8_t *bias, const float *min_out,
                                  const float *max_out, const float *min_bias,
                                  const float *max_bias) {
    typedef int32_t T1;
    typedef int8_t T2;
    using mshadow::red::limits::MinValue;
    using mshadow::red::limits::MaxValue;
    // Real value represented by one quantized unit of the int32 output
    // and of the int8 bias, respectively.
    float float_for_one_out_quant =
        MaxAbs(*min_out, *max_out) / static_cast<double>(MaxValue<T1>());
    float float_for_one_bias_quant =
        MaxAbs(*min_bias, *max_bias) / static_cast<double>(MaxValue<T2>());
    // Guard against a degenerate output range (min_out == max_out == 0),
    // which would otherwise cause a division by zero below.
    if (float_for_one_out_quant != 0) {
      // Dequantize, add the broadcast bias, and requantize in one step.
      out[i] = (out[i] * float_for_one_out_quant +
                bias[i % k] * float_for_one_bias_quant) /
               float_for_one_out_quant;
    }
  }
};
#endif // CUDA_VERSION >= 8000
template<typename SrcType, typename DstType, typename CmpType>
void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs,
                                       const OpContext &ctx,
                                       const std::vector<TBlob> &inputs,
                                       const std::vector<OpReqType> &req,
                                       const std::vector<TBlob> &outputs) {
#if CUDA_VERSION >= 8000
  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
  using namespace mshadow;
  using namespace mxnet_op;
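  // Inputs are ordered as data, weight, (bias), followed by a (min, max)
  // float pair for each of those tensors, so there are 3 * num_inputs blobs.
  // Outputs are the int32 result and its (min, max) float range.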
  size_t num_inputs = param.no_bias ? 2 : 3;
  CHECK_EQ(inputs.size(), num_inputs * 3);
  CHECK_EQ(outputs.size(), 3U);
  Stream<gpu> *s = ctx.get_stream<gpu>();
  CHECK_EQ(s->blas_handle_ownership_, Stream<gpu>::OwnHandle);
  const TBlob& data = inputs[0];
  const TBlob& weight = inputs[1];
  const TBlob& out = outputs[0];
  mxnet::TShape dshape = data.shape_;
  mxnet::TShape wshape = weight.shape_;
  mxnet::TShape oshape = out.shape_;
  // (m, n) * (k, n).T = (m, k)
  // A * B.T = C
  if (dshape.ndim() != 2) {
    CHECK(param.flatten)
        << "Currently, QuantizedFullyConnected Op only supports flatten=true "
        << "when ishape.ndim() != 2 for GPU.";
  }
  // cuBLAS is column-major, so it sees a row-major matrix X(r, c) as X^T(c, r).
  // Row-major out(m, k) = data(m, n) * weight(k, n)^T is therefore computed as
  // column-major out^T(k, m) = weight(k, n) * data(m, n)^T: the weight buffer
  // (seen as (n, k)) is passed with op = T, and the data buffer (seen as
  // (n, m) = data^T) with op = N.
  const int m = dshape[0], n = dshape.ProdShape(1, dshape.ndim()), k = wshape[0];
  CmpType alpha = 1.0f;
  CmpType beta = 0.0f;
  const cudaDataType src_type = mshadow::DataType<SrcType>::kCudaFlag;
  const cudaDataType dst_type = mshadow::DataType<DstType>::kCudaFlag;
  const cudaDataType cmp_type = mshadow::DataType<CmpType>::kCudaFlag;
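  // cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C in column-major
  // order; alpha = 1 and beta = 0 yield the plain product.  The leading
  // dimension is n for both int8 operands (each row-major buffer stores n
  // contiguous elements per row) and k for the int32 output.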
  CUBLAS_CALL(cublasGemmEx(s->blas_handle_,
                           CUBLAS_OP_T,
                           CUBLAS_OP_N,
                           k,
                           m,
                           n,
                           &alpha,
                           weight.dptr_,
                           src_type,
                           n,
                           data.dptr_,
                           src_type,
                           n,
                           &beta,
                           out.dptr_,
                           dst_type,
                           k,
                           cmp_type,
                           CUBLAS_GEMM_DFALT));
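  // Derive the float range represented by the int32 output from the
  // (min, max) ranges of the int8 data and weight.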
  Kernel<QuantizationRangeForS8S8MultiplicationStruct, gpu>::Launch(s, 1,
      outputs[1].dptr<float>(), outputs[2].dptr<float>(),
      inputs[num_inputs].dptr<float>(), inputs[num_inputs+1].dptr<float>(),
      inputs[num_inputs+2].dptr<float>(), inputs[num_inputs+3].dptr<float>());
  if (!param.no_bias) {
    const TBlob& bias = inputs[2];
    // num_inputs == 3 here, so inputs[7] and inputs[8] are min_bias and max_bias.
    Kernel<QuantizedBiasAddKernel, gpu>::Launch(s, out.Size(),
        k, out.dptr<int32_t>(), bias.dptr<int8_t>(),
        outputs[1].dptr<float>(), outputs[2].dptr<float>(),
        inputs[7].dptr<float>(), inputs[8].dptr<float>());
  }
#else
  LOG(FATAL) << "QuantizedFullyConnectedForwardGPU only supports CUDA >= 8.0";
#endif  // CUDA_VERSION >= 8000
}
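// Register the GPU forward function: int8 data, weight, and bias, int32
// output, with int32 accumulation inside the GEMM.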
NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
.set_attr<FCompute>("FCompute<gpu>", QuantizedFullyConnectedForwardGPU<int8_t, int32_t, int32_t>);
} // namespace op
} // namespace mxnet