| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| #ifndef SINGA_CORE_MATH_H_ |
| #define SINGA_CORE_MATH_H_ |
| #include <algorithm> |
| #include <iostream> |
| #include <iterator> |
| #include <sstream> |
| #include <string> |
| #include <type_traits> |
| #include <vector> |
| |
| #include "singa/core/common.h" |
| #include "singa/core/tensor.h" |
| #include "singa/utils/logging.h" |
| |
| namespace singa { |
| |
/// \file math.h Math functions for linear algebra, neural net and random
/// operations.
/// All functions have a template argument, DType for DataType, Lang for the
/// device programming language, e.g., lang::Cpp, lang::Cuda. Backends supply
/// the actual implementations via template specialization (see the sketch
/// after the convention list below).
| /// |
| /// TODO(wangwei) Clean the functions to make the function APIs consistent: |
| /// 1. All function names should be like XxxYyy or XY, i.e., capitalize the |
| /// first letter. |
| /// 2. Order functions based on function name in alphabetical order. |
/// 3. Function argument order is [const basic type] [const Tensor] [mutable
/// Tensor].
/// 4. For function argument names, use 'num' for the total number of elements
/// in element-wise operations; use 'in1', 'in2' for input Tensors; use 'out'
/// for the output Tensor or value. There are exceptions for some functions,
/// e.g., Scale(const float alpha, const Tensor &in, Tensor* out);
/// for such cases, use x, v, alpha, etc. for scalar-typed arguments.
/// For BLAS functions, follow the BLAS style for argument names.
/// Use 'M' and 'v' for matrix and vector tensors in functions involving both
/// matrices and vectors.
| /// 5. For Tensor argument xxx, name its raw pointer as xxxPtr. |
| /// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h |
| /// 7. Use size_t for the number of elements, rows or columns. |
/// 8. Use the same name for the Tensor-level functions and these low-level
/// math functions.
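///
/// A minimal sketch of a backend specialization (assuming a CPU backend over
/// float data; the pointer-extraction pattern is illustrative, not the exact
/// code of any particular backend):
///
///   template <>
///   void Abs<float, lang::Cpp>(const Tensor &in, Tensor *out, Context *ctx) {
///     const float *inPtr = static_cast<const float *>(in.block()->data());
///     float *outPtr = static_cast<float *>(out->block()->mutable_data());
///     for (size_t i = 0; i < in.Size(); i++) outPtr[i] = std::fabs(inPtr[i]);
///   }
///
/// Callers then dispatch by instantiation, e.g.,
/// Abs<float, lang::Cpp>(in, &out, ctx).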
| |
inline const std::string vec2str(const std::vector<int> &vec) {
  std::ostringstream vts;
  if (!vec.empty()) {
    // Convert all but the last element to avoid a trailing ", "
    std::copy(vec.begin(), vec.end() - 1,
              std::ostream_iterator<int>(vts, ", "));
    // Append the last element without a delimiter
    vts << vec.back();
  }
  return vts.str();
}
| |
inline const std::string vec2str(const std::vector<size_t> &vec) {
  std::ostringstream vts;
  if (!vec.empty()) {
    // Convert all but the last element to avoid a trailing ", "
    std::copy(vec.begin(), vec.end() - 1,
              std::ostream_iterator<size_t>(vts, ", "));
    // Append the last element without a delimiter
    vts << vec.back();
  }
  return vts.str();
}
| |
| // ************************************** |
| // // Element-wise functions |
| // // Cpp tensors support multi-dimensional broadcasting; |
| // // Cuda supports unidirectional broadcasting, |
| // // i.e., the lhs and the output have the same shape |
| // // ************************************** |
| |
| /// out[i] = |in[i]| |
| template <typename DType, typename Lang> |
| void Abs(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Abs Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void Erf(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Erf Not Implemented"; |
| } |
| |
| template <typename DTypeSrc, typename DTypeDst, typename Lang> |
| void CastCopy(const Tensor *src, Tensor *dst, Context *ctx) { |
| LOG(FATAL) << "CastCopy Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void Ceil(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Ceil Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void Floor(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Floor Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void Round(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Round Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void RoundE(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Round Not Implemented"; |
| } |
| |
| /// out[i] = in[i] + x |
| template <typename DType, typename Lang> |
| void Add(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Add Not Implemented"; |
| } |
| |
| /// out[i] = in1[i] + in2[i] |
| template <typename DType, typename Lang> |
| void Add(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Add-Pair Not Implemented"; |
| } |
| /// Clamp every element into [low, high] |
| /// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low. |
| template <typename DType, typename Lang> |
| void Clamp(const DType low, const DType high, const Tensor &in, Tensor *out, |
| Context *ctx) { |
| LOG(FATAL) << "Clamp Not Implemented"; |
| } |
| |
| /// out[i] = x / in[i] |
| template <typename DType, typename Lang> |
| void Div(const DType x, const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Div Not Implemented"; |
| } |
| |
| /// out[i] = in[i] / x |
| template <typename DType, typename Lang> |
| void Div(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| CHECK_NE(x, 0.f); |
| EltwiseMult<DType, Lang>(in, DType(1) / x, out, ctx); |
| } |
| |
| /// out[i] = in1[i] / in2[i] |
| template <typename DType, typename Lang> |
| void Div(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Div-Pair Not Implemented"; |
| } |
| |
| /// out[i] = in[i] * x |
| template <typename DType, typename Lang> |
| void EltwiseMult(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "EltwiseMult Not Implemented"; |
| } |
| |
| /// out[i] = in1[i] * in2[i] |
| template <typename DType, typename Lang> |
| void EltwiseMult(const Tensor &in1, const Tensor &in2, Tensor *out, |
| Context *ctx) { |
| LOG(FATAL) << "EltwiseMult-Pair Not Implemented"; |
| } |
| |
| /// out[i]=(in2[i]>0)?in1[i]:0.f |
| template <typename DType, typename Lang> |
| void ReLUBackward(const Tensor &in1, const Tensor &in2, Tensor *out, |
| Context *ctx) { |
| LOG(FATAL) << "ReLUBackward Not Implemented"; |
| } |
| |
/// Natural exponential, base e (Euler's number): out[i] = exp(in[i])
| template <typename DType, typename Lang> |
| void Exp(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Exp Not Implemented"; |
| } |
| |
| /// out[i]=(in[i]<=x)?1.f:0.f |
| template <typename DType, typename Lang> |
| void LE(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "LE Not Implemented"; |
| } |
| /// out[i]=(in1[i]<=in2[i])?1.f:0.f |
| template <typename DType, typename Lang> |
| void LE(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Tensor-Tensor LE Not Implemented"; |
| } |
/// Natural logarithm, base e (Euler's number): out[i] = log(in[i]).
| template <typename DType, typename Lang> |
| void Log(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Log Not Implemented"; |
| } |
| /// out[i]=(in[i]<x)?1.f:0.f |
| template <typename DType, typename Lang> |
| void LT(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "LT Not Implemented"; |
| } |
| /// out[i]=(in1[i]<in2[i])?1.f:0.f |
| template <typename DType, typename Lang> |
| void LT(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Tensor-Tensor LT Not Implemented"; |
| } |
| /// out[i]=(in[i]>=x)?1.f:0.f |
| template <typename DType, typename Lang> |
| void GE(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "GE Not Implemented"; |
| } |
| /// out[i]=(in1[i]>=in2[i])?1.f:0.f |
| template <typename DType, typename Lang> |
| void GE(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Tensor-Tensor GE Not Implemented"; |
| } |
| /// out[i]=(in[i]>x)?1.f:0.f |
| template <typename DType, typename Lang> |
| void GT(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "GT Not Implemented"; |
| } |
| /// out[i]=(in[i]>in2[i])?1.f:0.f |
| template <typename DType, typename Lang> |
| void GT(const Tensor &in, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Tensor-Tensor GT Not Implemented"; |
| } |
| /// out[i]=(in[i]==x)?1.f:0.f |
| template <typename DType, typename Lang> |
| void EQ(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "EQ Not Implemented"; |
| } |
| /// out[i]=(in[i]==in2[i])?1.f:0.f |
| template <typename DType, typename Lang> |
| void EQ(const Tensor &in, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Tensor-Tensor EQ Not Implemented"; |
| } |
| /// out[i] = pow(in[i], x) |
| template <typename DType, typename Lang> |
| void Pow(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Pow Not Implemented"; |
| } |
| |
| /// out[i]=pow(in1[i], in2[i]) |
| template <typename DType, typename Lang> |
| void Pow(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Pow-Pair Not Implemented"; |
| } |
| |
| /// out[i]=max(0, in[i]) |
| template <typename DType, typename Lang> |
| void ReLU(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "ReLU Not Implemented"; |
| } |
| |
| /// out[i] = x |
| template <typename DType, typename Lang> |
| void Set(const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Set Not Implemented"; |
| } |
| /// out[i]=sigmoid(in[i]) |
| template <typename DType, typename Lang> |
| void Sigmoid(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Sigmoid Not Implemented"; |
| } |
| |
| /// out[i] = log(exp(in[i]) + 1) |
| template <typename DType, typename Lang> |
| void SoftPlus(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "SoftPlus Not Implemented"; |
| } |
| |
| /// out[i] = in[i] / (abs(in[i]) + 1) |
| template <typename DType, typename Lang> |
| void SoftSign(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "SoftSign Not Implemented"; |
| } |
| |
| /// out[i] = sign(in[i]) |
| template <typename DType, typename Lang> |
| void Sign(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Sign Not Implemented"; |
| } |
| /// out[i]=sqrt(in[i]) |
| template <typename DType, typename Lang> |
| void Sqrt(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Sqrt Not Implemented"; |
| } |
| |
| /// out[i]=square(in[i]) |
| template <typename DType, typename Lang> |
| void Square(const Tensor &in, Tensor *out, Context *ctx) { |
| EltwiseMult<DType, Lang>(in, in, out, ctx); |
| } |
| |
| /// out[i] = in[i] - x |
| template <typename DType, typename Lang> |
| void Sub(const Tensor &in, const DType x, Tensor *out, Context *ctx) { |
| Add<DType, Lang>(in, -x, out, ctx); |
| } |
| |
| /// out[i] = in1[i] - in2[i] |
| template <typename DType, typename Lang> |
| void Sub(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Sub-Pair Not Implemented"; |
| } |
| |
| /// sum all elements of in into out |
| template <typename DType, typename Lang> |
| void Sum(const Tensor &in, DType *out, Context *ctx) { |
| LOG(FATAL) << "Sum Not Implemented"; |
| } |
| |
| /// out[i]=fn(in[i]) |
| #define GenUnaryNotImplemented(fn, stringfn) \ |
| template <typename DType, typename Lang> \ |
| void fn(const Tensor &in, Tensor *out, Context *ctx) { \ |
| std::string str = stringfn; \ |
| str += " Not Implemented"; \ |
| LOG(FATAL) << str; \ |
| } |
| |
| GenUnaryNotImplemented(Cos, "Cos"); |
| GenUnaryNotImplemented(Cosh, "Cosh"); |
| GenUnaryNotImplemented(Acos, "Acos"); |
| GenUnaryNotImplemented(Acosh, "Acosh"); |
| GenUnaryNotImplemented(Sin, "Sin"); |
| GenUnaryNotImplemented(Sinh, "Sinh"); |
| GenUnaryNotImplemented(Asin, "Asin"); |
| GenUnaryNotImplemented(Asinh, "Asinh"); |
| GenUnaryNotImplemented(Tan, "Tan"); |
| GenUnaryNotImplemented(Tanh, "Tanh"); |
| GenUnaryNotImplemented(Atan, "Atan"); |
| GenUnaryNotImplemented(Atanh, "Atanh"); |
| |
/// Similar to cudnnTransformTensor.
/// Copies the data from one tensor to another tensor with a different layout;
/// the tensors must have the same dimensions but not necessarily the same
/// strides.
| template <typename DType, typename Lang> |
| void Transform(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Transform Not Implemented"; |
| } |
| |
| // ************************************** |
| // Random functions |
| // ************************************** |
/// Each element of out is 1 with probability p and 0 with probability 1-p;
/// requires 0 <= p <= 1.
// Get the random generator from 'ctx'.
// If DType is not float, then convert the threshold to DType.
| template <typename DType, typename Lang> |
| void Bernoulli(const float p, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Bernoulli Not Implemented"; |
| } |
| // The random generator should be extracted from ctx. |
| // If DType is not float, then convert the mean and std to DType |
| template <typename DType, typename Lang> |
| void Gaussian(const DType mean, const DType std, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Gaussian Not Implemented"; |
| } |
| // The random generator should be extracted from ctx. |
| // If DType is not float, then convert the low and high to DType |
| template <typename DType, typename Lang> |
| void Uniform(const DType low, const DType high, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Uniform Not Implemented"; |
| } |
| |
| // ********************************************************* |
| // BLAS functions, ref to http://docs.nvidia.com/cuda/cublas |
| // ********************************************************* |
| |
/// Return the index of the element with the max value.
| template <typename DType, typename Lang> |
| void Amax(const Tensor &in, size_t *out, Context *ctx) { |
| LOG(FATAL) << "Amax Not Implemented"; |
| } |
| |
/// Return the index of the element with the min value.
| template <typename DType, typename Lang> |
| void Amin(const Tensor &in, size_t *out, Context *ctx) { |
| LOG(FATAL) << "Amin Not Implemented"; |
| } |
/// out = sum of |in[i]| over all elements of in
| template <typename DType, typename Lang> |
| void Asum(const Tensor &in, DType *out, Context *ctx) { |
| LOG(FATAL) << "Asum Not Implemented"; |
| } |
| |
| /// out = alpha * in + out |
| template <typename DType, typename Lang> |
| void Axpy(const DType alpha, const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Axpy Not Implemented"; |
| } |
| |
| /// out = alpha * in + out |
| template <typename DType, typename Lang> |
| void Axpy(const Tensor &alpha, const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Axpy Not Implemented"; |
| } |
| |
/// out = ||in||_2, i.e., the L2 (Euclidean) norm.
| template <typename DType, typename Lang> |
| void Nrm2(const Tensor &in, float *out, Context *ctx) { |
| LOG(FATAL) << "Nrm2 Not Implemented"; |
| } |
| |
| /// out *= x |
| template <typename DType, typename Lang> |
| void Scale(const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Scale Not Implemented"; |
| } |
| |
/// Inner (dot) product of in1 and in2.
| template <typename DType, typename Lang> |
| void Dot(const Tensor &in1, const Tensor &in2, DType *out, Context *ctx) { |
| LOG(FATAL) << "Dot Not Implemented"; |
| } |
| template <typename DType, typename Lang> |
| void Dot(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Dot Not Implemented"; |
| } |
| |
/// out = alpha * A * v + beta * out.
/// Whether A is transposed is determined from A's internal data layout.
| template <typename DType, typename Lang> |
| void GEMV(const DType alpha, const Tensor &A, const Tensor &v, const DType beta, |
| Tensor *out, Context *ctx) { |
| LOG(FATAL) << "GEMV Not Implemented"; |
| } |
| |
/// Multiply a matrix with a diagonal matrix constructed using values from 'v'.
/// If side_right is true, compute M * diag(v); otherwise compute diag(v) * M.
| template <typename DType, typename Lang> |
| void DGMM(const bool side_right, const Tensor &M, const Tensor &v, Tensor *out, |
| Context *ctx) { |
| LOG(FATAL) << "DGMM Not Implemented"; |
| } |
| |
/// C = alpha * A * B + beta * C.
/// Whether A or B is transposed is determined from its internal data layout.
| template <typename DType, typename Lang> |
| void GEMM(const DType alpha, const Tensor &A, const Tensor &B, const DType beta, |
| Tensor *C, Context *ctx) { |
| LOG(FATAL) << "GEMM Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void GEMMBatched(const DType alpha, const Tensor &A, const Tensor &B, |
| const DType beta, Tensor *C, Context *ctx) { |
| LOG(FATAL) << "GEMM Batched Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void SoftMax(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void SoftMaxBackward(const Tensor &in, Tensor *out, const Tensor &fdout, |
| Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| |
// TODO(yisen)
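/// Compute the cross-entropy loss over a batch; 'p' holds the predicted
/// probabilities (batchsize x dim) and 't' the targets. 'int_target' is
/// expected to indicate whether 't' stores integer class indices (one per
/// sample) rather than probability/one-hot vectors.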
| template <typename DType, typename Lang> |
| void ComputeCrossEntropy(bool int_target, const size_t batchsize, |
| const size_t dim, const Tensor &p, const Tensor &t, |
| Tensor *loss, Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize, |
| const size_t dim, const Tensor &p, const Tensor &t, |
| Tensor *grad, Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| |
| template <typename DType, typename Lang> |
| void RowMax(const Tensor &in, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| // ************************************** |
| // Matrix functions |
| // ************************************** |
| /* |
| /// Add the vector v to every column of A as the column of out |
| template <typename DType, typename Lang> |
void AddCol(const size_t nrow, const size_t ncol, const Tensor &A,
            const Tensor &v, Tensor *out, Context *ctx) {
| LOG(FATAL) << "AddCol Not Implemented"; |
| } |
| // TODO(wangwei) unify AddRow and AddCol. |
| /// Add the vector v to every row of A as the row of out |
| template <typename DType, typename Lang> |
void AddRow(const size_t nrow, const size_t ncol, const Tensor &A,
            const Tensor &v, Tensor *out, Context *ctx) {
| LOG(FATAL) << "AddRow Not Implemented"; |
| } |
| /// outer-product. |
| /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n |
| template <typename DType, typename Lang> |
| void Outer(const size_t m, const size_t n, const Tensor &in1, const Tensor &in2, |
| Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Outer Not Implemented"; |
| } |
| |
| /// Sum the columns of the in matrix into a vector |
| template <typename DType, typename Lang> |
void SumColumns(const size_t nrow, const size_t ncol, const Tensor &in,
                Tensor *out, Context *ctx) {
| LOG(FATAL) << "SumColumns Not Implemented"; |
| } |
| template <typename DType, typename Lang> |
| void Set(const DType x, Tensor *out, Context *ctx) { |
| LOG(FATAL) << "Not Implemented"; |
| } |
| |
| // TODO(wangwei) unify SumRow and SumCol. |
| /// Sum the rows of the in matrix into a vector |
| template <typename DType, typename Lang> |
void SumRows(const size_t nrow, const size_t ncol, const Tensor &in,
             Tensor *out, Context *ctx) {
| LOG(FATAL) << "SumRows Not Implemented"; |
| } |
| */ |
| |
| } // namespace singa |
| #endif // SINGA_CORE_MATH_H_ |