// include/mshadow/tensor_expr_engine-inl.hpp - singa - Git at Google

 #ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
 #define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
 /*!
  * \file tensor_expr_engine-inl.hpp
  * \brief definitions of how expressions should be evaluated
  * \author Tianqi Chen, Bing Xu
  */
 #include "tensor_expr.h"
 #include "tensor.h"

 namespace mshadow{
     namespace expr{
         /*!
          * \brief base class for extension expressions that present themselves
          *        as a tensor of a known shape
          *
          *  SubType is expected to derive from this class: real_self() downcasts
          *  the object to SubType with a static_cast.
          * \tparam SubType concrete expression type built on top of this base
          * \tparam SrcExp  expression the operation is applied to
          * \tparam dim     dimension of the resulting expression
          */
         template<typename SubType, typename SrcExp, int dim>
         struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{
             /*! \brief shape reported for this expression */
             Shape<dim> shape_;
             /*!
              * \brief view this object as its concrete subtype
              * \return const reference to this object, statically cast to SubType
              */
             inline const SubType& real_self( void ) const{
                 const SubType &self = static_cast<const SubType&>( *this );
                 return self;
             }
         };
     };

     namespace expr{
         /*!
          * \brief execution plan of an expression: an object that can produce the
          *        value of the expression at a given index.
          *        This primary template only declares the interface; the
          *        specializations in this file supply the definitions.
          * \tparam ExpType type of the expression the plan evaluates
          */
         template<typename ExpType>
         class Plan{
         public:
             /*!
              * \brief evaluate the expression at index [y][x];
              *        declared only, to be defined by each specialization
              * \param y first index
              * \param x second index
              * \return value of the expression at that index
              */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const;
         };

         /*!
          * \brief plan that reads elements directly from a tensor's data pointer
          * \tparam Device device type of the tensor
          * \tparam dim dimension of the tensor
          */
         template <typename Device, int dim>
         class Plan< Tensor<Device,dim> >{
         public:
             /*! \brief remember the data pointer and the stride of t; no data is copied */
             Plan( const Tensor<Device,dim> &t )
                 : dptr_( t.dptr ), stride_( t.shape.stride_ ) {}
             /*! \brief element stored at offset y * stride + x from the data pointer */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const index_t offset = y * stride_ + x;
                 return dptr_[ offset ];
             }
         private:
             /*! \brief data pointer taken from the tensor */
             const real_t *dptr_;
             /*! \brief distance, in elements, between consecutive values of y */
             index_t stride_;
         };
         /*!
          * \brief plan for a 1d tensor: no stride is stored and the y index is ignored
          * \tparam Device device type of the tensor
          */
         template <typename Device>
         class Plan< Tensor<Device,1> >{
         public:
             /*! \brief remember the data pointer of t; no data is copied */
             Plan( const Tensor<Device,1> &t )
                 : dptr_( t.dptr ) {}
             /*! \brief element number x; y does not take part in the lookup */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t *elem = dptr_ + x;
                 return *elem;
             }
         private:
             /*! \brief data pointer taken from the tensor */
             const real_t *dptr_;
         };

         /*! \brief plan of a scalar expression: every index evaluates to the same value */
         template<>
         class Plan<ScalarExp>{
         public:
             /*! \brief store the scalar value */
             Plan( real_t scalar )
                 : scalar_( scalar ) {}
             /*! \brief return the stored scalar, whatever y and x are */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t value = scalar_;
                 return value;
             }
         private:
             /*! \brief the constant value */
             real_t scalar_;
         };

         /*!
          * \brief plan of a binary element-wise expression: evaluates both operand
          *        plans at the same index and combines them with OP::Map
          * \tparam OP binary operator, provides static Map( a, b )
          * \tparam TA expression type of the left operand
          * \tparam TB expression type of the right operand
          * \tparam etype expression category of the BinaryMapExp
          */
         template<typename OP, typename TA, typename TB,int etype>
         class Plan< BinaryMapExp<OP,TA,TB,etype> >{
         public:
             /*! \brief keep copies of the two operand plans */
             Plan( const Plan<TA> &lhs, const Plan<TB> &rhs )
                 : lhs_( lhs ), rhs_( rhs ) {}
             /*! \brief OP::Map applied to the operand values at [y][x] */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t left  = lhs_.Eval( y, x );
                 const real_t right = rhs_.Eval( y, x );
                 return OP::Map( left, right );
             }
         private:
             /*! \brief plan of the left operand */
             Plan<TA> lhs_;
             /*! \brief plan of the right operand */
             Plan<TB> rhs_;
         };

         /*!
          * \brief plan of a unary element-wise expression: evaluates the source
          *        plan and passes the value through OP::Map
          * \tparam OP unary operator, provides static Map( a )
          * \tparam TA expression type of the source
          * \tparam etype expression category of the UnaryMapExp
          */
         template<typename OP, typename TA, int etype>
         class Plan< UnaryMapExp<OP,TA,etype> >{
         public:
             /*! \brief keep a copy of the source plan */
             Plan( const Plan<TA> &src )
                 : src_( src ) {}
             /*! \brief OP::Map applied to the source value at [y][x] */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t input = src_.Eval( y, x );
                 return OP::Map( input );
             }
         private:
             /*! \brief plan of the source expression */
             Plan<TA> src_;
         };


         /*!
          * \brief plan of a MakeTensorExp: a pass-through wrapper around the
          *        plan of the concrete subtype
          * \tparam SubType concrete expression type
          * \tparam SrcExp source expression of the MakeTensorExp
          * \tparam dim dimension of the expression
          */
         template<typename SubType, typename SrcExp, int dim>
         struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{
         public:
             /*! \brief keep a copy of the subtype's plan */
             Plan( const Plan<SubType> &src )
                 : src_( src ) {}
             /*! \brief forward the evaluation to the subtype's plan */
             MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t value = src_.Eval( y, x );
                 return value;
             }
         private:
             /*! \brief plan of the concrete subtype */
             Plan<SubType> src_;
         };

         // Forward declaration of the BinaryMapExp overload, so that the UnaryMapExp
         // overload of MakePlan (defined before it) can already see it.
         template<typename OP, typename TA, typename TB, int etype>
         inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e );

         /*!
          * \brief translate a scalar expression into its execution plan
          * \param e scalar expression
          * \return plan holding the value e.scalar_
          */
         inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){
             Plan<ScalarExp> plan( e.scalar_ );
             return plan;
         }

         /*!
          * \brief translate a container expression into its execution plan
          * \param e container expression
          * \return plan constructed from e.self()
          */
         template<typename T>
         inline Plan<T> MakePlan( const ContainerExp<T> &e ){
             Plan<T> plan( e.self() );
             return plan;
         }

         /*!
          * \brief translate a MakeTensorExp into the plan of its concrete subtype
          * \param e expression derived from MakeTensorExp
          * \return plan constructed from e.real_self()
          */
         template<typename T, typename SrcExp, int dim>
         inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){
             Plan< T > plan( e.real_self() );
             return plan;
         }

         /*!
          * \brief translate a unary map expression into its execution plan
          * \param e unary expression; the plan of e.src_ is built recursively
          * \return plan wrapping the plan of the source expression
          */
         template<typename OP, typename TA, int etype>
         inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){
             typedef Plan< UnaryMapExp<OP,TA,etype> > PlanType;
             return PlanType( MakePlan( e.src_ ) );
         }

         /*!
          * \brief translate a binary map expression into its execution plan
          * \param e binary expression; the plans of e.lhs_ and e.rhs_ are built recursively
          * \return plan wrapping the plans of both operands
          */
         template<typename OP, typename TA, typename TB, int etype>
         inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
             typedef Plan< BinaryMapExp<OP,TA,TB,etype> > PlanType;
             return PlanType( MakePlan( e.lhs_ ), MakePlan( e.rhs_ ) );
         }
     }; // namespace expr

     namespace expr{
         /*!
          * \brief compile-time type inference for expressions.
          *        ExpInfo<E>::kDim is the dimension of the expression, -1 marks a
          *        mismatch inside the expression.
          *        ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0 means the expression
          *        can be assigned to cpu.
          *        This primary template is the fallback for types without a
          *        specialization: dimension -1 and an empty device mask.
          * \tparam E expression type
          */
         template<typename E>
         struct ExpInfo{
             /*! \brief -1: no valid dimension is known for E */
             static const int kDim = -1;
             /*! \brief 0: no device bit is set */
             static const int kDevMask = 0;
         };
         /*! \brief a scalar has dimension 0 and sets the low 16 device bits */
         template<>
         struct ExpInfo<ScalarExp>{
             /*! \brief dimension 0 is used for scalars */
             static const int kDim = 0;
             /*! \brief mask 0xffff */
             static const int kDevMask = 0xffff;
         };
         /*! \brief a tensor reports its own dimension and the mask of its device */
         template<typename Device, int dim>
         struct ExpInfo< Tensor<Device,dim> >{
             /*! \brief dimension of the tensor */
             static const int kDim = dim;
             /*! \brief device mask taken from Device */
             static const int kDevMask = Device::kDevMask;
         };
         /*!
          * \brief a MakeTensorExp has dimension dim as long as its source
          *        expression is valid, and inherits the source's device mask
          */
         template<typename T, typename SrcExp, int dim>
         struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{
             /*! \brief dimension of the source expression */
             static const int kDimSrc = ExpInfo<SrcExp>::kDim;
             /*! \brief dim, or -1 when the source expression is invalid */
             static const int kDim = ( kDimSrc < 0 ) ? -1 : dim;
             /*! \brief device mask of the source expression */
             static const int kDevMask = ExpInfo<SrcExp>::kDevMask;
         };
         /*! \brief a unary map keeps the dimension and device mask of its source */
         template<typename OP, typename TA, int etype>
         struct ExpInfo< UnaryMapExp<OP,TA,etype> >{
             /*! \brief dimension of the source expression */
             static const int kDim = ExpInfo<TA>::kDim;
             /*! \brief device mask of the source expression */
             static const int kDevMask = ExpInfo<TA>::kDevMask;
         };
         /*!
          * \brief dimension and device mask of a binary map.
          *        If either operand is invalid the result is -1; a dimension-0
          *        operand adopts the dimension of the other one; otherwise both
          *        dimensions must be equal. The device mask is the intersection
          *        of both operand masks.
          */
         template<typename OP, typename TA, typename TB, int etype>
         struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{
             /*! \brief dimension of the left operand */
             static const int kDimLhs = ExpInfo<TA>::kDim;
             /*! \brief dimension of the right operand */
             static const int kDimRhs = ExpInfo<TB>::kDim;
             /*! \brief combined dimension, -1 on mismatch */
             static const int kDim =
                 ( kDimLhs < 0 || kDimRhs < 0 ) ? -1
                 : ( kDimLhs == 0 ) ? kDimRhs
                 : ( kDimRhs == 0 || kDimLhs == kDimRhs ) ? kDimLhs
                 : -1;
             /*! \brief devices supported by both operands */
             static const int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask;
         };

         /*!
          * \brief compile-time check of an expression against a target
          * \tparam Device device type of the target
          * \tparam dim dimension of the target
          * \tparam E expression type to be checked
          */
         template<typename Device, int dim, typename E>
         struct TypeCheck{
             /*! \brief dimension of the expression */
             static const int kExpDim = ExpInfo<E>::kDim;
             /*! \brief true when the expression's device mask shares a bit with Device */
             static const bool kDevPass = ( ExpInfo<E>::kDevMask & Device::kDevMask ) != 0;
             /*! \brief true when the expression can be mapped to an expression of dimension dim */
             static const bool kMapPass = kDevPass && ( kExpDim == 0 || kExpDim == dim );
             /*! \brief true when the expression can be reduced to an expression of dimension dim */
             static const bool kRedPass = kDevPass && ( kExpDim > dim );
         };

         /*!
          * \brief helper that turns a failed compile-time check into a compile error:
          *        only the <true> specialization has the Error_* members, so calling
          *        one of them on TypeCheckPass<false> does not compile
          * \tparam kPass result of the check
          */
         template<bool kPass>
         struct TypeCheckPass;
         /*! \brief failed check: intentionally empty */
         template<>
         struct TypeCheckPass<false>{
         };
         /*! \brief passed check: every Error_* member is an empty function */
         template<>
         struct TypeCheckPass<true>{
             static inline void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){
             }
             static inline void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){
             }
             static inline void Error_Expression_Does_Not_Meet_Dimension_Req( void ){
             }
         };
     }; // namespace expr

     namespace expr{
         /*!
          * \brief shape consistency check of an expression.
          *        This primary template only declares Check; the specializations
          *        in this file define it per expression type.
          * \tparam dim dimension the expression is checked against
          * \tparam E expression type
          */
         template<int dim,typename E>
         struct ShapeCheck{
             /*!
              * \brief check the expression and report its shape
              * \param t expression to inspect
              * \return shape of the expression
              */
             inline static Shape<dim> Check( const E &t );
         };

         /*! \brief shape check of a scalar: returns a marker shape whose entry 0 is 0 */
         template<int dim>
         struct ShapeCheck<dim,ScalarExp>{
             /*!
              * \brief build the marker shape of a scalar expression
              * \param exp scalar expression, not inspected
              * \return shape with entry 0 set to 0; the remaining entries are not assigned here
              */
             inline static Shape<dim> Check( const ScalarExp &exp ){
                 // entry 0 == 0 is the mark of a scalar expression
                 Shape<dim> marker;
                 marker[0] = 0;
                 return marker;
             }
         };
         /*! \brief shape check of a tensor of matching dimension: its own shape */
         template<int dim,typename Device>
         struct ShapeCheck<dim,Tensor<Device,dim> >{
             /*! \brief return a copy of t.shape */
             inline static Shape<dim> Check( const Tensor<Device,dim> &t ){
                 Shape<dim> result = t.shape;
                 return result;
             }
         };
         /*! \brief shape check of a MakeTensorExp: the shape stored in the expression */
         template<int dim,typename SrcExp,typename T>
         struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{
             /*! \brief return a copy of t.shape_ */
             inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){
                 Shape<dim> result = t.shape_;
                 return result;
             }
         };
         /*! \brief shape check of a unary map: the shape of its source expression */
         template<int dim, typename OP, typename TA, int etype>
         struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{
             /*! \brief check t.src_ and pass its shape on */
             inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){
                 return ShapeCheck<dim,TA>::Check( t.src_ );
             }
         };
         /*!
          * \brief shape check of a binary map: both operands are checked, a
          *        scalar operand adopts the other operand's shape, and two
          *        non-scalar operands must have equal shapes
          */
         template<int dim, typename OP, typename TA, typename TB, int etype>
         struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
             /*!
              * \brief check both operands and return the common shape
              * \param t binary expression
              * \return shape of the non-scalar operand(s); asserts when two
              *         non-scalar shapes differ
              */
             inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
                 Shape<dim> left  = ShapeCheck<dim,TA>::Check( t.lhs_ );
                 Shape<dim> right = ShapeCheck<dim,TB>::Check( t.rhs_ );
                 // entry 0 == 0 marks a scalar operand: the other side decides
                 if( left[0] == 0 ){
                     return right;
                 }
                 if( right[0] == 0 ){
                     return left;
                 }
                 utils::Assert( left == right, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same");
                 return left;
             }
         };
     }; // namespace expr

     // the matrix OP depends on BLAS
     namespace expr{
         /*!
          * \brief engine that evaluates a dot product into a destination tensor.
          *        This primary template only declares Eval; the supported
          *        dimension/transpose combinations are specialized further down.
          * \tparam SV saver type; the specializations read SV::kAlphaBLAS and SV::kBetaBLAS
          * \tparam Device device type of all three tensors
          * \tparam ddim dimension of the destination
          * \tparam ldim dimension of the left operand
          * \tparam rdim dimension of the right operand
          * \tparam ltrans whether the left operand is transposed
          * \tparam rtrans whether the right operand is transposed
          */
         template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans>
         struct DotEngine{
             /*!
              * \brief evaluate the dot product
              * \param dst destination tensor
              * \param lhs left operand
              * \param rhs right operand
              * \param scale scaling factor applied to the product
              */
             inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale );
         };

         /*!
          * \brief per-device wrapper around the BLAS routines (gemm, gemv, ger)
          *        used by DotEngine; declared here, specialized per device below
          * \tparam Device device type
          */
         template<typename Device>
         struct BLASEngine;

         #if MSHADOW_USE_CBLAS || MSHADOW_USE_MKL
         /*!
          * \brief cpu BLAS engine: thin wrappers that forward their arguments
          *        unchanged to the CBLAS routines, always with CblasColMajor.
          *        Each routine has a float and a double overload.
          */
         template<>
         struct BLASEngine<cpu>{
             /*! \brief translate a transpose flag into the CBLAS enum value */
             static inline CBLAS_TRANSPOSE GetT( bool t ){
                 if( t ) return CblasTrans;
                 return CblasNoTrans;
             }
             /*! \brief single precision matrix-matrix product, forwards to cblas_sgemm */
             static inline void gemm( bool transa, bool transb,
                                      int m, int n, int k, float alpha,
                                      const float *A, int lda,
                                      const float *B, int ldb,
                                      float beta, float *C, int ldc ){
                 cblas_sgemm( CblasColMajor, GetT( transa ), GetT( transb ),
                              m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
             }
             /*! \brief double precision matrix-matrix product, forwards to cblas_dgemm */
             static inline void gemm( bool transa, bool transb,
                                      int m, int n, int k, double alpha,
                                      const double *A, int lda,
                                      const double *B, int ldb,
                                      double beta, double *C, int ldc ){
                 cblas_dgemm( CblasColMajor, GetT( transa ), GetT( transb ),
                              m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
             }
             /*! \brief single precision matrix-vector product, forwards to cblas_sgemv */
             static inline void gemv( bool trans, int m, int n, float alpha,
                                      const float *A, int lda,
                                      const float *X, int incX,
                                      float beta, float *Y, int incY ){
                 cblas_sgemv( CblasColMajor, GetT( trans ),
                              m, n, alpha, A, lda, X, incX, beta, Y, incY );
             }
             /*! \brief double precision matrix-vector product, forwards to cblas_dgemv */
             static inline void gemv( bool trans, int m, int n, double alpha,
                                      const double *A, int lda,
                                      const double *X, int incX,
                                      double beta, double *Y, int incY ){
                 cblas_dgemv( CblasColMajor, GetT( trans ),
                              m, n, alpha, A, lda, X, incX, beta, Y, incY );
             }
             /*! \brief single precision rank-1 update, forwards to cblas_sger */
             static inline void ger( int m, int n, float alpha,
                                     const float *X, int incX,
                                     const float *Y, int incY,
                                     float *A, int lda ){
                 cblas_sger( CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda );
             }
             /*! \brief double precision rank-1 update, forwards to cblas_dger */
             static inline void ger( int m, int n, double alpha,
                                     const double *X, int incX,
                                     const double *Y, int incY,
                                     double *A, int lda ){
                 cblas_dger( CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda );
             }
         };
         #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL

         #if MSHADOW_USE_CUDA
         /*!
          * \brief gpu BLAS engine: thin wrappers that forward their arguments
          *        unchanged to the cuBLAS routines.
          *        All CuBLAS calls go through here; the legacy API is used,
          *        which is not threadsafe.
          *        Each routine has a float and a double overload.
          */
         template<>
         struct BLASEngine<gpu>{
             /*! \brief translate a transpose flag into the character 'T' or 'N' */
             static inline char GetT( bool t ){
                 if( t ) return 'T';
                 return 'N';
             }
             /*! \brief single precision matrix-matrix product, forwards to cublasSgemm */
             static inline void gemm( bool transa, bool transb,
                                      int m, int n, int k, float alpha,
                                      const float *A, int lda,
                                      const float *B, int ldb,
                                      float beta, float *C, int ldc ){
                 cublasSgemm( GetT( transa ), GetT( transb ),
                              m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
             }
             /*! \brief double precision matrix-matrix product, forwards to cublasDgemm */
             static inline void gemm( bool transa, bool transb,
                                      int m, int n, int k, double alpha,
                                      const double *A, int lda,
                                      const double *B, int ldb,
                                      double beta, double *C, int ldc ){
                 cublasDgemm( GetT( transa ), GetT( transb ),
                              m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
             }
             /*! \brief single precision matrix-vector product, forwards to cublasSgemv */
             static inline void gemv( bool trans, int m, int n, float alpha,
                                      const float *A, int lda,
                                      const float *X, int incX,
                                      float beta, float *Y, int incY ){
                 cublasSgemv( GetT( trans ),
                              m, n, alpha, A, lda, X, incX, beta, Y, incY );
             }
             /*! \brief double precision matrix-vector product, forwards to cublasDgemv */
             static inline void gemv( bool trans, int m, int n, double alpha,
                                      const double *A, int lda,
                                      const double *X, int incX,
                                      double beta, double *Y, int incY ){
                 cublasDgemv( GetT( trans ),
                              m, n, alpha, A, lda, X, incX, beta, Y, incY );
             }
             /*! \brief single precision rank-1 update, forwards to cublasSger */
             static inline void ger( int m, int n, float alpha,
                                     const float *X, int incX,
                                     const float *Y, int incY,
                                     float *A, int lda ){
                 cublasSger( m, n, alpha, X, incX, Y, incY, A, lda );
             }
             /*! \brief double precision rank-1 update, forwards to cublasDger */
             static inline void ger( int m, int n, double alpha,
                                     const double *X, int incX,
                                     const double *Y, int incY,
                                     double *A, int lda ){
                 cublasDger( m, n, alpha, X, incX, Y, incY, A, lda );
             }
         };
         #endif

         /*!
          * \brief helper that yields the shape an operand has inside a dot product
          * \param shape shape of the 2d operand
          * \param transpose whether the operand is used transposed
          * \return shape itself when transpose is false, otherwise
          *         Shape2( shape[0], shape[1] ) -- presumably the two extents
          *         swapped; Shape2's argument order is defined outside this file
          */
         inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){
             if( !transpose ){
                 return shape;
             }
             return Shape2( shape[0], shape[1] );
         }
         // dst = dot( lhs[.T], rhs[.T] )
         template<typename SV, typename xpu, bool transpose_left, bool transpose_right>
         struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{
             inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
                 Shape<2> sleft  = GetShape( lhs.shape, transpose_left );
                 Shape<2> sright = GetShape( rhs.shape, transpose_right );
                 utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \
                                && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" );
                 // use column major argument to compatible with most BLAS
                 BLASEngine<xpu>::gemm
                     ( transpose_right , transpose_left,
                       transpose_right ? rhs.shape[1] : rhs.shape[0],
                       transpose_left  ? lhs.shape[0] : lhs.shape[1],
                       transpose_right ? rhs.shape[0] : rhs.shape[1],
                       scale * SV::kAlphaBLAS,
                       rhs.dptr, rhs.shape.stride_,
                       lhs.dptr, lhs.shape.stride_,
                       SV::kBetaBLAS,
                       dst.dptr, dst.shape.stride_ );
             }
         };
         template<typename SV, typename xpu, bool transpose_right>
         struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{
             inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
                 Shape<2> sright = GetShape( rhs.shape, transpose_right );
                 utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch");
                 BLASEngine<xpu>::gemv
                     ( transpose_right,
                       rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS,
                       rhs.dptr, rhs.shape.stride_,
                       lhs.dptr, 1, SV::kBetaBLAS,
                       dst.dptr, 1 );
             }
         };
         /*!
          * \brief outer-product case: dst = dot( lhs.T, rhs ) for two vectors.
          *        Uses ger when SV::kBetaBLAS is below 1e-6, otherwise falls back
          *        to the matrix-matrix engine on the vectors flattened to 2d.
          * \tparam SV saver type, supplies kAlphaBLAS and kBetaBLAS
          * \tparam xpu device type
          */
         template<typename SV, typename xpu>
         struct DotEngine<SV,xpu,2,1,1,true,false>{
             /*!
              * \brief check the operand shapes, then run ger or gemm
              * \param dst destination matrix
              * \param lhs left vector
              * \param rhs right vector
              * \param scale factor multiplied with SV::kAlphaBLAS to form alpha
              */
             inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) {
                 const bool shapes_agree = dst.shape[1] == lhs.shape[0]
                                        && dst.shape[0] == rhs.shape[0];
                 utils::Assert( shapes_agree, "dot-ger: matrix shape mismatch" );
                 if( !( SV::kBetaBLAS < 1e-6f ) ){
                     // general path: treat both vectors as matrices and use gemm
                     DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale );
                     return;
                 }
                 // NOTE(review): BLAS ger is a rank-1 update (A += alpha*x*y^T) and takes
                 // no beta, yet this path is chosen when kBetaBLAS is ~0 -- confirm against
                 // the SV definitions that dst does not need to be cleared first.
                 BLASEngine<xpu>::ger( rhs.shape[0], lhs.shape[0],
                                       scale * SV::kAlphaBLAS,
                                       rhs.dptr, 1,
                                       lhs.dptr, 1,
                                       dst.dptr, dst.shape.stride_ );
             }
         };

     }; // namespace expr

     namespace expr{
         /*!
          * \brief engine that evaluates complex expressions (those that are not
          *        plain element-wise maps). Only Eval is declared here; the
          *        supported expression types are specialized below.
          * \tparam SV saver type
          * \tparam Device device type of the destination
          * \tparam dim dimension of the destination
          * \tparam E type of the complex expression
          */
         template<typename SV, typename Device, int dim, typename E>
         struct ExpComplexEngine{
             /*!
              * \brief evaluate exp into dst
              * \param dst destination tensor
              * \param exp expression to evaluate
              */
             inline static void Eval( Tensor<Device,dim>& dst, const E &exp );
         };
         /*!
          * \brief expression engine for tensor destinations: picks the evaluation
          *        routine from the category tag of the expression.
          *        Container and mapper expressions go through MapExp, complex
          *        expressions are handed to ExpComplexEngine.
          * \tparam SV saver type
          * \tparam Device device type of the destination
          * \tparam dim dimension of the destination
          */
         template<typename SV, typename Device, int dim>
         struct ExpEngine<SV, Tensor<Device,dim> >{
             /*! \brief container expression: evaluated through MapExp */
             template<typename E>
             static inline void Eval( Tensor<Device,dim>& dst,
                                      const Exp<E,type::kContainer> &exp ){
                 MapExp<SV,dim,E>( dst, exp );
             }
             /*! \brief mapper expression: evaluated through MapExp */
             template<typename E>
             static inline void Eval( Tensor<Device,dim>& dst,
                                      const Exp<E,type::kMapper> &exp ){
                 MapExp<SV,dim,E>( dst, exp );
             }
             /*! \brief complex expression: evaluated by ExpComplexEngine on exp.self() */
             template<typename E>
             static inline void Eval( Tensor<Device,dim>& dst,
                                      const Exp<E,type::kComplex> &exp ){
                 typedef ExpComplexEngine<SV,Device,dim,E> Engine;
                 Engine::Eval( dst, exp.self() );
             }
         };
         /*!
          * \brief complex engine for a dot product of two tensors: unpacks the
          *        DotExp and forwards it to the matching DotEngine
          */
         template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans>
         struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{
             /*!
              * \brief evaluate the dot expression into dst
              * \param dst destination tensor
              * \param exp dot expression; its lhs_, rhs_ and scale_ are passed on
              */
             inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){
                 typedef DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans> Engine;
                 Engine::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ );
             }
         };
     }; // namespace expr
 };
 #endif
	#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
	#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
	/*!
	* \file tensor_expr_engine-inl.hpp
	* \brief definitions of how expressions should be evaluated
	* \author Tianqi Chen, Bing Xu
	*/
	#include "tensor_expr.h"
	#include "tensor.h"

	namespace mshadow{
	namespace expr{
	    /*!
	     * \brief a general class that allows extension that makes tensors of some shape
	     * \tparam SubType type of subclass
	     * \tparam SrcExp source expression of the MakeTensorExp, the source of operation
	     * \tparam dim dimension of the expression
	     */
	    template<typename SubType, typename SrcExp, int dim>
	    struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{
	        /*! \brief the shape of this expression */
	        Shape<dim> shape_;
	        /*!
	         * \brief true self of subtype
	         * \return const reference to this object, statically cast to SubType
	         */
	        inline const SubType& real_self( void ) const{
	            // cast the this-pointer to the subtype pointer, then dereference it
	            return *static_cast<const SubType*>(this);
	        }
	    };
	};

	namespace expr{
	/! \brief This part of code gives plan that can be used to carry out execution /
	template<typename ExpType>
	class Plan{
	public:
	/*!
	* \brief evaluate the expression at index [y][x]
	* to be implemented by SubType
	*/
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const;
	};

	template <typename Device, int dim>
	class Plan< Tensor<Device,dim> >{
	public:
	Plan( const Tensor<Device,dim> &t )
	:dptr_(t.dptr),stride_(t.shape.stride_){}
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return dptr_[ y * stride_ + x ];
	}
	private:
	const real_t *dptr_;
	index_t stride_;
	};
	// special evaluation case for 1d tensor
	template <typename Device>
	class Plan< Tensor<Device,1> >{
	public:
	Plan( const Tensor<Device,1> &t ):dptr_(t.dptr){}
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return dptr_[ x ];
	}
	private:
	const real_t *dptr_;
	};

	template<>
	class Plan<ScalarExp>{
	public:
	Plan( real_t scalar ):scalar_(scalar){}
	/! \brief evaluate at [y][x] /
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return scalar_;
	}
	private:
	real_t scalar_;
	};

	template<typename OP, typename TA, typename TB,int etype>
	class Plan< BinaryMapExp<OP,TA,TB,etype> >{
	public:
	Plan( const Plan<TA> &lhs, const Plan<TB> &rhs )
	:lhs_(lhs), rhs_(rhs){}
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
	}
	private:
	Plan<TA> lhs_;
	Plan<TB> rhs_;
	};

	template<typename OP, typename TA, int etype>
	class Plan< UnaryMapExp<OP,TA,etype> >{
	public:
	Plan( const Plan<TA> &src ):src_(src){}
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return OP::Map( src_.Eval( y, x ) );
	}
	private:
	Plan<TA> src_;
	};


	template<typename SubType, typename SrcExp, int dim>
	struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{
	public:
	Plan( const Plan<SubType> &src ):src_(src){}
	MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
	return src_.Eval( y, x );
	}
	private:
	Plan<SubType> src_;
	};

	// allow UnaryMap see the plan
	template<typename OP, typename TA, typename TB, int etype>
	inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e );

	// translate from exp to execution plan
	inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){
	return Plan<ScalarExp>( e.scalar_ );
	}

	template<typename T>
	inline Plan<T> MakePlan( const ContainerExp<T> &e ){
	return Plan<T>( e.self() );
	}

	template<typename T, typename SrcExp, int dim>
	inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){
	return Plan< T >( e.real_self() );
	}

	template<typename OP, typename TA, int etype>
	inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){
	return Plan< UnaryMapExp<OP,TA,etype> >( MakePlan(e.src_) );
	}

	template<typename OP, typename TA, typename TB, int etype>
	inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
	return Plan< BinaryMapExp<OP,TA,TB,etype> >( MakePlan(e.lhs_), MakePlan(e.rhs_) );
	}
	}; // namespace expr

	namespace expr{
	    /*!
	     * \brief static type inference template,
	     *        used to get the dimension of each expression,
	     *        if ExpInfo<E>::kDim == -1, this means there is a mismatch in the expression
	     *        if ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu
	     * \tparam E expression
	     */
	    template<typename E>
	    struct ExpInfo{
	        const static int kDim = -1;
	        const static int kDevMask = 0;
	    };
	    /*! \brief a scalar has dimension 0 and sets the low 16 device bits */
	    template<>
	    struct ExpInfo<ScalarExp>{
	        const static int kDim = 0;
	        const static int kDevMask = 0xffff;
	    };
	    /*! \brief a tensor reports its own dimension and the mask of its device */
	    template<typename Device, int dim>
	    struct ExpInfo< Tensor<Device,dim> >{
	        const static int kDim = dim;
	        const static int kDevMask = Device::kDevMask;
	    };
	    /*! \brief a MakeTensorExp has dimension dim when its source expression is valid */
	    template<typename T, typename SrcExp, int dim>
	    struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{
	        const static int kDimSrc = ExpInfo<SrcExp>::kDim;
	        const static int kDim = kDimSrc >= 0 ? dim : -1;
	        const static int kDevMask = ExpInfo<SrcExp>::kDevMask;
	    };
	    /*! \brief a unary map keeps the dimension and device mask of its source */
	    template<typename OP, typename TA, int etype>
	    struct ExpInfo< UnaryMapExp<OP,TA,etype> >{
	        const static int kDim = ExpInfo<TA>::kDim;
	        const static int kDevMask = ExpInfo<TA>::kDevMask;
	    };
	    /*!
	     * \brief a binary map: a dimension-0 operand adopts the other operand's
	     *        dimension, otherwise both dimensions must be equal; the device
	     *        mask is the intersection of both operand masks
	     */
	    template<typename OP, typename TA, typename TB, int etype>
	    struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{
	        const static int kDimLhs = ExpInfo<TA>::kDim;
	        const static int kDimRhs = ExpInfo<TB>::kDim;
	        const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ?
	            ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1;
	        const static int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask;
	    };

	    /*! \brief template to do type check */
	    template<typename Device, int dim, typename E>
	    struct TypeCheck{
	        /*! \brief dimension of expression */
	        const static int kExpDim = ExpInfo<E>::kDim;
	        /*! \brief whether the expression device type matches */
	        const static bool kDevPass = (ExpInfo<E>::kDevMask & Device::kDevMask) != 0;
	        /*! \brief whether the expression can be mapped to expression of dim */
	        const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass;
	        /*! \brief whether the expression can be reduced to expression of dim */
	        const static bool kRedPass = (kExpDim > dim) && kDevPass;
	    };

	    /*!
	     * \brief turns a failed check into a compile error: only the <true>
	     *        specialization provides the Error_* members
	     */
	    template<bool kPass>
	    struct TypeCheckPass;
	    template<>
	    struct TypeCheckPass<false>{};
	    template<>
	    struct TypeCheckPass<true>{
	        inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){}
	        inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){}
	        inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){}
	    };
	}; // namespace expr

	namespace expr{
	    /*!
	     * \brief shape consistency check; the primary template only declares
	     *        Check, the specializations below define it
	     * \tparam dim dimension the expression is checked against
	     * \tparam E expression type
	     */
	    template<int dim,typename E>
	    struct ShapeCheck{
	        inline static Shape<dim> Check( const E &t );
	    };

	    /*! \brief scalar: returns a marker shape whose entry 0 is 0 */
	    template<int dim>
	    struct ShapeCheck<dim,ScalarExp>{
	        inline static Shape<dim> Check( const ScalarExp &exp ){
	            // entry 0 == 0 is the mark of a scalar expression;
	            // the remaining entries are not assigned here
	            Shape<dim> marker;
	            marker[0] = 0;
	            return marker;
	        }
	    };
	    /*! \brief tensor of matching dimension: its own shape */
	    template<int dim,typename Device>
	    struct ShapeCheck<dim,Tensor<Device,dim> >{
	        inline static Shape<dim> Check( const Tensor<Device,dim> &t ){
	            Shape<dim> result = t.shape;
	            return result;
	        }
	    };
	    /*! \brief MakeTensorExp: the shape stored in the expression */
	    template<int dim,typename SrcExp,typename T>
	    struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{
	        inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){
	            Shape<dim> result = t.shape_;
	            return result;
	        }
	    };
	    /*! \brief unary map: the shape of its source expression */
	    template<int dim, typename OP, typename TA, int etype>
	    struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{
	        inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){
	            return ShapeCheck<dim,TA>::Check( t.src_ );
	        }
	    };
	    /*!
	     * \brief binary map: a scalar operand adopts the other operand's shape,
	     *        two non-scalar operands must have equal shapes
	     */
	    template<int dim, typename OP, typename TA, typename TB, int etype>
	    struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
	        inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
	            Shape<dim> left  = ShapeCheck<dim,TA>::Check( t.lhs_ );
	            Shape<dim> right = ShapeCheck<dim,TB>::Check( t.rhs_ );
	            if( left[0] == 0 ){
	                return right;
	            }
	            if( right[0] == 0 ){
	                return left;
	            }
	            utils::Assert( left == right, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same");
	            return left;
	        }
	    };
	}; // namespace expr

	// the matrix OP depends on BLAS
	namespace expr{
	    /*!
	     * \brief engine that evaluates a dot product into a destination tensor;
	     *        only declared here, specialized per dimension combination below
	     */
	    template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans>
	    struct DotEngine{
	        inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale );
	    };

	    /*! \brief per-device wrapper around the BLAS routines that handle the dot */
	    template<typename Device>
	    struct BLASEngine;

	    #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL)
	    /*! \brief cpu engine: forwards to the CBLAS routines with CblasColMajor */
	    template<>
	    struct BLASEngine<cpu>{
	        /*! \brief translate a transpose flag into the CBLAS enum value */
	        inline static CBLAS_TRANSPOSE GetT( bool t ){
	            return t ? CblasTrans : CblasNoTrans;
	        }
	        /*! \brief single precision matrix-matrix product */
	        inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha,
	                                 const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){
	            cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
	        }
	        /*! \brief double precision matrix-matrix product */
	        inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha,
	                                 const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){
	            cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
	        }
	        /*! \brief single precision matrix-vector product */
	        inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda,
	                                 const float *X, int incX, float beta, float *Y, int incY ){
	            cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
	        }
	        /*! \brief double precision matrix-vector product */
	        inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda,
	                                 const double *X, int incX, double beta, double *Y, int incY ){
	            cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
	        }
	        /*! \brief single precision rank-1 update */
	        inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){
	            cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
	        }
	        /*! \brief double precision rank-1 update */
	        inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){
	            cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
	        }
	    };
	    #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL

	    #if MSHADOW_USE_CUDA
	    /*! \brief gpu engine: all CuBLAS goes to here, use legacy API: not threadsafe */
	    template<>
	    struct BLASEngine<gpu>{
	        /*! \brief translate a transpose flag into the character 'T' or 'N' */
	        inline static char GetT( bool t ){
	            return t ? 'T' : 'N';
	        }
	        /*! \brief single precision matrix-matrix product */
	        inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha,
	                                 const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){
	            cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
	        }
	        /*! \brief double precision matrix-matrix product */
	        inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha,
	                                 const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){
	            cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
	        }
	        /*! \brief single precision matrix-vector product */
	        inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda,
	                                 const float *X, int incX, float beta, float *Y, int incY ){
	            cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
	        }
	        /*! \brief double precision matrix-vector product */
	        inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda,
	                                 const double *X, int incX, double beta, double *Y, int incY ){
	            cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
	        }
	        /*! \brief single precision rank-1 update */
	        inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){
	            cublasSger(m,n,alpha,X,incX,Y,incY,A,lda);
	        }
	        /*! \brief double precision rank-1 update */
	        inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){
	            cublasDger(m,n,alpha,X,incX,Y,incY,A,lda);
	        }
	    };
	    #endif

	    /*!
	     * \brief helper function to decide which shape we are in
	     * \return shape itself when transpose is false, otherwise
	     *         Shape2( shape[0], shape[1] ) -- presumably the two extents
	     *         swapped; Shape2's argument order is defined outside this file
	     */
	    inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){
	        return transpose ? Shape2(shape[0],shape[1]) : shape;
	    }
	    /*! \brief matrix-matrix case: dst = dot( lhs[.T], rhs[.T] ), evaluated by gemm */
	    template<typename SV, typename xpu, bool transpose_left, bool transpose_right>
	    struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{
	        inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
	            Shape<2> sleft = GetShape( lhs.shape, transpose_left );
	            Shape<2> sright = GetShape( rhs.shape, transpose_right );
	            utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0]
	                           && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" );
	            // use column-major arguments to be compatible with most BLAS:
	            // the operands are handed over in swapped order (rhs, then lhs)
	            BLASEngine<xpu>::gemm
	                ( transpose_right , transpose_left,
	                  transpose_right ? rhs.shape[1] : rhs.shape[0],
	                  transpose_left ? lhs.shape[0] : lhs.shape[1],
	                  transpose_right ? rhs.shape[0] : rhs.shape[1],
	                  scale * SV::kAlphaBLAS,
	                  rhs.dptr, rhs.shape.stride_,
	                  lhs.dptr, lhs.shape.stride_,
	                  SV::kBetaBLAS,
	                  dst.dptr, dst.shape.stride_ );
	        }
	    };
	    /*! \brief vector-matrix case: dst = dot( lhs, rhs[.T] ), evaluated by gemv */
	    template<typename SV, typename xpu, bool transpose_right>
	    struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{
	        inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
	            Shape<2> sright = GetShape( rhs.shape, transpose_right );
	            utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch");
	            BLASEngine<xpu>::gemv
	                ( transpose_right,
	                  rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS,
	                  rhs.dptr, rhs.shape.stride_,
	                  lhs.dptr, 1, SV::kBetaBLAS,
	                  dst.dptr, 1 );
	        }
	    };
	    /*!
	     * \brief outer-product case: dst = dot( lhs.T, rhs ) for two vectors;
	     *        uses ger when SV::kBetaBLAS is below 1e-6, otherwise gemm on
	     *        the vectors flattened to 2d
	     */
	    template<typename SV, typename xpu>
	    struct DotEngine<SV,xpu,2,1,1,true,false>{
	        inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) {
	            utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" );
	            // NOTE(review): BLAS ger is a rank-1 update (A += alpha*x*y^T) and takes
	            // no beta, yet it is chosen when kBetaBLAS is ~0 -- confirm against the
	            // SV definitions that dst does not need to be cleared first.
	            if( SV::kBetaBLAS < 1e-6f ){
	                BLASEngine<xpu>::ger
	                    ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS,
	                      rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ );
	            }else{
	                DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale );
	            }
	        }
	    };

	}; // namespace expr

	namespace expr{
	/! \brief some engine that evaluate complex expression /
	template<typename SV, typename Device, int dim, typename E>
	struct ExpComplexEngine{
	inline static void Eval( Tensor<Device,dim>& dst, const E &exp );
	};
	template<typename SV, typename Device, int dim>
	struct ExpEngine<SV, Tensor<Device,dim> >{
	template<typename E>
	inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kMapper> &exp ){
	MapExp<SV,dim,E>( dst, exp );
	}
	template<typename E>
	inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kContainer> &exp ){
	MapExp<SV,dim,E>( dst, exp );
	}
	template<typename E>
	inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kComplex> &exp ){
	ExpComplexEngine<SV,Device,dim,E>::Eval( dst, exp.self() );
	}
	};
	template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans>
	struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{
	inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){
	DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans>::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ );
	}
	};
	}; // namespace expr
	};
	#endif