#ifndef MSHADOW_TENSOR_H
#define MSHADOW_TENSOR_H
/*!
* \file tensor.h
* \brief header file of tensor data structure and functions
* convention: this lib requires explicit memory allocation and de-allocation;
* all data structures such as Tensor<cpu,1> and Tensor<gpu,1> are handles (pointers),
* and no memory allocation happens during calculation
* \author Bing Xu, Tianqi Chen
*/
#include "tensor_base.h"
#include "tensor_expr.h"
namespace mshadow {
/*!
* \brief shape of a tensor
* IMPORTANT NOTE: this shape is different from numpy.shape
* shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension
* shape[k] corresponds to k-th dimension of tensor
* \tparam dimension dimension of tensor
*/
template<int dimension>
struct Shape {
public:
/*! \brief dimensionality of the tensor */
const static int kMaxShape = dimension;
/*! \brief dimensionality minus 1 */
const static int kSubShape = dimension - 1;
public:
/*! \brief default constructor, does nothing */
MSHADOW_XINLINE Shape(void) {}
/*! \brief copy constructor */
MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
#pragma unroll
for( int i = 0; i < kMaxShape; ++i ){
this->shape_[i] = s[i];
}
this->stride_ = s.stride_;
}
/*!
* \brief get the size of the idx-th dimension
* \param idx dimension index
* \return reference to the size of the idx-th dimension
*/
MSHADOW_XINLINE index_t& operator[](index_t idx) {
return shape_[ idx ];
}
/*!
* \brief get the size of the idx-th dimension
* \param idx dimension index
* \return const reference to the size of the idx-th dimension
*/
MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
return shape_[ idx ];
}
/*! \return whether two shapes are equal */
MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
#pragma unroll
for ( int i = 0; i < kMaxShape; ++i ) {
if (s.shape_[i] != this->shape_[i]) return false;
}
return true;
}
/*!
* \brief flatten the higher dimensions into the second dimension, returning a 2D shape
* \return the flat 2D shape
*/
MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
Shape<2> s;
s.stride_ = this->stride_;
s.shape_[ 0 ] = this->shape_[ 0 ];
index_t ymax = 1;
#pragma unroll
for (int i = 1; i < kMaxShape; ++i) {
ymax *= this->shape_[ i ];
}
s.shape_[1] = ymax;
return s;
}
/*! \return number of valid elements */
MSHADOW_XINLINE size_t Size(void) const{
size_t memsz = this->shape_[ 0 ];
#pragma unroll
for (int i = 1; i < kMaxShape; ++i) {
memsz *= this->shape_[ i ];
}
return memsz;
}
/*! \return memory size in elements, including padding of the aligned lowest (x) dimension */
MSHADOW_XINLINE size_t MSize(void) const {
size_t memsz = this->stride_;
#pragma unroll
for (int i = 1; i < kMaxShape; ++i) {
memsz *= this->shape_[ i ];
}
return memsz;
}
/*!
* \brief product of the dimension sizes in range [dimstart, dimend)
* \param dimstart start dimension
* \param dimend end dimension
* \return the product shape[dimstart] * ... * shape[dimend-1]
*/
MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
index_t num = 1;
#pragma unroll
for (int i = dimstart; i < dimend; ++i) {
num *= this->shape_[ i ];
}
return num;
}
/*!
* \brief get subshape
* \return subshape
*/
MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
Shape<kSubShape> s;
s.stride_ = this->stride_;
// for cuda
#pragma unroll
for (int i = 0; i < kSubShape; ++i) {
s.shape_[ i ] = this->shape_[ i ];
}
return s;
}
public:
/*! \brief storing the dimension information */
index_t shape_[ kMaxShape ];
/*!
* \brief storing the stride information in x dimension
* this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
*/
index_t stride_;
};
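// Example (illustrative comment only): the difference between Size() and MSize().
// Suppose AllocSpace pads a Shape<2> with shape_ = {3, 5} (3 elements in the
// lowest dimension, 5 in the highest) to stride_ = 4:
//
//   Size()  == 3 * 5       == 15   // number of valid elements
//   MSize() == stride_ * 5 == 20   // number of allocated elements, padding included
//
// code stepping through the highest dimension must therefore advance by stride_,
// not by shape_[0]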
// useful construction functions to generate shape
/*!
* \brief construct a one-dimensional shape; stride will equal s0
* \param s0 size of dimension 0
* \return the constructed shape
*/
MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
Shape<1> s; s[0] = s0; s.stride_ = s0;
return s;
}
/*!
* \brief construct a two-dimensional shape; stride will equal s0
* \param s1 size of dimension 1
* \param s0 size of dimension 0
* \return the constructed shape
*/
MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0;
return s;
}
/*!
* \brief construct a three-dimensional shape; stride will equal s0
* \param s2 size of dimension 2
* \param s1 size of dimension 1
* \param s0 size of dimension 0
* \return the constructed shape
*/
MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
Shape<3> s;
s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0;
return s;
}
/*!
* \brief construct a four-dimensional shape; stride will equal s0
* \param s3 size of dimension 3
* \param s2 size of dimension 2
* \param s1 size of dimension 1
* \param s0 size of dimension 0
* \return the constructed shape
*/
MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){
Shape<4> s;
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0;
return s;
}
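// Example (illustrative comment only): the constructors above take sizes from the
// highest dimension down, while operator[] indexes from the lowest dimension up:
//
//   Shape<3> s = Shape3(2, 3, 4);  // a 2 x 3 x 4 shape, lowest dimension has size 4
//   // s[0] == 4, s[1] == 3, s[2] == 2, s.stride_ == 4
//   // s.Size() == 24; s.FlatTo2D() gives a 2D shape with shape_[1] == 6, shape_[0] == 4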
} // namespace mshadow
namespace mshadow {
/*! \brief device name CPU */
struct cpu {
/*! \brief whether this device is CPU or not */
const static bool kDevCPU = true;
/*! \brief device flag number, identifies this device */
const static int kDevMask = 1<<0;
};
/*! \brief device name GPU */
struct gpu {
/*! \brief whether this device is CPU or not */
const static bool kDevCPU = false;
/*! \brief device flag number, identifies this device */
const static int kDevMask = 1<<1;
};
// more compact template
/*!
* \brief general tensor
* \tparam Device which device the tensor is on
* \tparam dimension dimension of the tensor
*/
template<typename Device, int dimension>
struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
public:
/*! \brief whether the tensor resides in CPU memory */
const static bool kDevCPU = Device::kDevCPU;
/*! \brief dimension of subtype */
const static int kSubdim = dimension - 1;
public:
/*! \brief pointer to the data */
real_t *dptr;
/*! \brief shape of the tensor */
Shape<dimension> shape;
public:
/*! \brief default constructor */
MSHADOW_XINLINE Tensor(void) {}
/*! \brief constructor from shape */
MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
/*! \brief constructor from data pointer and shape */
MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr(dptr), shape(shape) {}
/*!
* \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
* \return tensor after flatten
*/
MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
return Tensor<Device, 2>(dptr, shape.FlatTo2D());
}
/*!
* \brief get the (dimension-1)-dimensional subtensor at a given index of the highest dimension
* \param idx index in the highest dimension
* \return the result subtensor
*/
MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
Shape<kSubdim> s = shape.SubShape();
return Tensor<Device, kSubdim>(dptr + s.MSize() * idx, s);
}
/*!
* \brief slice the tensor along the highest dimension over [begin,end)
* \param begin begin position of the slice
* \param end end position of the slice
* \return the sliced tensor
*/
MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const {
Shape<dimension> s = this->shape;
s[ dimension - 1 ] = end - begin;
return Tensor<Device, dimension>(dptr + s.SubShape().MSize() * begin, s);
}
public:
/*! \brief set all elements to scalar s, part of the expression template interface */
inline Tensor<Device,dimension>& operator=( real_t s ){
return this->__assign( s );
}
/*! \brief evaluate a mapper expression and store the result, part of the expression template interface */
template<typename E>
inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
return this->__assign( exp );
}
/*! \brief evaluate a complex expression and store the result, part of the expression template interface */
template<typename E>
inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
return this->__assign( exp );
}
};
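// Example (illustrative comment only, 'data' stands for a user-owned buffer):
// indexing, slicing and flattening return handles into the same memory, no copy is made:
//
//   Tensor<cpu,3> t( data, Shape3(2, 3, 4) );
//   Tensor<cpu,2> page = t[1];          // 3 x 4 view of the second page
//   Tensor<cpu,3> half = t.Slice(0, 1); // 1 x 3 x 4 view of the first page
//   Tensor<cpu,2> flat = t.FlatTo2D();  // 6 x 4 view of the whole tensor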
/*
* specialization of Tensor for dimension 1;
* needed because operator[] has a different implementation (it returns a scalar reference)
*/
template<typename Device>
struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
public:
/*! \brief pointer to the data */
real_t *dptr;
/*! \brief shape of the tensor */
Shape<1> shape;
public:
/*! \brief default constructor */
MSHADOW_XINLINE Tensor(void) {}
/*! \brief constructor from shape */
MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {}
/*! \brief constructor from data pointer and shape */
MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape): dptr(dptr), shape(shape) {}
/*! \brief flatten to 2 dimensions \sa Tensor::FlatTo2D */
MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
return Tensor<Device, 2>(dptr, shape.FlatTo2D());
}
/*! \brief slice the tensor over [begin,end) \sa Tensor::Slice */
MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const {
Shape<1> s;
s[0] = s.stride_ = end - begin;
return Tensor<Device, 1>(dptr + begin, s);
}
/*! \brief get the idx-th element */
MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
/*! \brief get the idx-th element (const version) */
MSHADOW_XINLINE const real_t &operator[](index_t idx) const { return dptr[ idx ]; }
public:
// functions to fit expression template
inline Tensor<Device,1>& operator=( double s ){
return this->__assign( s );
}
template<typename E>
inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
return this->__assign( exp );
}
template<typename E>
inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
return this->__assign( exp );
}
};
} // namespace mshadow
namespace mshadow {
// function declarations
/*!
* \brief initialize the tensor engine, used to call initialization functions of dependent libs;
* this function should be called before any GPU tensor operation,
* when only CPU tensors are used, this call is not needed
* \param device_id GPU device id to be chosen
*/
inline void InitTensorEngine( int device_id=0 );
/*!
* \brief shutdown the tensor engine;
* this function should be called after all GPU tensor operations,
* when only CPU tensors are used, this call is not needed
*/
inline void ShutdownTensorEngine( void );
/*!
* \brief CPU/GPU: allocate space for the tensor, according to the shape in obj;
* this function is responsible for setting the stride_ in each obj.shape
* \tparam dim specify the dim of tensor
* \param obj the tensor object, with shape specified
* \param pad whether to pad dimension 0, keeping the lowest dimension aligned;
* padding may help improve the efficiency of matrix multiplications;
* if true, space is allocated with a stride_ that may not equal shape[0];
* if false, contiguous space is allocated
*/
template<int dim>
inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
/*! \brief refer to the comment of the CPU version \sa AllocSpace */
template<int dim>
inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
/*!
* \brief CPU/GPU: free the space of the tensor, setting obj.dptr to NULL
* \tparam dim specify the dim of tensor
* \param obj the tensor object
*/
template<int dim>
inline void FreeSpace(Tensor<cpu,dim> &obj);
/*! \brief refer to the comment of the CPU version \sa FreeSpace */
template<int dim>
inline void FreeSpace(Tensor<gpu,dim> &obj);
/*!
* \brief CPU/GPU: shortcut to allocate and initialize a tensor
* \tparam Device device of the tensor
* \tparam dim dimension of the tensor
* \param shape shape of the tensor
* \param initv initialization value
* \param pad padding option
* \sa AllocSpace
*/
template<typename Device, int dim>
inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD);
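// Example (illustrative comment only; the gpu part assumes a CUDA-enabled build):
// the expected allocation lifecycle when mixing CPU and GPU tensors:
//
//   InitTensorEngine();                                      // once, before any GPU operation
//   Tensor<cpu,2> c = NewTensor<cpu>( Shape2(5, 3), 0.0f );  // allocate and fill with 0
//   Tensor<gpu,2> g( Shape2(5, 3) );
//   AllocSpace( g );                                         // also sets g.shape.stride_
//   // ... computation ...
//   FreeSpace( c ); FreeSpace( g );                          // dptr set to NULL
//   ShutdownTensorEngine();                                  // once, after all GPU operations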
/*!
* \brief copy data from one tensor to another, with the same shape
* \tparam dim specify the dim of tensor
* \param dst target tensor
* \param src source tensor
*/
template<int dim>
inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
/*! \brief refer to the comment of the CPU version \sa Copy */
template<int dim>
inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
/*! \brief refer to the comment of the CPU version \sa Copy */
template<int dim>
inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
/*! \brief refer to the comment of the CPU version \sa Copy */
template<int dim>
inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
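// Example (illustrative comment only, reusing c and g from the sketch above):
// Copy requires dst and src to have the same shape; the four overloads cover
// every CPU/GPU direction:
//
//   Copy( g, c );  // host -> device
//   Copy( c, g );  // device -> host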
/*!
* \brief CPU/GPU: normalized softmax: dst[i][j] = exp( energy[i][j] ) / ( sum_j exp( energy[i][j] ) )
* \param dst destination
* \param energy input energy
*/
inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
/*! \brief refer to the comment of the CPU version \sa Softmax */
inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
} // namespace mshadow
namespace mshadow{
// function declarations to support expression, no need to understand them
// these functions do not need to be directly used
/*!
* \brief CPU/GPU: map an expression to a tensor, this function calls MapPlan
* \tparam Saver specify storage method
* \tparam dim dim of the tensor; no need to specify this parameter during usage
* \tparam E specifies the expression type; no need to specify this parameter during usage
* \tparam etype expression type
* \param dst destination
* \param exp expression
* \sa namespace mshadow::sv, mshadow::op, mshadow::expr
*/
template<typename Saver, int dim, typename E, int etype>
inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
/*! \brief refer to the comment of the CPU version \sa MapExp */
template<typename Saver, int dim, typename E, int etype>
inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
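// Example (illustrative comment only): MapExp is the function the overloaded
// operator= of Tensor eventually dispatches to, so a statement such as
//
//   dst = src1 + src2;  // dst, src1, src2 are tensors of the same shape
//
// is evaluated roughly as MapExp<sv::saveto>( dst, src1 + src2 ), assuming the
// saver sv::saveto from the mshadow::sv namespace referenced above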
/*!
* \brief CPU/GPU: map an expression and reduce it to a 1D tensor, keeping the lowest dimension (dimension 0)
* \tparam Saver specify storage method
* \tparam Reducer specify a reducer method
* \tparam E specifies the expression type; no need to specify this parameter during usage
* \tparam etype expression type
* \param dst destination
* \param exp expression
* \param scale scale the result before saving
* \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
*/
template<typename Saver, typename Reducer, typename E, int etype>
inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
/*! \brief refer to the comment of the CPU version \sa MapReduceKeepLowest */
template<typename Saver, typename Reducer, typename E, int etype>
inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
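// Example (illustrative comment only, assuming the saver sv::saveto and the
// reducer red::sum from the mshadow::sv and mshadow::red namespaces referenced
// above): sum a 2D tensor over its highest dimension, keeping dimension 0:
//
//   Tensor<cpu,2> src = NewTensor<cpu>( Shape2(5, 3), 1.0f );
//   Tensor<cpu,1> dst = NewTensor<cpu>( Shape1(3), 0.0f );
//   MapReduceKeepLowest<sv::saveto, red::sum>( dst, src, 1.0f );
//   // now dst[i] == 5.0f for every i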
/*!
* \brief CPU/GPU: map an expression and reduce it to a 1D tensor, keeping dimension dimkeep
* \tparam Saver specify storage method
* \tparam Reducer specify a reducer method
* \tparam E specifies the expression type; no need to specify this parameter during usage
* \tparam dimkeep the target dimension to be kept, must be larger than 0; for 0, use MapReduceKeepLowest
* \tparam etype expression type
* \param dst destination
* \param exp expression
* \param scale scale the result before saving
* \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
*/
template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
/*! \brief refer to the comment of the CPU version \sa MapReduceKeepHighDim */
template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
} // namespace mshadow
// execution implementation of expression evaluations
#include "tensor_expr_engine-inl.hpp"
// cpu implementation of functions
#include "tensor_cpu-inl.hpp"
// gpu implementation of functions
#include "tensor_gpu-inl.hpp"
// extension of expressions
#include "tensor_expr_ext.h"
// io
#include "tensor_io.h"
// container
#include "tensor_container.h"
// random number generator
#include "tensor_random.h"
#endif // MSHADOW_TENSOR_H