blob: b98383e83d6a4ea2c07e3da3093fd9718f4b66d2 [file] [log] [blame]
/*!
 * \file tensor_sse-inl.hpp
 * \brief support of sse2 optimization of some operations
 * \author Tianqi Chen
 */
#ifdef __APPLE__
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include "tensor_expr.h"
#include "tensor.h"
namespace mshadow {
/*! \brief namespace to support sse2 vectorization */
namespace sse2{
* \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells
* \param pitch output parameter, the actuall space allocated for each line
* \param lspace number of cells required for each line
* \param num_line number of lines to be allocated
inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){
pitch = ((lspace+15) >> 4) << 4;
#ifdef _MSC_VER
void * res = _aligned_malloc( pitch*num_line, 16 );
#ifdef __APPLE__
void *res = malloc( pitch * num_line );
void * res = memalign( 16, pitch*num_line );
utils::Assert( res != NULL, "AlignedMallocPitch failed" );
return res;
/*!
 * \brief free aligned space
 * \param ptr pointer to space to be freed
 */
inline void AlignedFree( void *ptr ){
    // the two release calls are alternatives; compiling both in would
    // release the same pointer twice on MSVC
#ifdef _MSC_VER
    _aligned_free( ptr );
#else
    free( ptr );
#endif
}
/*!
 * \brief check if a pitch (or any size/address value) is a multiple of 16
 * \param pitch value to be tested
 * \return true when the lowest 4 bits of pitch are all zero
 */
inline bool CheckAlign( size_t pitch ){
    // (1<<4)-1 == 15 masks the bits that must be clear for 16-alignment
    return !(pitch & ((1<<4)-1));
}
/*!
 * \brief check if a pointer is aligned to a multiple of 16
 * \param ptr pointer to be tested
 * \return true when the address held by ptr is a multiple of 16
 */
inline bool CheckAlign( void *ptr ){
    // the address is converted to an integer and tested by the size_t overload
    return CheckAlign( reinterpret_cast<size_t>( ptr ) );
}
* \brief get upper bound of aligned index of size
* \param size size of the array
* \param fsize size of float
inline index_t UpperAlign( index_t size, size_t fsize ){
return (( (size*fsize+15) >> 4 ) << 4) / fsize;
* \brief get lower bound of aligned index of size
* \param size size of the array
* \param fsize size of float
inline index_t LowerAlign( index_t size, size_t fsize ){
return (( (size*fsize) >> 4 ) << 4) / fsize;
}; // namespace sse2
}; // namespace mshadow
// sse types are not compatible with nvcc, only use them in cpu mode
#include <emmintrin.h>
namespace mshadow{
namespace sse2{
/*!
 * \brief float vector real type, used for vectorization
 * \tparam FloatType double or float
 */
// Primary template carries no members; the usable vector types are the
// FVec<float> and FVec<double> specializations declared after it.
template<typename FloatType>
struct FVec {
};
/*! \brief vector real type for float: wraps one __m128 value */
// NOTE(review): in this copy of the file the `template<>` prefix, every
// closing brace of this struct and the `#else`/`#endif` of the `#if` inside
// Sum are absent -- restore them before compiling.
struct FVec<float> {
typedef __m128 DType;
/*! \brief number of float in vector */
const static index_t kSize = 4;
/*! \brief data content */
DType data_;
/* constructors */
// default constructor leaves data_ uninitialized
FVec( void ){}
// wrap an existing __m128 value
FVec( DType data ):data_(data){}
/* set the float: _mm_set1_ps copies s into every lane */
FVec( const float &s ){
data_ = _mm_set1_ps( s );
/*!\brief load from pointer src; _mm_load_ps is the aligned load, so src must be 16-byte aligned */
FVec( const float *src ){
data_ = _mm_load_ps( src );
/*! \brief store data into dst space; _mm_store_ps is the aligned store, so dst must be 16-byte aligned */
inline void Store( float *dst ) const{
return _mm_store_ps( dst, data_ );
/*! \brief sum of all content */
inline float Sum( void ) const{
// add the upper two lanes onto the lower two: ans[0]=d0+d2, ans[1]=d1+d3
DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
// bring ans[1] into lane 0 and add it to ans[0]; lane 0 now holds the total
DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) );
// MSVC <= 1500 on Win64 reads lane 0 through the m128_f32 member
// (presumably because _mm_cvtss_f32 is unavailable there -- TODO confirm)
#if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
return rst.m128_f32[ 0 ];
// every other compiler extracts lane 0 with _mm_cvtss_f32
float rr = _mm_cvtss_f32( rst ) ;
return rr;
/*! \brief vector real type for double: wraps one __m128d value */
// NOTE(review): in this copy of the file the `template<>` prefix, every
// closing brace of this struct and the `#else`/`#endif` of the `#if` inside
// Sum are absent -- restore them before compiling.
struct FVec<double> {
typedef __m128d DType;
/*! \brief number of double in vector */
const static index_t kSize = 2;
/*! \brief data content */
DType data_;
/* constructors */
// default constructor leaves data_ uninitialized
FVec( void ){}
// wrap an existing __m128d value
FVec( DType data ):data_(data){}
/* set the double: _mm_set1_pd copies s into both lanes */
FVec( const double &s ){
data_ = _mm_set1_pd( s );
/*!\brief load from pointer src; _mm_load_pd is the aligned load, so src must be 16-byte aligned */
FVec( const double *src ){
data_ = _mm_load_pd( src );
/*! \brief store data into dst space; _mm_store_pd is the aligned store, so dst must be 16-byte aligned */
inline void Store( double *dst ) const{
return _mm_store_pd( dst, data_ );
/*! \brief sum of all content */
inline double Sum( void ) const{
// duplicate the high lane into the low lane and add: lane 0 = d0 + d1
DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ;
// MSVC <= 1500 on Win64 reads lane 0 through the m128d_f64 member
// (presumably because _mm_cvtsd_f64 is unavailable there -- TODO confirm)
#if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
return tmp.m128d_f64[0];
// every other compiler extracts lane 0 with _mm_cvtsd_f64
double ans = _mm_cvtsd_f64( tmp );
return ans;
namespace sse2{
/*! \brief sse2 operator type of certain operator */
// Primary template: kEnabled is false, so an operator that has no
// specialization is reported as not SSE-capable (expr::SSECheck reads this flag).
// NOTE(review): the closing `};` of this struct is absent in this copy of the file.
template<typename OP>
struct SSEOp{
const static bool kEnabled = false;
// SSEOp specialization for op::plus: flags the operator as SSE-capable and
// maps it to the packed add intrinsics.
// NOTE(review): the `template<>` prefix and all closing braces of this block
// are absent in this copy of the file.
struct SSEOp<op::plus>{
const static bool kEnabled = true;
// float path: lhs + rhs per lane via _mm_add_ps
MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
return FVec<float>( _mm_add_ps( lhs.data_, rhs.data_ ) );
// double path: lhs + rhs per lane via _mm_add_pd
MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
return FVec<double>( _mm_add_pd( lhs.data_, rhs.data_ ) );
// SSEOp specialization for op::minus: flags the operator as SSE-capable and
// maps it to the packed subtract intrinsics.
// NOTE(review): the `template<>` prefix and all closing braces of this block
// are absent in this copy of the file.
struct SSEOp<op::minus>{
const static bool kEnabled = true;
// float path: lhs - rhs per lane via _mm_sub_ps
MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
return FVec<float>( _mm_sub_ps( lhs.data_, rhs.data_ ) );
// double path: lhs - rhs per lane via _mm_sub_pd
MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
return FVec<double>( _mm_sub_pd( lhs.data_, rhs.data_ ) );
// SSEOp specialization for op::mul: flags the operator as SSE-capable and
// maps it to the packed multiply intrinsics.
// NOTE(review): the `template<>` prefix and all closing braces of this block
// are absent in this copy of the file.
struct SSEOp<op::mul>{
const static bool kEnabled = true;
// float path: lhs * rhs per lane via _mm_mul_ps
MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
return FVec<float>( _mm_mul_ps( lhs.data_, rhs.data_ ) );
// double path: lhs * rhs per lane via _mm_mul_pd
MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
return FVec<double>( _mm_mul_pd( lhs.data_, rhs.data_ ) );
// SSEOp specialization for op::div: flags the operator as SSE-capable and
// maps it to the packed divide intrinsics.
// NOTE(review): the `template<>` prefix and all closing braces of this block
// are absent in this copy of the file.
struct SSEOp<op::div>{
const static bool kEnabled = true;
// float path: lhs / rhs per lane via _mm_div_ps
MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
return FVec<float>( _mm_div_ps( lhs.data_, rhs.data_ ) );
// double path: lhs / rhs per lane via _mm_div_pd
MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
return FVec<double>( _mm_div_pd( lhs.data_, rhs.data_ ) );
// SSEOp specialization for the unary op::identity: flags the operator as
// SSE-capable; Map takes a single operand and hands it back unchanged.
// NOTE(review): the `template<>` prefix and all closing braces of this block
// are absent in this copy of the file.
struct SSEOp<op::identity>{
const static bool kEnabled = true;
// float path: returns src as is
MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &src ){
return src;
// double path: returns src as is
MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &src ){
return src;
}; // namespace sse2
namespace sse2{
// savers to do storage
// Generic saver: combines the vector currently stored at dst with src using
// the SSE form of SV::OPType, then writes the result back to dst.
// dst goes through FVec's pointer constructor and Store, i.e. the aligned
// load/store intrinsics.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<typename SV, typename TFloat>
struct Saver{
MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
// read the current destination contents
FVec<TFloat> lhs( dst );
// apply the saver's operator: result = OPType( dst, src )
FVec<TFloat> ans = SSEOp<typename SV::OPType>::Map( lhs, src );
ans.Store( dst );
// Partial specialization for sv::saveto: plain assignment, so src is stored
// to dst directly without loading the previous destination contents.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<typename TFloat>
struct Saver<sv::saveto,TFloat>{
MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
src.Store( dst );
}; // namespace sse2
}; // namespace mshadow
namespace mshadow{
namespace expr{
// same as plan, but use sse2
// Primary SSEPlan template: only declares the two evaluation entry points;
// each specialization for a concrete expression type supplies the definitions.
// NOTE(review): in this copy of the file the access specifier, the `/*!` and
// `*/` delimiters around the two doc lines below, and the closing `};` are
// absent -- restore them before compiling.
template<typename ExpType>
class SSEPlan {
* \brief evaluate the expression at index [y][x], x will be aligned to 4
* to be implemented by SubType
// vector evaluation: returns one FVec<real_t> for position (y, x)
MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const;
// scalar evaluation of the single element at (y, x)
MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const;
// SSEPlan for a tensor leaf: reads elements from a row-major buffer
// addressed as dptr_[ y * stride_ + x ].
// NOTE(review): in this copy of the file the constructor has no initializer
// list or body (presumably it sets dptr_ and stride_ from t -- TODO confirm
// against the upstream source); the access specifiers and closing braces are
// absent as well.
template <typename Device, int dim>
class SSEPlan< Tensor<Device,dim> >{
SSEPlan( const Tensor<Device,dim> &t )
// vector load starting at (y, x); FVec's pointer constructor uses the aligned
// load, so this address must be 16-byte aligned
MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
return sse2::FVec<real_t>( &dptr_[ y*stride_+x ] );
// scalar read of element (y, x)
MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
return dptr_[ y * stride_ + x ];
// start of the tensor data
const real_t *dptr_;
// number of elements between the starts of consecutive rows
index_t stride_;
// SSEPlan for a scalar leaf: every position evaluates to the same value.
// NOTE(review): the `template<>` prefix, access specifiers and closing braces
// of this block are absent in this copy of the file.
class SSEPlan<ScalarExp>{
SSEPlan( real_t scalar ):scalar_(scalar){}
// vector form: the scalar replicated into every lane (y and x are ignored)
MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
return sse2::FVec<real_t>( scalar_ );
// scalar form: the stored value (y and x are ignored)
MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
return scalar_;
// the constant this plan yields
real_t scalar_;
// SSEPlan for a binary expression: evaluates both operand plans at the same
// position and combines the two results with OP.
// NOTE(review): the access specifiers and closing braces of this block are
// absent in this copy of the file.
template<typename OP, typename TA, typename TB,int etype>
class SSEPlan< BinaryMapExp<OP,TA,TB,etype> >{
// keeps copies of the two operand plans
SSEPlan( const SSEPlan<TA> &lhs, const SSEPlan<TB> &rhs )
:lhs_(lhs), rhs_(rhs){}
// vector path: combine through the SSE mapping of OP
MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
return sse2::SSEOp<OP>::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) );
// scalar path: combine through OP::Map directly
MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
// plan of the left operand
SSEPlan<TA> lhs_;
// plan of the right operand
SSEPlan<TB> rhs_;
// SSEPlan for a unary expression: evaluates the operand plan and applies OP
// to the result.
// NOTE(review): the access specifiers and closing braces of this block are
// absent in this copy of the file.
template<typename OP, typename TA, int etype>
class SSEPlan< UnaryMapExp<OP,TA,etype> >{
// keeps a copy of the operand plan
SSEPlan( const SSEPlan<TA> &src ):src_(src){}
// vector path: apply the SSE mapping of OP
MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
return sse2::SSEOp<OP>::Map( src_.EvalSSE( y, x ) );
// scalar path: apply OP::Map directly
MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
return OP::Map( src_.Eval( y, x ) );
// plan of the operand
SSEPlan<TA> src_;
// Forward declaration of the binary-expression overload (presumably so the
// overloads defined before its body can already refer to it).
template<typename OP, typename TA, typename TB, int etype>
inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> >
MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e );
// Builds the plan for a scalar expression from its stored value e.scalar_.
// NOTE(review): the closing brace of this function is absent in this copy of the file.
inline SSEPlan<ScalarExp> MakeSSEPlan( const ScalarExp &e ){
return SSEPlan<ScalarExp>( e.scalar_ );
// Builds the plan for a container expression from the object returned by e.self().
// NOTE(review): the closing brace of this function is absent in this copy of the file.
template<typename T>
inline SSEPlan<T> MakeSSEPlan( const ContainerExp<T> &e ){
return SSEPlan<T>( e.self() );
// Builds the plan for a cpu MakeTensorExp from the object returned by e.real_self().
// NOTE(review): the closing brace of this function is absent in this copy of the file.
template<typename T,int dim>
inline SSEPlan<T> MakeSSEPlan( const MakeTensorExp<T,cpu,dim> &e ){
return SSEPlan<T>( e.real_self() );
// Builds the plan for a unary expression: first builds the plan of the
// operand e.src_, then wraps it in the unary plan.
// NOTE(review): the closing brace of this function is absent in this copy of the file.
template<typename OP, typename TA, int etype>
inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan( const UnaryMapExp<OP,TA,etype> &e ){
return SSEPlan< UnaryMapExp<OP,TA,etype> >( MakeSSEPlan(e.src_) );
// Builds the plan for a binary expression: builds the plans of e.lhs_ and
// e.rhs_, then wraps both in the binary plan.
// NOTE(review): the closing brace of this function is absent in this copy of the file.
template<typename OP, typename TA, typename TB, int etype>
inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
return SSEPlan< BinaryMapExp<OP,TA,TB,etype> >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) );
namespace expr{
/*!
 * \brief static check sse enable
 *        if an expression E cannot be evaluated using sse, then kPass = false
 * \tparam Device the type of Device
 * \tparam dim dimension of the tensor
 * \tparam E expression
 */
// Primary template: any expression type without a specialization fails the
// compile-time SSE check.
// NOTE(review): the closing `};` of this struct is absent in this copy of the file.
template<typename E>
struct SSECheck{
const static bool kPass = false;
// Scalar expressions always pass the compile-time SSE check.
// NOTE(review): the `template<>` prefix and the closing `};` are absent in
// this copy of the file.
struct SSECheck<ScalarExp>{
const static bool kPass = true;
// cpu tensors of any dimension pass the compile-time SSE check; tensors on
// other devices fall back to the primary template (kPass = false).
// NOTE(review): the closing `};` of this struct is absent in this copy of the file.
template<int dim>
struct SSECheck<Tensor<cpu,dim> >{
const static bool kPass = true;
// A unary expression passes when its operand passes and OP has an enabled
// SSEOp mapping.
// NOTE(review): the closing `};` of this struct is absent in this copy of the file.
template<typename OP, typename TA, int etype>
struct SSECheck<UnaryMapExp<OP,TA,etype> >{
const static bool kPass = SSECheck<TA>::kPass && sse2::SSEOp<OP>::kEnabled;
// A binary expression passes when both operands pass and OP has an enabled
// SSEOp mapping.
// NOTE(review): the closing `};` of this struct is absent in this copy of the file.
template<typename OP, typename TA, typename TB, int etype>
struct SSECheck< BinaryMapExp<OP,TA,TB,etype> >{
const static bool kPass = SSECheck<TA>::kPass && SSECheck<TB>::kPass && sse2::SSEOp<OP>::kEnabled;
}; // namespace expr
namespace expr{
// check if data is aligned and allow sse operation
// Primary template of the run-time alignment check: an expression type
// without a specialization is reported as not aligned.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<int dim,typename E>
struct SSEAlignCheck{
inline static bool Check( const E &exp ){
return false;
// Scalars carry no memory to be aligned, so the check always succeeds.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<int dim>
struct SSEAlignCheck< dim, ScalarExp >{
inline static bool Check( const ScalarExp &exp ){
return true;
// A cpu tensor is usable when both its data pointer and its row stride
// measured in bytes (stride_ * sizeof(real_t)) are multiples of 16, as
// tested by sse2::CheckAlign.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<int dim>
struct SSEAlignCheck< dim,Tensor<cpu,dim> >{
inline static bool Check( const Tensor<cpu,dim> &t ){
return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) );
// A unary expression is aligned exactly when its operand t.src_ is.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<int dim, typename OP, typename TA, int etype>
struct SSEAlignCheck< dim, UnaryMapExp<OP,TA,etype> >{
inline static bool Check( const UnaryMapExp<OP,TA,etype> &t ){
return SSEAlignCheck<dim,TA>::Check( t.src_);
// A binary expression is aligned when both operands t.lhs_ and t.rhs_ are;
// the right operand is only checked if the left one passed.
// NOTE(review): the closing braces of this block are absent in this copy of the file.
template<int dim, typename OP, typename TA, typename TB, int etype>
struct SSEAlignCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
inline static bool Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
return SSEAlignCheck<dim,TA>::Check( t.lhs_ ) &&
SSEAlignCheck<dim,TB>::Check( t.rhs_ );
}; // namespace expr
/*! \brief use SSEPlan to compute result */
// Evaluates plan into _dst row by row: the part of each row that fills whole
// 16-byte vectors goes through the SSE path, the remaining tail elements
// through the scalar path.
// SV is the saver type (it supplies the scalar Save and, via sse2::Saver,
// the vector one); E is the expression type of the plan.
// NOTE(review): the closing braces of both inner loops, the outer loop and
// the function are absent in this copy of the file.
template<typename SV, typename E, int dim>
inline void MapSSEPlan(Tensor<cpu,dim> _dst, const expr::SSEPlan<E> &plan){
// view the destination as a 2D tensor
Tensor<cpu,2> dst = _dst.FlatTo2D();
// number of leading elements per row that form complete 16-byte vectors
const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) );
for ( index_t y = 0; y < dst.shape[1]; y ++ ) {
// vector part: one FVec<real_t> (kSize elements) per iteration
for( index_t x = 0; x < xlen; x += sse2::FVec<real_t>::kSize ){
sse2::Saver<SV,real_t>::Save( &dst[y][x], plan.EvalSSE( y,x ) );
// scalar tail: elements past the last complete vector
for( index_t x = xlen; x < dst.shape[0]; x ++ ){
SV::Save( dst[y][x], plan.Eval(y,x) );
}; // namespace mshadow