#ifndef MSHADOW_CUDA_REDUCE_CUH
#define MSHADOW_CUDA_REDUCE_CUH
/*!
* \file cuda_reduce.cuh
* \brief helper functions to do reduction
* \author Tianqi Chen
*/
namespace mshadow{
namespace cuda{
/*!
 * \brief reduce over the x dimension, within a thread block
 * \tparam Reducer the reducer to apply
 * \tparam x_bits buffer size = 1<<x_bits
 * \param buf shared memory buffer of size 1<<x_bits; the result is left in buf[0]
 */
template<typename Reducer, int x_bits>
inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
/*!
 * \brief reduce over the x dimension when the valid size may not be a power of two
 * \tparam Reducer the reducer to apply
 * \tparam xmax_bits maximum buffer size = 1<<xmax_bits
 * \param buf shared memory buffer of size 1<<xmax_bits; the result is left in buf[0]
 * \param x_size number of valid entries in buf; need not be a power of two
 */
template<typename Reducer, int xmax_bits>
inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int x_size );
} // namespace cuda
} // namespace mshadow
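/*!
 * Usage sketch (illustrative only, not part of the library): a kernel that
 * sums each row of a matrix with Reduce1D. SumReducer and RowSumKernel are
 * hypothetical names introduced for this example.
 *
 * \code
 * struct SumReducer {
 *     __device__ static void Reduce( volatile real_t &dst, volatile real_t src ){
 *         dst += src;
 *     }
 * };
 * const int kRowBits = 8; // launch with blockDim.x == 1 << kRowBits
 * __global__ void RowSumKernel( const real_t *in, real_t *out, int ncol ){
 *     __shared__ real_t buf[ 1 << kRowBits ];
 *     // each thread accumulates a strided slice of its block's row
 *     real_t sum = 0.0f;
 *     for( int x = threadIdx.x; x < ncol; x += blockDim.x ){
 *         sum += in[ blockIdx.x * ncol + x ];
 *     }
 *     buf[ threadIdx.x ] = sum;
 *     __syncthreads();
 *     mshadow::cuda::Reduce1D<SumReducer, kRowBits>( buf );
 *     if( threadIdx.x == 0 ) out[ blockIdx.x ] = buf[0];
 * }
 * \endcode
 */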
// ==================================================
// implementation details below;
// no need to read further if you only use the functions above
// --------------------------------------------------
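// __MSHADOW_EMUSYNC__ is a barrier needed only in device-emulation builds:
// emulated threads do not execute in warp lockstep, so the intra-warp
// reduction steps below still require explicit synchronization there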
#ifdef __DEVICE_EMULATION__
#define __MSHADOW_EMUSYNC__ __syncthreads()
#else
#define __MSHADOW_EMUSYNC__
#endif
namespace mshadow{
namespace cuda{
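// tree-reduce the first 1<<x_bits entries of buf into buf[0];
// x_bits is a compile-time constant, so the untaken branches below are
// eliminated by the compiler. Once only one warp of threads remains active,
// __syncthreads() is replaced by the cheaper __MSHADOW_EMUSYNC__ (see above).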
template<typename Reducer, int x_bits>
inline __device__ void ReduceX( volatile real_t buf[], int tid ){
if( x_bits >= 10 ){
if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] );
__syncthreads();
}
if( x_bits >= 9 ){
if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] );
__syncthreads();
}
if( x_bits >= 8 ){
if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] );
__syncthreads();
}
if( x_bits >= 7 ){
if( tid < 64 ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] );
__syncthreads();
}
if( x_bits >= 6 ){
if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] );
__syncthreads();
}
// intra-warp steps: within a single warp the device runs in lockstep,
// so a full barrier is only needed in emulation builds
if( x_bits >= 5 ){
if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] );
__MSHADOW_EMUSYNC__;
}
if( x_bits >= 4 ){
if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] );
__MSHADOW_EMUSYNC__;
}
if( x_bits >= 3 ){
if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] );
__MSHADOW_EMUSYNC__;
}
if( x_bits >= 2 ){
if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] );
__MSHADOW_EMUSYNC__;
}
if( x_bits >= 1 ){
if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] );
__MSHADOW_EMUSYNC__;
}
}
template<typename Reducer, int x_bits>
inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
ReduceX<Reducer,x_bits>( buf, threadIdx.x );
}
// reduce with an upper bound on the buffer size.
// Each expansion of the macro below is one branch of an if/else chain: it
// folds the unaligned tail [1<<x_bits, x_size) onto the aligned prefix and
// then runs the power-of-two reduction ReduceX on the first 1<<x_bits
// entries. The chain starts at 1<<8, so x_size above 512 is not handled.
#define __RD_NON_ALIGN(els,x_bits) \
els \
if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){ \
if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
} \
__syncthreads(); \
ReduceX<Reducer, x_bits>( buf, tid ); \
}
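// for example, __RD_NON_ALIGN(else, 7) expands to:
//   else if( xmax_bits >= 7 && x_size >= 128 ){
//       if( tid < 128 && tid + 128 < x_size ){
//           Reducer::Reduce( buf[tid] , buf[tid + 128] );
//       }
//       __syncthreads();
//       ReduceX<Reducer, 7>( buf, tid );
//   }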
template<typename Reducer, int xmax_bits>
inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){
int tid = threadIdx.x;
__RD_NON_ALIGN(, 8)
__RD_NON_ALIGN(else, 7)
__RD_NON_ALIGN(else, 6)
__RD_NON_ALIGN(else, 5)
__RD_NON_ALIGN(else, 4)
__RD_NON_ALIGN(else, 3)
__RD_NON_ALIGN(else, 2)
__RD_NON_ALIGN(else, 1)
}
#undef __RD_NON_ALIGN
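/*!
 * Usage sketch (illustrative only): reducing x_size valid entries where
 * x_size may not be a power of two. The shared buffer is still declared at
 * the full 1<<xmax_bits size; only the first x_size entries are read.
 * SumReducer is the hypothetical reducer from the sketch above, and
 * my_partial_value stands in for whatever each thread contributes.
 *
 * \code
 * __shared__ real_t buf[ 1 << 8 ];
 * if( threadIdx.x < x_size ) buf[ threadIdx.x ] = my_partial_value;
 * __syncthreads();
 * Reduce1DNotAlign<SumReducer, 8>( buf, x_size ); // result is left in buf[0]
 * \endcode
 */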
} // namespace cuda
} // namespace mshadow
#endif // MSHADOW_CUDA_REDUCE_CUH