blob: 04fd098f26bfa4fddfc2234ac06075417eebfe44 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "utils.cuh"
#include "agg_ops.cuh"
#include "reduction.cuh"
using uint = unsigned int;
#include <cuda_runtime.h>
/**
 * Sum reduction over every element of an array/matrix.
 *
 * Delegates to FULL_AGG (reduction.cuh) with SumOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input data stored in device memory (of size n)
 * @param g_odata output/temporary array stored in device memory (of size n)
 * @param n       size of the input and temporary/output arrays
 */
template<typename T>
__device__ void reduce_sum(T *g_idata, T *g_odata, uint n) {
    using Aggregator = SumOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator sum_op;
    CellOp identity_op;
    // NOTE(review): the literal 1 is FULL_AGG's 4th argument; its meaning is defined in reduction.cuh.
    FULL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, n, 1, static_cast<T>(0.0), sum_op, identity_op);
}
/**
 * Double-precision kernel entry point for the full sum reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_sum<double>.
 */
extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, uint n) {
    reduce_sum<double>(g_idata, g_odata, n);
}
/**
 * Single-precision kernel entry point for the full sum reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_sum<float>.
 */
extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) {
    reduce_sum<float>(g_idata, g_odata, n);
}
/**
 * Row-wise sum of a matrix: one result per row.
 *
 * Delegates to ROW_AGG (reduction.cuh) with SumOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size rows)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
    SumOp<T> agg_op;
    IdentityOp<T> spoof_op;
    // The initial value is cast to T, as in every sibling reducer, so the
    // float instantiation does not receive a double literal.
    ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T) 0.0, agg_op, spoof_op);
}
/**
 * Double-precision kernel entry point for the row-wise sum.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_sum<double>.
 */
extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_row_sum<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the row-wise sum.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_sum<float>.
 */
extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_row_sum<float>(g_idata, g_odata, rows, cols);
}
/**
 * Column-wise sum of a matrix: one result per column.
 *
 * Delegates to COL_AGG (reduction.cuh) with SumOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size cols)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) {
    using Aggregator = SumOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator sum_op;
    CellOp identity_op;
    COL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, static_cast<T>(0.0), sum_op, identity_op);
}
/**
 * Double-precision kernel entry point for the column-wise sum.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_sum<double>.
 */
extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_col_sum<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the column-wise sum.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_sum<float>.
 */
extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_col_sum<float>(g_idata, g_odata, rows, cols);
}
/**
 * Maximum over every element of an array/matrix.
 *
 * Delegates to FULL_AGG (reduction.cuh) with MaxOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and -MAX<T>() as the
 * starting value.
 *
 * @param g_idata input data stored in device memory (of size n)
 * @param g_odata output/temporary array stored in device memory (of size n)
 * @param n       size of the input and temporary/output arrays
 */
template<typename T>
__device__ void reduce_max(T *g_idata, T *g_odata, uint n) {
    using Aggregator = MaxOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator max_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    // NOTE(review): the literal 1 is FULL_AGG's 4th argument; its meaning is defined in reduction.cuh.
    FULL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, n, 1, -MAX<T>(), max_op, identity_op);
}
/**
 * Double-precision kernel entry point for the full max reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_max<double>.
 */
extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, uint n) {
    reduce_max<double>(g_idata, g_odata, n);
}
/**
 * Single-precision kernel entry point for the full max reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_max<float>.
 */
extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) {
    reduce_max<float>(g_idata, g_odata, n);
}
/**
 * Row-wise maximum of a matrix: one result per row.
 *
 * Delegates to ROW_AGG (reduction.cuh) with MaxOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and -MAX<T>() as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size rows)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) {
    using Aggregator = MaxOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator max_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    ROW_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, -MAX<T>(), max_op, identity_op);
}
/**
 * Double-precision kernel entry point for the row-wise max.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_max<double>.
 */
extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_row_max<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the row-wise max.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_max<float>.
 */
extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_row_max<float>(g_idata, g_odata, rows, cols);
}
/**
 * Column-wise maximum of a matrix: one result per column.
 *
 * Delegates to COL_AGG (reduction.cuh) with MaxOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and -MAX<T>() as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size cols)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) {
    using Aggregator = MaxOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator max_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    COL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, -MAX<T>(), max_op, identity_op);
}
/**
 * Double-precision kernel entry point for the column-wise max.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_max<double>.
 */
extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_col_max<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the column-wise max.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_max<float>.
 */
extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_col_max<float>(g_idata, g_odata, rows, cols);
}
/**
 * Minimum over every element of an array/matrix.
 *
 * Delegates to FULL_AGG (reduction.cuh) with MinOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and MAX<T>() as the
 * starting value.
 *
 * @param g_idata input data stored in device memory (of size n)
 * @param g_odata output/temporary array stored in device memory (of size n)
 * @param n       size of the input and temporary/output arrays
 */
template<typename T>
__device__ void reduce_min(T *g_idata, T *g_odata, uint n) {
    using Aggregator = MinOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator min_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    // NOTE(review): the literal 1 is FULL_AGG's 4th argument; its meaning is defined in reduction.cuh.
    FULL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, n, 1, MAX<T>(), min_op, identity_op);
}
/**
 * Double-precision kernel entry point for the full min reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_min<double>.
 */
extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, uint n) {
    reduce_min<double>(g_idata, g_odata, n);
}
/**
 * Single-precision kernel entry point for the full min reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_min<float>.
 */
extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) {
    reduce_min<float>(g_idata, g_odata, n);
}
/**
 * Row-wise minimum of a matrix: one result per row.
 *
 * Delegates to ROW_AGG (reduction.cuh) with MinOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and MAX<T>() as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size rows)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) {
    using Aggregator = MinOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator min_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    ROW_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, MAX<T>(), min_op, identity_op);
}
/**
 * Double-precision kernel entry point for the row-wise min.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_min<double>.
 */
extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_row_min<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the row-wise min.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_min<float>.
 */
extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_row_min<float>(g_idata, g_odata, rows, cols);
}
/**
 * Column-wise minimum of a matrix: one result per column.
 *
 * Delegates to COL_AGG (reduction.cuh) with MinOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and MAX<T>() as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory (of size rows * cols)
 * @param g_odata output vector stored in device memory (of size cols)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) {
    using Aggregator = MinOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator min_op;
    CellOp identity_op;
    // NOTE(review): MAX<T>() comes from a project header; presumably the largest value of T - confirm.
    COL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, MAX<T>(), min_op, identity_op);
}
/**
 * Double-precision kernel entry point for the column-wise min.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_min<double>.
 */
extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) {
    reduce_col_min<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the column-wise min.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_min<float>.
 */
extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) {
    reduce_col_min<float>(g_idata, g_odata, rows, cols);
}
/**
 * Sum of squares over every element of an array/matrix.
 *
 * Delegates to FULL_AGG (reduction.cuh) with SumSqOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input data stored in device memory (of size n)
 * @param g_odata output/temporary array stored in device memory (of size n)
 * @param n       size of the input and temporary/output arrays
 */
template<typename T>
__device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) {
    using Aggregator = SumSqOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator sum_sq_op;
    CellOp identity_op;
    // NOTE(review): the literal 1 is FULL_AGG's 4th argument; its meaning is defined in reduction.cuh.
    FULL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, n, 1, static_cast<T>(0.0), sum_sq_op, identity_op);
}
/**
 * Double-precision kernel entry point for the full sum-of-squares reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_sum_sq<double>.
 */
extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) {
    reduce_sum_sq<double>(g_idata, g_odata, n);
}
/**
 * Single-precision kernel entry point for the full sum-of-squares reduction.
 * Exported with C linkage; forwards its arguments unchanged to reduce_sum_sq<float>.
 */
extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) {
    reduce_sum_sq<float>(g_idata, g_odata, n);
}
/**
 * Column-wise sum of squares of a matrix.
 *
 * Delegates to COL_AGG (reduction.cuh) with SumSqOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory
 *                (presumably of size rows * cols, as for the other column reducers - confirm)
 * @param g_odata output vector stored in device memory
 *                (presumably of size cols, as for the other column reducers - confirm)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
    using Aggregator = SumSqOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator sum_sq_op;
    CellOp identity_op;
    COL_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, static_cast<T>(0.0), sum_sq_op, identity_op);
}
/**
 * Double-precision kernel entry point for the column-wise sum of squares.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_sum_sq<double>.
 */
extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
    reduce_col_sum_sq<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the column-wise sum of squares.
 * Exported with C linkage; forwards its arguments unchanged to reduce_col_sum_sq<float>.
 */
extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
    reduce_col_sum_sq<float>(g_idata, g_odata, rows, cols);
}
/**
 * Row-wise sum of squares of a matrix.
 *
 * Delegates to ROW_AGG (reduction.cuh) with SumSqOp as the aggregation
 * functor, IdentityOp as the trailing functor argument and 0 as the
 * starting value.
 *
 * @param g_idata input matrix stored in device memory
 *                (presumably of size rows * cols, as for the other row reducers - confirm)
 * @param g_odata output vector stored in device memory
 *                (presumably of size rows, as for the other row reducers - confirm)
 * @param rows    number of rows in input matrix
 * @param cols    number of columns in input matrix
 */
template<typename T>
__device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) {
    using Aggregator = SumSqOp<T>;
    using CellOp = IdentityOp<T>;

    Aggregator sum_sq_op;
    CellOp identity_op;
    ROW_AGG<T, Aggregator, CellOp>(g_idata, g_odata, rows, cols, static_cast<T>(0.0), sum_sq_op, identity_op);
}
/**
 * Double-precision kernel entry point for the row-wise sum of squares.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_sum_sq<double>.
 */
extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) {
    reduce_row_sum_sq<double>(g_idata, g_odata, rows, cols);
}
/**
 * Single-precision kernel entry point for the row-wise sum of squares.
 * Exported with C linkage; forwards its arguments unchanged to reduce_row_sum_sq<float>.
 */
extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) {
    reduce_row_sum_sq<float>(g_idata, g_odata, rows, cols);
}