| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include "utils.cuh" |
| #include "agg_ops.cuh" |
| #include "reduction.cuh" |
| |
| using uint = unsigned int; |
| #include <cuda_runtime.h> |
| |
| /** |
| * Do a summation over all elements of an array/matrix |
| * @param g_idata input data stored in device memory (of size n) |
| * @param g_odata output/temporary array stored in device memory (of size n) |
| * @param n size of the input and temporary/output arrays |
| */ |
| template<typename T> |
| __device__ void reduce_sum(T *g_idata, T *g_odata, uint n) { |
| SumOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| FULL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, uint n) { |
| reduce_sum(g_idata, g_odata, n); |
| } |
| |
| extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) { |
| reduce_sum(g_idata, g_odata, n); |
| } |
| |
| /** |
| * Do a summation over all rows of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size rows) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) { |
| SumOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, 0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_row_sum(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_row_sum(g_idata, g_odata, rows, cols); |
| } |
| |
| /** |
| * Do a summation over all columns of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size cols) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) { |
| SumOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| COL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_col_sum(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_col_sum(g_idata, g_odata, rows, cols); |
| } |
| |
| |
| /** |
| * Do a max over all elements of an array/matrix |
| * @param g_idata input data stored in device memory (of size n) |
| * @param g_odata output/temporary array stode in device memory (of size n) |
| * @param n size of the input and temporary/output arrays |
| */ |
| template<typename T> |
| __device__ void reduce_max(T *g_idata, T *g_odata, uint n) { |
| MaxOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| FULL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, -MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, uint n) { |
| reduce_max(g_idata, g_odata, n); |
| } |
| |
| extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) { |
| reduce_max(g_idata, g_odata, n); |
| } |
| |
| /** |
| * Do a max over all rows of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size rows) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) { |
| MaxOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| ROW_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_row_max(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_row_max(g_idata, g_odata, rows, cols); |
| } |
| |
| /** |
| * Do a max over all columns of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size cols) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) { |
| MaxOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| COL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_col_max(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_col_max(g_idata, g_odata, rows, cols); |
| } |
| |
| |
| /** |
| * Do a min over all elements of an array/matrix |
| * @param g_idata input data stored in device memory (of size n) |
| * @param g_odata output/temporary array stode in device memory (of size n) |
| * @param n size of the input and temporary/output arrays |
| */ |
| template<typename T> |
| __device__ void reduce_min(T *g_idata, T *g_odata, uint n) { |
| MinOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| FULL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, uint n) { |
| reduce_min(g_idata, g_odata, n); |
| } |
| |
| extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) { |
| reduce_min(g_idata, g_odata, n); |
| } |
| |
| |
| /** |
| * Do a min over all rows of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size rows) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) { |
| MinOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| ROW_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_row_min(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_row_min(g_idata, g_odata, rows, cols); |
| } |
| |
| /** |
| * Do a min over all columns of a matrix |
| * @param g_idata input matrix stored in device memory (of size rows * cols) |
| * @param g_odata output vector stored in device memory (of size cols) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) { |
| MinOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| COL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { |
| reduce_col_min(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { |
| reduce_col_min(g_idata, g_odata, rows, cols); |
| } |
| |
| |
| /** |
| * Do a summation over all squared elements of an array/matrix |
| * @param g_idata input data stored in device memory (of size n) |
| * @param g_odata output/temporary array stored in device memory (of size n) |
| * @param n size of the input and temporary/output arrays |
| */ |
| template<typename T> |
| __device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) { |
| SumSqOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| FULL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) { |
| reduce_sum_sq(g_idata, g_odata, n); |
| } |
| |
| extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) { |
| reduce_sum_sq(g_idata, g_odata, n); |
| } |
| |
| /** |
| * Do a summation over all squared elements of an array/matrix |
| * @param g_idata input data stored in device memory (of size n) |
| * @param g_odata output/temporary array stored in device memory (of size n) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| template<typename T> |
| __device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { |
| SumSqOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| COL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { |
| reduce_col_sum_sq(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { |
| reduce_col_sum_sq(g_idata, g_odata, rows, cols); |
| } |
| |
| template<typename T> |
| __device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { |
| SumSqOp<T> agg_op; |
| IdentityOp<T> spoof_op; |
| ROW_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); |
| } |
| |
| extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { |
| reduce_row_sum_sq(g_idata, g_odata, rows, cols); |
| } |
| |
| extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { |
| reduce_row_sum_sq(g_idata, g_odata, rows, cols); |
| } |