blob: be228cde559bee1bb3f2e5e2dd82cd3c1f0b9bbf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#ifndef CUM_SUM_PROD_H
#define CUM_SUM_PROD_H
using uint = unsigned int;
#include <cuda_runtime.h>
/**
* Cumulative Sum-Product
* Applies column-wise x_n = x_n + y_n * sum with sum = x_n-1 on a two column matrix in a down-up-sweep cascade of
* thread blocks.
* ToDo: Put phases of sum-prod into separate kernels
*/
template<typename T, typename Vec2Accessor>
__device__ void cumulative_sum_prod(T *g_idata, ///< input data stored in device memory (of size n)
T *g_odata, ///< output/temporary array stored in device memory (of size n)
T *g_tiData, // data input to final stage from intermediate stage
T *g_toData, // temporary data produced by cascading thread blocks
uint rows, ///< rows in input and temporary/output arrays
uint block_height, // number of rows in a block
uint phase, // up and down sweep are implemented in this one kernel for now. <phase> indicates what to do.
// phase 0: initial down-sweep; 1: subsequent down-sweeps of the cascade;
// 2: subsequent up-sweeps; 3: final up-sweep
bool verbose = false) // toggle printf output (for debugging purposes)
{
uint idx = blockIdx.x * block_height;
// return if thread of (last) block is out of bounds
// ToDo: should not happen - remove after checking
if (idx > rows - 1)
return;
uint n = min(idx + block_height, rows);
auto value = Vec2Accessor::init();
// if up-sweep, fetch initial value from intermediate block results
if(phase > 1)
{
int read_idx = (blockIdx.x - 1) * 2;
T a = 0.0;
if(read_idx >= 0)
a = g_tiData[read_idx];
value = Vec2Accessor::get(g_idata, idx * 2);
if(verbose)
printf("In: blockIdx.x: %d, 2*idx: %d, val_x: %f, val_y: %f, a: %f, phase=%d\n",
blockIdx.x, 2*idx, value.x, value.y, a, phase);
value.x = value.x + value.y * a;
n = min(idx + block_height, rows);
}
else // fetch initial value from input
{
value = Vec2Accessor::get(g_idata, 2 * idx);
if(verbose)
printf("Read: blockIdx.x: %d, 2 * idx: %d, val_x: %f, val_y: %f, n: %d, phase=%d\n",
blockIdx.x, 2 * idx, value.x, value.y, n, phase);
}
T sumprod = value.x;
T prod = value.y;
// write to one or two column output in up-sweep
if(phase == 2)
Vec2Accessor::put(g_odata, 2 * idx, sumprod, prod);
else if (phase == 3)
g_odata[idx] = value.x;
if(phase > 1 && verbose)
printf("Out[i=0]: blockIdx.x: %d, idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
blockIdx.x, idx, sumprod, prod, n, phase);
// loop over 2nd to n rows of thread block
for (int i = idx + 1; i < n; ++i)
{
value = Vec2Accessor::get(g_idata, 2 * i);
prod = prod * value.y;
sumprod = value.x + value.y * sumprod;
// write subsequent row results of that block to one or two column output in up-sweep
if(phase == 2)
Vec2Accessor::put(g_odata, 2 * i, sumprod, prod);
else if (phase == 3)
g_odata[i] = sumprod;
if(verbose)
printf("Loop[i=%d]: blockIdx.x: %d, read_i: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
i, blockIdx.x, i * 2, sumprod, prod, n, phase);
}
// down sweep (phase 0/1)
// write out accumulated block offset to intermediate buffer
if (g_toData != nullptr)
{
uint write_idx = blockIdx.x * 2;
if(blockIdx.x < gridDim.x - 1)
{
Vec2Accessor::put(g_toData, write_idx, sumprod, prod);
if(verbose)
printf("Carry: blockIdx.x: %d, write_idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
blockIdx.x, write_idx, sumprod, prod, n, phase);
}
}
}
/**
* Cumulative sum-product instantiation for double
*/
extern "C"
__global__ void cumulative_sum_prod_d(double *g_idata, double *g_odata, double *g_tiData, double *g_toData, uint rows,
uint block_height, uint offset)
{
cumulative_sum_prod<double, double2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
}
/**
* Cumulative sum-product instantiation for float
*/
extern "C" __global__ void cumulative_sum_prod_f(float *g_idata, float *g_odata, float *g_tiData, float *g_toData,
uint rows, uint block_height, uint offset)
{
cumulative_sum_prod<float, float2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
}
#endif // CUM_SUM_PROD_H