src/main/cpp/kernels/cum_sum_prod.cuh - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #ifndef __CUM_SUM_PROD_H
 #define __CUM_SUM_PROD_H

 #pragma once

 #include <cuda_runtime.h>

 /**
  * Cumulative Sum-Product
  * Applies column-wise x_n = x_n + y_n * sum with sum = x_n-1 on a two column matrix in a down-up-sweep cascade of
  * thread blocks.
  * ToDo: Put phases of sum-prod into separate kernels
  */
 template<typename T, typename Vec2Accessor>
 __device__ void cumulative_sum_prod(T *g_idata, ///< input data stored in device memory (of size n)
 		T *g_odata, ///< output/temporary array stored in device memory (of size n)
 		T *g_tiData,  // data input to final stage from intermediate stage
 		T *g_toData,  // temporary data produced by cascading thread blocks
 		uint rows,  ///< rows in input and temporary/output arrays
 		uint block_height, // number of rows in a block
 		uint phase, // up and down sweep are implemented in this one kernel for now. <phase> indicates what to do.
 		// phase 0: initial down-sweep; 1: subsequent down-sweeps of the cascade;
 		//       2: subsequent up-sweeps; 3: final up-sweep
 		bool verbose = false) // toggle printf output (for debugging purposes)
 {
 	uint idx = blockIdx.x * block_height;

 	// return if thread of (last) block is out of bounds
 	// ToDo: should not happen - remove after checking
 	if (idx > rows - 1)
 		return;

 	uint n = min(idx + block_height, rows);
 	auto value = Vec2Accessor::init();

 	// if up-sweep, fetch initial value from intermediate block results
 	if(phase > 1)
 	{
 		int read_idx = (blockIdx.x - 1) * 2;
 		T a = 0.0;
 		if(read_idx >= 0)
 			a = g_tiData[read_idx];

 		value = Vec2Accessor::get(g_idata, idx * 2);

 		if(verbose)
 			printf("In: blockIdx.x: %d, 2*idx: %d, val_x: %f, val_y: %f, a: %f, phase=%d\n",
 				blockIdx.x, 2*idx, value.x, value.y, a, phase);

 		value.x = value.x + value.y * a;
 		n = min(idx + block_height, rows);
 	}
 	else // fetch initial value from input
 	{
 		value = Vec2Accessor::get(g_idata, 2 * idx);

 		if(verbose)
 			printf("Read: blockIdx.x: %d, 2 * idx: %d, val_x: %f, val_y: %f, n: %d, phase=%d\n",
 				blockIdx.x, 2 * idx, value.x, value.y, n, phase);
 	}

 	T sumprod = value.x;
 	T prod = value.y;

 	// write to one or two column output in up-sweep
 	if(phase == 2)
 		Vec2Accessor::put(g_odata, 2 * idx, sumprod, prod);
 	else if (phase == 3)
 		g_odata[idx] = value.x;

 	if(phase > 1 && verbose)
 		printf("Out[i=0]: blockIdx.x: %d, idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
 			blockIdx.x, idx, sumprod, prod, n, phase);

 	// loop over 2nd to n rows of thread block
 	for (int i = idx + 1; i < n; ++i)
 	{
 		value = Vec2Accessor::get(g_idata, 2 * i);

 		prod = prod * value.y;
 		sumprod = value.x + value.y * sumprod;

 		// write subsequent row results of that block to one or two column output in up-sweep
 		if(phase == 2)
 			Vec2Accessor::put(g_odata, 2 * i, sumprod, prod);
 		else if (phase == 3)
 			g_odata[i] = sumprod;

 		if(verbose)
 			printf("Loop[i=%d]: blockIdx.x: %d, read_i: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
 				i, blockIdx.x, i * 2, sumprod, prod, n, phase);
 	}

 	// down sweep (phase 0/1)
 	// write out accumulated block offset to intermediate buffer
 	if (g_toData != nullptr)
 	{
 		uint write_idx = blockIdx.x * 2;
 		if(blockIdx.x < gridDim.x - 1)
 		{
 			Vec2Accessor::put(g_toData, write_idx, sumprod, prod);

 			if(verbose)
 				printf("Carry: blockIdx.x: %d, write_idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
 					blockIdx.x, write_idx, sumprod, prod, n, phase);
 		}
 	}
 }

 /**
  * Cumulative sum-product instantiation for double
  */
 extern "C"
 __global__ void cumulative_sum_prod_d(double *g_idata, double *g_odata, double *g_tiData, double *g_toData, uint rows,
     uint block_height, uint offset)
 {
 	cumulative_sum_prod<double, double2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
 }

 /**
  * Cumulative sum-product instantiation for float
  */
 extern "C" __global__ void cumulative_sum_prod_f(float *g_idata, float *g_odata, float *g_tiData, float *g_toData,
     uint rows, uint block_height, uint offset)
 {
 	cumulative_sum_prod<float, float2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
 }

 #endif // __CUM_SUM_PROD_H
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#ifndef __CUM_SUM_PROD_H
	#define __CUM_SUM_PROD_H

	#pragma once

	#include <cuda_runtime.h>

	/**
	* Cumulative Sum-Product
	* Applies column-wise x_n = x_n + y_n * sum with sum = x_n-1 on a two column matrix in a down-up-sweep cascade of
	* thread blocks.
	* ToDo: Put phases of sum-prod into separate kernels
	*/
	template<typename T, typename Vec2Accessor>
	__device__ void cumulative_sum_prod(T *g_idata, ///< input data stored in device memory (of size n)
	T *g_odata, ///< output/temporary array stored in device memory (of size n)
	T *g_tiData, // data input to final stage from intermediate stage
	T *g_toData, // temporary data produced by cascading thread blocks
	uint rows, ///< rows in input and temporary/output arrays
	uint block_height, // number of rows in a block
	uint phase, // up and down sweep are implemented in this one kernel for now. <phase> indicates what to do.
	// phase 0: initial down-sweep; 1: subsequent down-sweeps of the cascade;
	// 2: subsequent up-sweeps; 3: final up-sweep
	bool verbose = false) // toggle printf output (for debugging purposes)
	{
	uint idx = blockIdx.x * block_height;

	// return if thread of (last) block is out of bounds
	// ToDo: should not happen - remove after checking
	if (idx > rows - 1)
	return;

	uint n = min(idx + block_height, rows);
	auto value = Vec2Accessor::init();

	// if up-sweep, fetch initial value from intermediate block results
	if(phase > 1)
	{
	int read_idx = (blockIdx.x - 1) * 2;
	T a = 0.0;
	if(read_idx >= 0)
	a = g_tiData[read_idx];

	value = Vec2Accessor::get(g_idata, idx * 2);

	if(verbose)
	printf("In: blockIdx.x: %d, 2*idx: %d, val_x: %f, val_y: %f, a: %f, phase=%d\n",
	blockIdx.x, 2*idx, value.x, value.y, a, phase);

	value.x = value.x + value.y * a;
	n = min(idx + block_height, rows);
	}
	else // fetch initial value from input
	{
	value = Vec2Accessor::get(g_idata, 2 * idx);

	if(verbose)
	printf("Read: blockIdx.x: %d, 2 * idx: %d, val_x: %f, val_y: %f, n: %d, phase=%d\n",
	blockIdx.x, 2 * idx, value.x, value.y, n, phase);
	}

	T sumprod = value.x;
	T prod = value.y;

	// write to one or two column output in up-sweep
	if(phase == 2)
	Vec2Accessor::put(g_odata, 2 * idx, sumprod, prod);
	else if (phase == 3)
	g_odata[idx] = value.x;

	if(phase > 1 && verbose)
	printf("Out[i=0]: blockIdx.x: %d, idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
	blockIdx.x, idx, sumprod, prod, n, phase);

	// loop over 2nd to n rows of thread block
	for (int i = idx + 1; i < n; ++i)
	{
	value = Vec2Accessor::get(g_idata, 2 * i);

	prod = prod * value.y;
	sumprod = value.x + value.y * sumprod;

	// write subsequent row results of that block to one or two column output in up-sweep
	if(phase == 2)
	Vec2Accessor::put(g_odata, 2 * i, sumprod, prod);
	else if (phase == 3)
	g_odata[i] = sumprod;

	if(verbose)
	printf("Loop[i=%d]: blockIdx.x: %d, read_i: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
	i, blockIdx.x, i * 2, sumprod, prod, n, phase);
	}

	// down sweep (phase 0/1)
	// write out accumulated block offset to intermediate buffer
	if (g_toData != nullptr)
	{
	uint write_idx = blockIdx.x * 2;
	if(blockIdx.x < gridDim.x - 1)
	{
	Vec2Accessor::put(g_toData, write_idx, sumprod, prod);

	if(verbose)
	printf("Carry: blockIdx.x: %d, write_idx: %d, sumprod: %f, prod: %f, n: %d, phase=%d\n",
	blockIdx.x, write_idx, sumprod, prod, n, phase);
	}
	}
	}

	/**
	* Cumulative sum-product instantiation for double
	*/
	extern "C"
	__global__ void cumulative_sum_prod_d(double g_idata, double g_odata, double g_tiData, double g_toData, uint rows,
	uint block_height, uint offset)
	{
	cumulative_sum_prod<double, double2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
	}

	/**
	* Cumulative sum-product instantiation for float
	*/
	extern "C" __global__ void cumulative_sum_prod_f(float g_idata, float g_odata, float g_tiData, float g_toData,
	uint rows, uint block_height, uint offset)
	{
	cumulative_sum_prod<float, float2Accessor>(g_idata, g_odata, g_tiData, g_toData, rows, block_height, offset);
	}

	#endif // __CUM_SUM_PROD_H