blob: a005a3c7e9f09c0f719580b9d95980106fa32b23 [file] [log] [blame]
// This file is auto-generated by tool/opencl/clsrc_to_str, do not edit
// manually.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef USE_OPENCL
#include <string>
namespace singa {
namespace opencl {
const std::string im2col_str =
"// This file is modified from the file located at\n// "
"https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/"
"im2col.cl\n// and is covered under the BSD 2-Clause License, as indicated "
"in the LICENSE\n// file at the root of this repository.\n\n__kernel void "
"im2col(const int n, __global const float* data_im,\n "
"const int data_im_off,\n const int height, const int "
"width,\n const int kernel_h, const int kernel_w,\n "
" const int pad_h, const int pad_w,\n "
"const int stride_h, const int stride_w,\n const int "
"dilation_h, const int dilation_w,\n const int "
"height_col, const int width_col,\n __global float* "
"data_col, const int data_col_off) {\n\n for (int index = "
"get_global_id(0); index < n;\n index += get_global_size(0)) {\n "
"const int h_index = index / width_col;\n const int h_col = h_index % "
"height_col;\n const int w_col = index % width_col;\n const int c_im "
"= h_index / height_col;\n const int c_col = c_im * kernel_h * "
"kernel_w;\n const int h_offset = h_col * stride_h - pad_h;\n const "
"int w_offset = w_col * stride_w - pad_w;\n \n __global float* "
"data_col_ptr = data_col + data_col_off;\n data_col_ptr += (c_col * "
"height_col + h_col) * width_col + w_col;\n __global const float* "
"data_im_ptr = data_im + data_im_off;\n data_im_ptr += (c_im * height + "
"h_offset) * width + w_offset;\n \n for (int i = 0; i < kernel_h; "
"++i) {\n for (int j = 0; j < kernel_w; ++j) {\n int h_im = "
"h_offset + i * dilation_h;\n int w_im = w_offset + j * "
"dilation_w;\n *data_col_ptr =\n (h_im >= 0 && w_im >= 0 "
"&& h_im < height && w_im < width) ?\n data_im_ptr[i * "
"dilation_h * width + j * dilation_w] : 0;\n data_col_ptr += "
"height_col * width_col;\n }\n }\n }\n}\n\n__kernel void "
"col2im(const int n, __global const float* data_col,\n "
"const int data_col_off, const int channels,\n const "
"int height, const int width,\n const int kernel_h, "
"const int kernel_w,\n const int pad_h, const int "
"pad_w,\n const int stride_h, const int stride_w,\n "
" const int dilation_h, const int dilation_w,\n "
" const int height_col, const int width_col,\n "
"__global float* data_im, const int data_im_off) {\n\n for (int index = "
"get_global_id(0); index < n; index += get_global_size(0)) {\n float "
"val = 0;\n const int w_im = index % width + pad_w;\n const int h_im "
"= (index / width) % height + pad_h;\n const int c_im = index / (width "
"* height);\n int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;\n "
" int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;\n // compute "
"the start and end of the output\n const int w_col_start =\n "
"(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;\n "
" const int w_col_end = min(w_im / stride_w + 1, width_col);\n const "
"int h_col_start =\n (h_im < kernel_extent_h) ? 0 : (h_im - "
"kernel_extent_h) / stride_h + 1;\n const int h_col_end = min(h_im / "
"stride_h + 1, height_col);\n \n // TODO: use LCM of stride and "
"dilation to avoid unnecessary loops\n for (int h_col = h_col_start; "
"h_col < h_col_end; h_col += 1) {\n for (int w_col = w_col_start; "
"w_col < w_col_end; w_col += 1) {\n int h_k = (h_im - h_col * "
"stride_h);\n int w_k = (w_im - w_col * stride_w);\n if (h_k "
"% dilation_h == 0 && w_k % dilation_w == 0) {\n h_k /= "
"dilation_h;\n w_k /= dilation_w;\n int data_col_index = "
"(((c_im * kernel_h + h_k) * kernel_w + w_k) *\n "
" height_col + h_col) * width_col + w_col;\n val += "
"data_col[data_col_off + data_col_index];\n }\n }\n }\n "
"data_im[data_im_off + index] = val;\n }\n}\n";
const std::string pooling_str =
"// This file is modified from the file located at\n// "
"https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/"
"pooling.cl\n// and is covered under the BSD 2-Clause License, as "
"indicated in the LICENSE\n// file at the root of this "
"repository.\n\n__kernel void max_pool_forward(\n const int nthreads, "
"__global const float* bottom, const int channels, \n const int height, "
"const int width,\n const int pooled_h, const int pooled_w,\n const "
"int kernel_h, const int kernel_w,\n const int stride_h, const int "
"stride_w,\n const int pad_h, const int pad_w,\n __global float* "
"top, __global float* mask) {\n\n// printf(\"%d \", "
"get_global_size(0));\n for (int i = get_global_id(0); i < nthreads; i += "
"get_global_size(0)) {\n const int pw = i % pooled_w;\n const int ph "
"= (i / pooled_w) % pooled_h;\n const int c = (i / pooled_w / pooled_h) "
"% channels;\n const int n = i / pooled_w / pooled_h / channels;\n "
"\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w "
"- pad_w;\n const int hend = min(hstart + kernel_h, height);\n const "
"int wend = min(wstart + kernel_w, width);\n hstart = max(hstart, "
"(int)0);\n wstart = max(wstart, (int)0);\n \n float maxval = "
"-FLT_MAX;\n int maxidx = -1;\n __global const float* bottom_slice = "
"bottom + (n * channels + c) * height * width;\n for (int h = hstart; h "
"< hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n "
"const int index = h * width + w;\n if (bottom_slice[index] > "
"maxval) {\n maxidx = index;\n maxval = "
"bottom_slice[maxidx];\n }\n }\n }\n top[i] = maxval;\n "
" mask[i] = (float)maxidx;\n }\n}\n\n__kernel void ave_pool_forward(\n "
" const int nthreads, __global const float* const bottom, const int "
"channels, \n const int height, const int width,\n const int "
"pooled_h, const int pooled_w,\n const int kernel_h, const int "
"kernel_w,\n const int stride_h, const int stride_w, \n const int "
"pad_h, const int pad_w, __global float* top) {\n \n for (int i = "
"get_global_id(0); i < nthreads; i += get_global_size(0)) {\n const int "
"pw = i % pooled_w;\n const int ph = (i / pooled_w) % pooled_h;\n "
"const int c = (i / pooled_w / pooled_h) % channels;\n const int n = i "
"/ pooled_w / pooled_h / channels;\n int hstart = ph * stride_h - "
"pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = "
"min(hstart + kernel_h, height + pad_h);\n int wend = min(wstart + "
"kernel_w, width + pad_w);\n const int pool_size = (hend - hstart) * "
"(wend - wstart);\n hstart = max(hstart, (int)0);\n wstart = "
"max(wstart, (int)0);\n hend = min(hend, height);\n wend = min(wend, "
"width);\n float aveval = 0;\n __global const float* bottom_slice = "
"bottom + (n * channels + c) * height * width;\n for (int h = hstart; h "
"< hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n "
"aveval += bottom_slice[h * width + w];\n }\n }\n top[i] = "
"aveval / pool_size;\n }\n}\n\n__kernel void sto_pool_forward_train(\n "
"const int nthreads, __global const float* bottom,\n const int "
"channels, const int height, const int width,\n const int pooled_h, "
"const int pooled_w, const int kernel_h,\n const int kernel_w, const "
"int stride_h, const int stride_w,\n __global float* rand_idx, __global "
"float* top) {\n \n for (int i = get_global_id(0); i < nthreads; i += "
"get_global_size(0)) {\n const int pw = i % pooled_w;\n const int ph "
"= (i / pooled_w) % pooled_h;\n const int c = (i / pooled_w / pooled_h) "
"% channels;\n const int n = i / pooled_w / pooled_h / channels;\n "
"\n const int hstart = ph * stride_h;\n const int hend = min(hstart "
"+ kernel_h, height);\n const int wstart = pw * stride_w;\n const "
"int wend = min(wstart + kernel_w, width);\n float cumsum = 0.;\n "
"__global const float* bottom_slice = bottom + (n * channels + c) * height "
"* width;\n // First pass: get sum\n for (int h = hstart; h < hend; "
"++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += "
"bottom_slice[h * width + w];\n }\n }\n const float thres = "
"rand_idx[i] * cumsum;\n // Second pass: get value, and set i.\n "
"cumsum = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w "
"= wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + "
"w];\n if (cumsum >= thres) {\n rand_idx[i] = ((n * "
"channels + c) * height + h) * width + w;\n top[i] = "
"bottom_slice[h * width + w];\n h = hend;\n w = wend;\n "
" }\n }\n }\n }\n}\n\n__kernel void sto_pool_forward_test(\n "
" const int nthreads, __global const float* const bottom, const int "
"channels, \n const int height, const int width,\n const int "
"pooled_h, const int pooled_w, \n const int kernel_h, const int "
"kernel_w, \n const int stride_h, const int stride_w,\n __global "
"float* top) {\n \n for (int i = get_global_id(0); i < nthreads; i += "
"get_global_size(0)) {\n const int pw = i % pooled_w;\n const int ph "
"= (i / pooled_w) % pooled_h;\n const int c = (i / pooled_w / pooled_h) "
"% channels;\n const int n = i / pooled_w / pooled_h / channels;\n "
"\n const int hstart = ph * stride_h;\n const int hend = min(hstart "
"+ kernel_h, height);\n const int wstart = pw * stride_w;\n const "
"int wend = min(wstart + kernel_w, width);\n // We set cumsum to be 0 "
"to avoid divide-by-zero problems\n float cumsum = FLT_MIN;\n float "
"cumvalues = 0.;\n __global const float* bottom_slice = bottom + (n * "
"channels + c) * height * width;\n // First pass: get sum\n for (int "
"h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) "
"{\n cumsum += bottom_slice[h * width + w];\n cumvalues += "
"bottom_slice[h * width + w] * bottom_slice[h * width + w];\n }\n "
"}\n top[i] = cumvalues / cumsum;\n }\n}\n\n__kernel void "
"max_pool_backward(const int nthreads,\n "
"__global const float* top_diff,\n __global "
"const float* mask,\n const int channels,\n "
" const int height, const int width,\n "
" const int pooled_h, const int pooled_w,\n "
" const int kernel_h, const int kernel_w,\n "
" const int stride_h, const int stride_w,\n "
" const int pad_h, const int pad_w,\n "
" __global float* bottom_diff) {\n for (int i = "
"get_global_id(0); i < nthreads; i += get_global_size(0)) {\n // find "
"out the local i\n // find out the local offset\n const int w = i % "
"width;\n const int h = (i / width) % height;\n const int c = (i / "
"width / height) % channels;\n const int n = i / width / height / "
"channels;\n \n const int phstart =\n (h + pad_h < kernel_h) "
"? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n const int phend = "
"min((h + pad_h) / stride_h + 1, pooled_h);\n const int pwstart =\n "
" (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n "
" const int pwend = min((w + pad_w) / stride_w + 1, pooled_w);\n float "
"gradient = 0.0f;\n const int offset = (n * channels + c) * pooled_h * "
"pooled_w;\n __global const float* top_diff_slice = top_diff + "
"offset;\n __global const float* mask_slice = mask + offset;\n for "
"(int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < "
"pwend; ++pw) {\n if (mask_slice[ph * pooled_w + pw] == (float)(h * "
"width + w)) {\n gradient += top_diff_slice[ph * pooled_w + "
"pw];\n }\n }\n }\n bottom_diff[i] = gradient;\n "
"}\n}\n\n__kernel void ave_pool_backward(const int nthreads,\n "
" __global const float* top_diff,\n "
" const int channels,\n const int "
"height, const int width,\n const int "
"pooled_h, const int pooled_w,\n const int "
"kernel_h, const int kernel_w,\n const int "
"stride_h, const int stride_w,\n const int "
"pad_h, const int pad_w,\n __global float* "
"bottom_diff) {\n for (int i = get_global_id(0); i < nthreads; i += "
"get_global_size(0)) {\n // find out the local i\n // find out the "
"local offset\n const int w = i % width + pad_w;\n const int h = (i "
"/ width) % height + pad_h;\n const int c = (i / width / height) % "
"channels;\n const int n = i / width / height / channels;\n \n "
"const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n "
" const int phend = min(h / stride_h + 1, pooled_h);\n const int "
"pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const "
"int pwend = min(w / stride_w + 1, pooled_w);\n float gradient = 0.0;\n "
" __global const float* const top_diff_slice = top_diff + (n * channels "
"+ c) * pooled_h * pooled_w;\n for (int ph = phstart; ph < phend; ++ph) "
"{\n for (int pw = pwstart; pw < pwend; ++pw) {\n // figure "
"out the pooling size\n int hstart = ph * stride_h - pad_h;\n "
" int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + "
"kernel_h, height + pad_h);\n int wend = min(wstart + kernel_w, "
"width + pad_w);\n int pool_size = (hend - hstart) * (wend - "
"wstart);\n gradient += top_diff_slice[ph * pooled_w + pw] / "
"pool_size;\n }\n }\n bottom_diff[i] = gradient;\n "
"}\n}\n\n__kernel void sto_pool_backward(\n const int nthreads, "
"__global const float* rand_idx,\n __global const float* const "
"top_diff, const int channels,\n const int height, const int width,\n "
" const int pooled_h, const int pooled_w,\n const int kernel_h, const "
"int kernel_w,\n const int stride_h, const int stride_w,\n __global "
"float* bottom_diff) {\n\n for (int i = get_global_id(0); i < nthreads; i "
"+= get_global_size(0)) {\n // find out the local i\n // find out "
"the local offset\n const int w = i % width;\n const int h = (i / "
"width) % height;\n const int c = (i / width / height) % channels;\n "
"const int n = i / width / height / channels;\n \n const int phstart "
"= (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n const int "
"phend = min(h / stride_h + 1, pooled_h);\n const int pwstart = (w < "
"kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const int pwend = "
"min(w / stride_w + 1, pooled_w);\n float gradient = 0.0;\n __global "
"const float* rand_idx_slice = rand_idx + (n * channels + c) * pooled_h * "
"pooled_w;\n __global const float* top_diff_slice = top_diff + (n * "
"channels + c) * pooled_h * pooled_w;\n for (int ph = phstart; ph < "
"phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n "
"gradient += top_diff_slice[ph * pooled_w + pw]\n * (i == (int) "
"(rand_idx_slice[ph * pooled_w + pw])?1.0:0.0);\n }\n }\n "
"bottom_diff[i] = gradient;\n }\n}\n\n";
const std::string distribution_str =
"// This code is adapted from "
"https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/"
"random.cl\n\n//Note: random generator has two parts\n//first part: the "
"open sourced threefy random generator kernel from DE Shaw "
"Research\n//second part. we wrap the kernel up to generate uniform, "
"bernoulli and gaussion distribution generators.\n\n//begin: the open "
"sourced random generator from DE Shaw "
"Research\n//https://www.deshawresearch.com/"
"resources_random123.html\ntypedef uint uint32_t;\n\nstruct r123array4x32 "
"{\n uint32_t v[4];\n};\n\nenum r123_enum_threefry32x4 {\n R_32x4_0_0 = "
"10,\n R_32x4_0_1 = 26,\n R_32x4_1_0 = 11,\n R_32x4_1_1 = 21,\n "
"R_32x4_2_0 = 13,\n R_32x4_2_1 = 27,\n R_32x4_3_0 = 23,\n R_32x4_3_1 = "
"5,\n R_32x4_4_0 = 6,\n R_32x4_4_1 = 20,\n R_32x4_5_0 = 17,\n "
"R_32x4_5_1 = 11,\n R_32x4_6_0 = 25,\n R_32x4_6_1 = 10,\n R_32x4_7_0 = "
"18,\n R_32x4_7_1 = 20\n};\n\ninline uint32_t RotL_32(uint32_t x, "
"unsigned int N) {\n return (x << (N & 31)) | (x >> ((32 - N) & "
"31));\n}\n\ntypedef struct r123array4x32 threefry4x32_ctr_t;\ntypedef "
"struct r123array4x32 threefry4x32_key_t;\ntypedef struct r123array4x32 "
"threefry4x32_ukey_t;\n\ninline threefry4x32_ctr_t threefry4x32_R(unsigned "
"int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) {\n "
"threefry4x32_ctr_t X;\n uint32_t ks[4 + 1];\n int i;\n ks[4] = "
"0x1BD11BDA;\n\n {\n ks[0] = k.v[0];\n X.v[0] = in.v[0];\n ks[4] "
"^= k.v[0];\n\n ks[1] = k.v[1];\n X.v[1] = in.v[1];\n ks[4] ^= "
"k.v[1];\n\n ks[2] = k.v[2];\n X.v[2] = in.v[2];\n ks[4] ^= "
"k.v[2];\n\n ks[3] = k.v[3];\n X.v[3] = in.v[3];\n ks[4] ^= "
"k.v[3];\n }\n\n X.v[0] += ks[0];\n X.v[1] += ks[1];\n X.v[2] += "
"ks[2];\n X.v[3] += ks[3];\n\n if (Nrounds > 0) {\n X.v[0] += "
"X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 1) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 2) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 3) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 3) {\n "
"X.v[0] += ks[1];\n X.v[1] += ks[2];\n X.v[2] += ks[3];\n X.v[3] "
"+= ks[4];\n X.v[4 - 1] += 1;\n }\n\n if (Nrounds > 4) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 5) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 6) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 7) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 7) {\n "
"X.v[0] += ks[2];\n X.v[1] += ks[3];\n X.v[2] += ks[4];\n X.v[3] "
"+= ks[0];\n X.v[4 - 1] += 2;\n }\n\n if (Nrounds > 8) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 9) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 10) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 11) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 11) {\n "
"X.v[0] += ks[3];\n X.v[1] += ks[4];\n X.v[2] += ks[0];\n X.v[3] "
"+= ks[1];\n X.v[4 - 1] += 3;\n }\n\n if (Nrounds > 12) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 13) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 14) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 15) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 15) {\n "
"X.v[0] += ks[4];\n X.v[1] += ks[0];\n X.v[2] += ks[1];\n X.v[3] "
"+= ks[2];\n X.v[4 - 1] += 4;\n }\n\n if (Nrounds > 16) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 17) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 18) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 19) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 19) {\n "
"X.v[0] += ks[0];\n X.v[1] += ks[1];\n X.v[2] += ks[2];\n X.v[3] "
"+= ks[3];\n X.v[4 - 1] += 5;\n }\n\n if (Nrounds > 20) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 21) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 22) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 23) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 23) {\n "
"X.v[0] += ks[1];\n X.v[1] += ks[2];\n X.v[2] += ks[3];\n X.v[3] "
"+= ks[4];\n X.v[4 - 1] += 6;\n }\n\n if (Nrounds > 24) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 25) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 26) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 27) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 27) {\n "
"X.v[0] += ks[2];\n X.v[1] += ks[3];\n X.v[2] += ks[4];\n X.v[3] "
"+= ks[0];\n X.v[4 - 1] += 7;\n }\n\n if (Nrounds > 28) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 29) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 30) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 31) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 31) {\n "
"X.v[0] += ks[3];\n X.v[1] += ks[4];\n X.v[2] += ks[0];\n X.v[3] "
"+= ks[1];\n X.v[4 - 1] += 8;\n }\n\n if (Nrounds > 32) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 33) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 34) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 35) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 35) {\n "
"X.v[0] += ks[4];\n X.v[1] += ks[0];\n X.v[2] += ks[1];\n X.v[3] "
"+= ks[2];\n X.v[4 - 1] += 9;\n }\n\n if (Nrounds > 36) {\n X.v[0] "
"+= X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] ^= "
"X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 37) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 38) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 39) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 39) {\n "
"X.v[0] += ks[0];\n X.v[1] += ks[1];\n X.v[2] += ks[2];\n X.v[3] "
"+= ks[3];\n X.v[4 - 1] += 10;\n }\n\n if (Nrounds > 40) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 41) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n if (Nrounds > 42) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 43) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 43) {\n "
"X.v[0] += ks[1];\n X.v[1] += ks[2];\n X.v[2] += ks[3];\n X.v[3] "
"+= ks[4];\n X.v[4 - 1] += 11;\n }\n\n if (Nrounds > 44) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 45) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 46) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 47) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 47) {\n "
"X.v[0] += ks[2];\n X.v[1] += ks[3];\n X.v[2] += ks[4];\n X.v[3] "
"+= ks[0];\n X.v[4 - 1] += 12;\n }\n\n if (Nrounds > 48) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 49) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 50) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 51) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 51) {\n "
"X.v[0] += ks[3];\n X.v[1] += ks[4];\n X.v[2] += ks[0];\n X.v[3] "
"+= ks[1];\n X.v[4 - 1] += 13;\n }\n\n if (Nrounds > 52) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 53) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 54) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 55) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 55) {\n "
"X.v[0] += ks[4];\n X.v[1] += ks[0];\n X.v[2] += ks[1];\n X.v[3] "
"+= ks[2];\n X.v[4 - 1] += 14;\n }\n\n if (Nrounds > 56) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 57) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 58) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 59) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 59) {\n "
"X.v[0] += ks[0];\n X.v[1] += ks[1];\n X.v[2] += ks[2];\n X.v[3] "
"+= ks[3];\n X.v[4 - 1] += 15;\n }\n\n if (Nrounds > 60) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 61) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 62) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 63) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 63) {\n "
"X.v[0] += ks[1];\n X.v[1] += ks[2];\n X.v[2] += ks[3];\n X.v[3] "
"+= ks[4];\n X.v[4 - 1] += 16;\n }\n\n if (Nrounds > 64) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_0_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_0_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 65) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_1_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_1_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 66) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_2_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_2_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 67) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_3_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_3_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 67) {\n "
"X.v[0] += ks[2];\n X.v[1] += ks[3];\n X.v[2] += ks[4];\n X.v[3] "
"+= ks[0];\n X.v[4 - 1] += 17;\n }\n\n if (Nrounds > 68) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_4_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_4_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 69) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_5_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_5_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 70) {\n "
"X.v[0] += X.v[1];\n X.v[1] = RotL_32(X.v[1], R_32x4_6_0);\n X.v[1] "
"^= X.v[0];\n X.v[2] += X.v[3];\n X.v[3] = RotL_32(X.v[3], "
"R_32x4_6_1);\n X.v[3] ^= X.v[2];\n }\n\n if (Nrounds > 71) {\n "
"X.v[0] += X.v[3];\n X.v[3] = RotL_32(X.v[3], R_32x4_7_0);\n X.v[3] "
"^= X.v[0];\n X.v[2] += X.v[1];\n X.v[1] = RotL_32(X.v[1], "
"R_32x4_7_1);\n X.v[1] ^= X.v[2];\n }\n\n if (Nrounds > 71) {\n "
"X.v[0] += ks[3];\n X.v[1] += ks[4];\n X.v[2] += ks[0];\n X.v[3] "
"+= ks[1];\n X.v[4 - 1] += 18;\n }\n return X;\n}\n//end: the open "
"sourced random generator from DE Shaw Research\n\n// "
"**************************\n// BERNOULLI DISTRIBUTION\n// "
"**************************\n\n__kernel void "
"PRNG_threefry4x32_bernoulli(\n__global float4 "
"*randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nfloat "
"threshold,\nuint nrounds, uint numrandom) {\n\n size_t gdx = "
"get_global_id(0);\n\n uint maxUint = 0;\n maxUint--;\n float r = "
"(float)maxUint;\n\n threefry4x32_ctr_t ctr = ctr_i;\n "
"threefry4x32_ukey_t ukey;\n\n ukey.v[0] = ukey.v[1] = ukey.v[2] = "
"ukey.v[3] = gdx;\n\n threefry4x32_ctr_t random4;\n\n if ( gdx < "
"numrandom ) {\n random4 = threefry4x32_R(nrounds, ctr, ukey);\n "
"float4 frnd;\n frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + "
"inf ) < threshold ? 1.0f : 0.0f;\n frnd.y = ( (((float)random4.v[1]) / "
"r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;\n frnd.z = ( "
"(((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : "
"0.0f;\n frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < "
"threshold ? 1.0f : 0.0f;\n randomnumber[gdx] = frnd;\n }\n}\n\n// "
"**************************\n// UNIFORM DISTRIBUTION (float)\n// "
"**************************\n\n__kernel void "
"PRNG_threefry4x32_uniform(\n__global float4 "
"*randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat inf, float sup,\nuint "
"nrounds, uint numrandom) {\n\n size_t gdx = get_global_id(0);\n\n uint "
"maxUint = 0;\n maxUint--;\n float r = (float)maxUint;\n\n "
"threefry4x32_ctr_t ctr = ctr_i;\n threefry4x32_ukey_t ukey;\n\n "
"ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n "
"threefry4x32_ctr_t random4;\n\n if ( gdx < numrandom ) {\n random4 = "
"threefry4x32_R(nrounds, ctr, ukey);\n float4 frnd;\n frnd.x = ( "
"(((float)random4.v[0]) / r) * (sup - inf) + inf );\n frnd.y = ( "
"(((float)random4.v[1]) / r) * (sup - inf) + inf );\n frnd.z = ( "
"(((float)random4.v[2]) / r) * (sup - inf) + inf );\n frnd.w = ( "
"(((float)random4.v[3]) / r) * (sup - inf) + inf );\n randomnumber[gdx] "
"= frnd;\n }\n}\n\n// **************************\n// UNIFORM DISTRIBUTION "
"(uint)\n// **************************\n\n__kernel void "
"PRNG_threefry4x32_uint_uniform(\n__global uint4 "
"*randomnumber,\nthreefry4x32_ctr_t ctr_i,\nuint inf, uint sup,\nuint "
"nrounds, uint numrandom) {\n\n size_t gdx = get_global_id(0);\n\n "
"threefry4x32_ctr_t ctr = ctr_i;\n threefry4x32_ukey_t ukey;\n\n "
"ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;\n\n "
"threefry4x32_ctr_t random4;\n\n if ( gdx < numrandom ) {\n random4 = "
"threefry4x32_R(nrounds, ctr, ukey);\n uint4 frnd;\n frnd.x = "
"random4.v[0] % (sup - inf) + inf;\n frnd.y = random4.v[1] % (sup - "
"inf) + inf;\n frnd.z = random4.v[2] % (sup - inf) + inf;\n frnd.w = "
"random4.v[3] % (sup - inf) + inf;\n randomnumber[gdx] = frnd;\n "
"}\n}\n\n// **************************\n// GAUSSIAN DISTRIBUTION\n// "
"**************************\n\n__kernel void "
"PRNG_threefry4x32_gaussian(\n__global float4 "
"*randomnumber,\nthreefry4x32_ctr_t ctr_i,\nfloat E, float V,\nuint "
"nrounds, uint numrandom) {\n\n size_t gdx = get_global_id(0);\n\n uint "
"maxUint = 0;\n maxUint--;\n float r = (float)maxUint;\n\n "
"threefry4x32_ctr_t ctr = ctr_i;\n threefry4x32_ukey_t ukey1, ukey2;\n\n "
"ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;\n ukey2.v[0] = "
"ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;\n\n threefry4x32_ctr_t "
"random1, random2;\n\n if ( gdx < numrandom ) {\n random1 = "
"threefry4x32_R(nrounds, ctr, ukey1);\n random2 = "
"threefry4x32_R(nrounds, ctr, ukey2);\n float4 frnd1;\n\n float r1 = "
"(((float)random1.v[0]) / r); // generate a random sequence of uniform "
"distribution\n float r2 = (((float)random2.v[0]) / r);\n float r3 = "
"(((float)random1.v[1]) / r);\n float r4 = (((float)random2.v[1]) / "
"r);\n float r5 = (((float)random1.v[2]) / r);\n float r6 = "
"(((float)random2.v[2]) / r);\n float r7 = (((float)random1.v[3]) / "
"r);\n float r8 = (((float)random2.v[3]) / r);\n\n if(r2 == 0 || r4 "
"== 0 || r6 == 0 || r8 == 0) {\n r2 += 0.0001;\n r4 += 0.0001;\n "
" r6 += 0.0001;\n r8 += 0.0001;\n }\n\n frnd1.x = "
"cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of "
"normal distribution using two above uniform noise data\n //frnd2.x = "
"sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature "
"counterpart of the foregoing pseudo normal distribution sequence\n "
"frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo "
"sequence of normal distribution using two above uniform noise data\n "
"//frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the "
"quadrature counterpart of the foregoing pseudo normal distribution "
"sequence\n frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// "
"return a pseudo sequence of normal distribution using two above uniform "
"noise data\n //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // "
"return the quadrature counterpart of the foregoing pseudo normal "
"distribution sequence\n frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * "
"V + E;// return a pseudo sequence of normal distribution using two above "
"uniform noise data\n //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); "
" // return the quadrature counterpart of the foregoing pseudo normal "
"distribution sequence\n\n randomnumber[gdx] = frnd1;\n }\n}\n";
const std::string tensormath_str =
"/**\n * Licensed to the Apache Software Foundation (ASF) under one\n * or "
"more contributor license agreements. See the NOTICE file\n * distributed "
"with this work for additional information\n * regarding copyright "
"ownership. The ASF licenses this file\n * to you under the Apache "
"License, Version 2.0 (the\n * \"License\"); you may not use this file "
"except in compliance\n * with the License. You may obtain a copy of the "
"License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * "
"Unless required by applicable law or agreed to in writing, software\n * "
"distributed under the License is distributed on an \"AS IS\" BASIS,\n * "
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or "
"implied.\n * See the License for the specific language governing "
"permissions and\n * limitations under the License.\n */\n\n// "
"**************************************\n// Element-wise functions\n// "
"**************************************\n\n// Sum is basically "
"reduction.\n// This reduction code is serial reduction modified from "
"AMD\'s example.\n// "
"http://developer.amd.com/resources/documentation-articles/"
"articles-whitepapers/opencl-optimization-case-study-simple-reductions/"
"\n__kernel\nvoid clkernel_fabs(const int num, __global const float* in, "
"__global float* out) {\n const int i = get_global_id(0);\n if (i >= "
"num) return;\n out[i] = fabs(in[i]);\n}\n\n__kernel\nvoid "
"clkernel_add_scalar(const int num, float x, __global const float* in, "
"__global float* out) {\n const int i = get_global_id(0);\n if (i >= "
"num) return;\n out[i] = in[i] + x;\n}\n\n__kernel\nvoid "
"clkernel_add(const int num, __global const float* in1, __global const "
"float* in2,\n __global float* out) {\n const int i = "
"get_global_id(0);\n if (i >= num) return;\n out[i] = in1[i] + "
"in2[i];\n}\n\n__kernel\nvoid clkernel_clamp(const int num, float low, "
"float high, __global const float* in,\n __global float* out) {\n const "
"int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"clamp(in[i], low, high);\n}\n\n__kernel\nvoid "
"clkernel_divide_scalar_matx(const int num, __global const float* in1, "
"const float x,\n __global float* out) {\n const int i = "
"get_global_id(0);\n if (i >= num) return;\n out[i] = in1[i] / "
"x;\n}\n\n__kernel\nvoid clkernel_divide_scalar_xmat(const int num, const "
"float x, __global const float* in1,\n __global float* out) {\n const "
"int i = get_global_id(0);\n if (i >= num) return;\n out[i] = x / "
"in1[i];\n}\n\n__kernel\nvoid clkernel_divide(const int num, __global "
"const float* in1, __global const float* in2,\n __global float* out) {\n "
"const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"in1[i] / in2[i];\n}\n\n__kernel\nvoid clkernel_eltmult_scalar(const int "
"num, const float x, __global const float* in,\n __global float* out) {\n "
" const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"in[i] * x;\n}\n\n__kernel\nvoid clkernel_eltmult(const int num, __global "
"const float* in1, __global const float* in2,\n __global float* out) {\n "
"const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"in1[i] * in2[i];\n}\n\n__kernel\nvoid clkernel_exp(const int num, "
"__global const float* in, __global float* out) {\n const int i = "
"get_global_id(0);\n if (i >= num) return;\n out[i] = "
"exp(in[i]);\n}\n\n__kernel\nvoid clkernel_le(const int num, __global "
"const float* in, const float x,\n __global float* out) {\n const int i "
"= get_global_id(0);\n if (i >= num) return;\n out[i] = (in[i] <= x) ? "
"1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_log(const int num, __global "
"const float* in, __global float* out) {\n const int i = "
"get_global_id(0);\n if (i >= num) return;\n out[i] = "
"log(in[i]);\n}\n\n__kernel\nvoid clkernel_lt(const int num, __global "
"const float* in, const float x,\n __global float* out) {\n const int i "
"= get_global_id(0);\n if (i >= num) return;\n out[i] = (in[i] < x) ? "
"1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_ge(const int num, __global "
"const float* in, const float x,\n __global float* out) {\n const int i "
"= get_global_id(0);\n if (i >= num) return;\n out[i] = (in[i] >= x) ? "
"1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_gt(const int num, __global "
"const float* in, const float x,\n __global float* out) {\n const int i "
"= get_global_id(0);\n if (i >= num) return;\n out[i] = (in[i] > x) ? "
"1.0f : 0.0f;\n}\n\n__kernel\nvoid clkernel_pow_scalar(const int num, "
"const float x, __global const float* in,\n __global float* out) {\n "
"const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"pow(in[i], x);\n}\n\n__kernel\nvoid clkernel_pow(const int num, __global "
"const float* in1, __global const float* in2,\n __global float* out) {\n "
"const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"pow(in1[i], in2[i]);\n}\n\n__kernel\nvoid clkernel_relu(const int num, "
"__global const float* in, __global float* out) {\n const int i = "
"get_global_id(0);\n if (i >= num) return;\n out[i] = (in[i] >= 0.0f) ? "
"in[i] : 0.0f;\n}\n\n__kernel\nvoid clkernel_set(const int num, const "
"float x, __global float* out) {\n const int i = get_global_id(0);\n if "
"(i >= num) return;\n out[i] = x;\n}\n\n__kernel\nvoid "
"clkernel_sigmoid(const int num, __global const float* in, __global float* "
"out) {\n const int i = get_global_id(0);\n if (i >= num) return;\n "
"out[i] = 1 / (1 + exp(-(in[i])));\n}\n\n__kernel\nvoid "
"clkernel_sign(const int num, __global const float* in, __global float* "
"out) {\n const int i = get_global_id(0);\n if (i >= num) return;\n "
"out[i] = (in[i] > 0) - (in[i] < 0);\n}\n\n__kernel\nvoid "
"clkernel_sqrt(const int num, __global const float* in, __global float* "
"out) {\n const int i = get_global_id(0);\n if (i >= num) return;\n "
"out[i] = sqrt(in[i]);\n}\n\n// kernel for square is called "
"pow(2).\n\n__kernel\nvoid clkernel_subtract_scalar(const int num, "
"__global const float* in, const float x,\n __global float* out) {\n "
"const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"in[i] - x;\n}\n\n__kernel\nvoid clkernel_subtract(const int num, __global "
"const float* in1, __global const float* in2,\n __global float* out) {\n "
" const int i = get_global_id(0);\n if (i >= num) return;\n out[i] = "
"in1[i] - in2[i];\n}\n\n// reduce3 kernel from\n// "
"https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/"
"oclReduction/oclReduction_kernel.cl\n__kernel\nvoid clkernel_sum(const "
"int num, __global const float* in, __global float* out,\n __local float* "
"sdata) {\n const int i = get_group_id(0)*(get_local_size(0)*2) + "
"get_local_id(0);\n const int tid = get_local_id(0);\n sdata[tid] = (i < "
"num) ? in[i] : 0.0f;\n\n // Perform the first level of reduction.\n if "
"(i + get_local_size(0) < num) {\nsdata[tid] += in[i + "
"get_local_size(0)];\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n\n for (int s "
"= get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid > s) {\n sdata[tid] += "
"sdata[tid + s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n }\n\n if (tid == 0) "
"{\nout[get_group_id(0)] = sdata[0];\n }\n}\n\n__kernel\nvoid "
"clkernel_tanh(const int num, __global const float* in, __global float* "
"out) {\n const int i = get_global_id(0);\n if (i >= num) return;\n "
"out[i] = tanh(in[i]);\n}\n\n// **************************************\n// "
"Random functions\n// **************************************\n\n// See: "
"distribution.cl\n\n// "
"*********************************************************\n// BLAS "
"functions, ref to http://docs.nvidia.com/cuda/cublas\n// "
"*********************************************************\n\n__"
"kernel\nvoid clkernel_amax(const int num, __global const float* in, "
"__global int* ret,\n __local uint* sdata, __local size_t* temp) {\n "
"const int gid = get_global_id(0);\n const int tid = get_local_id(0);\n\n "
" for(int s = get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n "
"sdata[tid] = (in[sdata[tid]] > in[tid+s]) ? sdata[tid] : "
"tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n }\n if (tid == 0) {\nret[0] = "
"sdata[0];\n }\n}\n\n\n/* TODO: Fix line 284:20.\n__kernel\nvoid "
"clkernel_amin(const int num, __global const float* in, __global int* "
"ret,\n __local float* sdata, __local size_t* temp) {\n const int gid = "
"get_global_id(0);\n const int tid = get_local_id(0);\n\n // Initialize "
"the values to pos infinity.\n sdata[tid] = (gid < num) ? in[gid] : "
"INFINITY;\n barrier(CLK_LOCAL_MEM_FENCE);\n\n for(int s = "
"get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n sdata[tid] = "
"(in[sdata[tid]] < in[tid+s]) ? sdata[tid] : "
"tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n }\n if (tid == 0) {\nret[0] = "
"sdata[0];\n }\n}*/\n\n\n__kernel\nvoid clkernel_asum(const int num, "
"__global const float* in, __global float* out,\n __local float* sdata) "
"{\n const int tid = get_local_id(0);\n const int i = "
"get_global_id(0);\n\n // Initialize\n sdata[tid] = (i < num) ? in[i] : "
"INFINITY;\n // Perform the first level of reduction.\n if (i + "
"get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n "
"}\n barrier(CLK_LOCAL_MEM_FENCE);\n\n for(int s = get_local_size(0)/2; "
"s > 0; s >>= 1) {\nif (tid < s) {\n sdata[tid] = fabs(sdata[tid + "
"s]);\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n }\n if (tid == 0) {\nout[0] = "
"sdata[0];\n }\n}\n\n__kernel\nvoid clkernel_axpy(const int num, float "
"alpha, __global const float* in,\n __global float* out) {\n const int "
"i = get_global_id(0);\n if (i >= num) return;\n out[i] = fma(alpha, "
"in[i], out[i]);\n}\n\n// This kernel is essentially the same as Sum, "
"except that during the process\n// of reading in data to the local "
"memory, the value is also doubled.\n// Then, just before submitting the "
"sum to out, we do a square-root on it.\n__kernel\nvoid "
"clkernel_nrm2(const int num, __global const float* in, __global float* "
"out,\n __local float* sdata) {\n const int i = "
"get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);\n const int tid "
"= get_local_id(0);\n sdata[tid] = (i < num) ? (in[i] * in[i]) : "
"0.0f;\n\n // Perform the first level of reduction.\n if (i + "
"get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n "
"}\n barrier(CLK_LOCAL_MEM_FENCE);\n\n for (int s = get_local_size(0)/2; "
"s > 0; s >>= 1) {\nif (tid > s) {\n sdata[tid] += sdata[tid + "
"s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n }\n\n if (tid == 0) "
"{\nout[get_group_id(0)] = sqrt(sdata[0]);\n }\n}\n\n__kernel\nvoid "
"clkernel_scale(const int num, float x, __global float* out) {\n const "
"int i = get_global_id(0);\n if (i >= num) return;\n out[i] = x * "
"out[i];\n}\n\n__kernel\nvoid clkernel_dot(const int num, __global const "
"float* in1, __global const float* in2,\n __global float* out, __local "
"float* scratch) {\n const int i = get_global_id(0);\n if (i >= num) "
"return;\n int offset = i << 2;\n scratch[i] = in1[offset] * "
"in2[offset];\n\n}\n\n// First kernel from "
"http://www.bealto.com/gpu-gemv_intro.html\n// y = \xce\xb1*A*v + "
"\xce\xb2*y\n// fma(a, b, c) == (a * b) + c with infinite "
"precision\n__kernel\nvoid clkernel_gemv(const int m, const int n, const "
"float alpha,\n __global const float* A, __global const float* v,\n "
"const float beta, __global float* out) {\n const int i = "
"get_global_id(0);\n float sum = 0.0f;\n for (int k = 0; k < n; k++) "
"{\n sum += fma(beta, out[i + m * k], alpha * A[i + m * k] * v[k]);\n "
"}\n out[i] = sum;\n}\n\n// "
"http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm\n// X[j] = "
"x[j*inc(x)] if inc(x) \xe2\x89\xa5 0\n//= x[(\xcf\x87 \xe2\x88\x92 "
"1)*|inc(x)| \xe2\x88\x92 j*|inc(x)|] if inc(x) < 0\n\n// C = diag( X "
")*A\n__kernel\nvoid clkernel_dgmm_left(const int nrow, const int "
"ncol,\n__global const float* M, __global const float* v,\n__global float* "
"out) {\n const uint gidx = get_global_id(0);\n\n uint offset = gidx * "
"ncol;\n for (uint i = 0; i < ncol; i++) {\nout[offset + i] = M[offset + "
"i] * v[i];\n }\n}\n\n// C = A*diag( X )\n__kernel\nvoid "
"clkernel_dgmm_right(const int nrow, const int ncol,\n __global const "
"float* M, __global const float* v,\n __global float* out) {\n const uint "
"gidx = get_global_id(0);\n\n uint offset = gidx * ncol;\n for (uint i = "
"0; i < ncol; i++) {\nout[offset + i] = M[offset + i] * v[gidx];\n "
"}\n}\n\n// TODO: Optimize with Reference from "
"http://www.cedricnugteren.nl/tutorial.php?page=1\n// C = \xce\xb1*A*B + "
"\xce\xb2*C\n__kernel\nvoid clkernel_gemm(const uint nrowA, const uint "
"ncolB, const uint ncolA, const float alpha,\n __global const float* A, "
"__global const float* B, const float beta,\n __global float* C, "
"__local float* Asub, __local float* Bsub) {\n\n const uint lidx = "
"get_local_id(0);\n const uint lidy = get_local_id(1);\n const uint TS = "
"get_local_size(0); // Tile size\n const uint gidx = TS * get_group_id(0) "
"+ lidx; // Row ID of C (0..M)\n const uint gidy = TS * get_group_id(1) + "
"lidy; // Row ID of C (0..N)\n\n // Initialise the accumulation "
"register\n float acc = 0.0f;\n\n // Loop over all tiles\n const int "
"numtiles = ncolA / TS;\n for (int t = 0; t < numtiles; t++) {\n const "
"int tiledRow = TS * t + lidx;\n const int tiledCol = TS * t + lidy;\n "
" Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx];\n Bsub[lidy * "
"TS + lidx] = B[gidy * ncolA + tiledRow];\n\n "
"barrier(CLK_LOCAL_MEM_FENCE);\n\n for(int k = 0; k < TS; k++) {\n "
"acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha;\n }\n\n "
"barrier(CLK_LOCAL_MEM_FENCE);\n }\n\n C[gidy * nrowA + gidx] = "
"fma(beta, C[gidy * nrowA + gidx], acc);\n}\n\n\n__kernel\nvoid "
"clkernel_crossentropy(const uint batchsize, const uint dim,\n __global "
"const float* p, __global const int* t,\n __global float* loss) {\n "
"const uint gidx = get_global_id(0);\n if (gidx >= batchsize) return;\n\n "
" int truth_idx = t[gidx];\n if (truth_idx <= 0) return;\n float "
"prob_of_truth = p[gidx * dim + truth_idx];\n loss[gidx] = "
"-log(fmax(prob_of_truth, -FLT_MIN));\n}\n\n\n__kernel\nvoid "
"clkernel_softmaxentropy(const uint batchsize, const uint dim,\n __global "
"const float* p, __global const int* t,\n __global float* grad) {\n const "
"uint gidx = get_global_id(0);\n if (gidx >= batchsize) return;\n\n int "
"truth_idx = t[gidx];\n if (truth_idx <= 0) return;\n grad[gidx * dim + "
"truth_idx] -= 1.0;\n}\n\n\n__kernel\nvoid clkernel_rowmax(const uint "
"nrow, const uint ncol,\n __global const float* in, "
"__global float* out) {\n const uint row_id = get_global_id(0);\n if "
"(row_id >= nrow) return;\n\n float row_max_val = -FLT_MAX;\n for (uint "
"i = 0; i < ncol; i++) {\n row_max_val = fmax(row_max_val, in[row_id * "
"ncol + i]);\n }\n\n out[row_id] = row_max_val;\n}\n\n\n// "
"**************************************\n// Matrix functions\n// "
"**************************************\n/*\n__kernel\nvoid "
"clkernel_addcol(int nrow, int ncol, __global const float* A, __global "
"const float* v, __global float* out) {\n const int i = "
"get_global_id(0);\n const int j = get_global_id(1);\n if (i >= nrow) "
"return;\n if (j >= ncol) return;\n ret[j] = A[j + nrow * i] + "
"v[j];\n}\n\n__kernel\nvoid clkernel_addrow(int nrow, int ncol, __global "
"const float* A, __global const float* v, __global float* out) {\n const "
"int i = get_global_id(0);\n const int j = get_global_id(1);\n if (i >= "
"nrow) return;\n if (j >= ncol) return;\n out[i] = A[i + ncol * j] + "
"v[i];\n}\n\n__kernel\nvoid clkernel_outerproduct(int m, const int n, "
"__global const float* in1, __global const float* in2, __global float* "
"out) {\n const int col = get_global_id(0);\n const int row = "
"get_global_id(1);\n\n // TODO: This\n}\n\n__kernel\nvoid "
"clkernel_sumcol(int nrow, int ncol, __global const float* in, __global "
"float* out) {\n const int i = get_global_id(0);\n if (i >= nrow) "
"return;\n\n float sum = 0.0f;\n for (int j = 0; j < nrow; j++) {\nsum "
"+= input[nrow * i + j];\n }\n out[i] = sum;\n}\n*/\n__kernel\nvoid "
"clkernel_sumrow(int nrow, int ncol, __global const float* in, __global "
"float* out) {\n const int idx = get_global_id(0);\n if (idx >= nrow) "
"return;\n\n float sum = 0.0f;\n for (int j = 0; j < ncol; j++) {\nsum "
"+= in[j + ncol * idx];\n }\n out[idx] = sum;\n}\n\n\n// Adapted from "
"http://code.haskell.org/HsOpenCL/tests/bench/transpose.cl\n#define "
"BLOCK_DIM 16\n__kernel\nvoid clkernel_transpose(uint nrow, uint "
"ncol,\n__global const float* in, __global float* out,\n__local float* "
"sdata) {\n uint gidx = get_global_id(0);\n uint gidy = "
"get_global_id(1);\n\n if ((gidx < ncol) && (gidy < nrow)) {\nuint id_in "
"= gidy * ncol + gidx;\nsdata[get_local_id(1) * (BLOCK_DIM+1) + "
"get_local_id(0)] = in[id_in];\n }\n\n barrier(CLK_LOCAL_MEM_FENCE);\n\n "
" gidx = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n gidy = "
"get_group_id(0) * BLOCK_DIM + get_local_id(1);\n if ((gidx < nrow) && "
"(gidy < ncol)) {\nuint id_out = gidy * nrow + gidx;\nout[id_out] = "
"sdata[get_local_id(0) * (BLOCK_DIM + 1) + get_local_id(1)];\n "
"}\n}\n/*\n__kernel\nvoid clkernel_transpose2(uint nrow, uint ncol, "
"__global const float* in, __global float* out, __local float* sdata) {\n "
"const uint lidx = get_local_id(0);\n const uint lidy = "
"get_local_id(1);\n const uint id0 = get_group_id(0) * ncol * lidx;\n "
"const uint id1 = get_group_id(1) * nrow * lidy;\n\n if (id0 < nrow && "
"id1 < ncol) {\nsdata[lidx][lidy] = in[id1 * nrow + id0];\n }\n\n "
"barrier(CLK_LOCAL_MEM_FENCE);\n\n const uint new_id0 = get_group_id(1) * "
"nrow + lidx;\n const uint new_id1 = get_group_id(0) * ncol + lidy;\n\n "
"if (new_id0 < ncol && new_id1 < nrow) {\nout[new_id1 * ncol + new_id0] = "
"sdata[lidx][lidy];\n }\n}*/\n\n__kernel\nvoid clkernel_diagvec_left(uint "
"vsize, __global const float* vin, __global float* out) {\n const uint "
"gid = get_global_id(0);\n\n for (uint i = 0; i < vsize; i++)\nout[gid * "
"vsize + i] = (i == gid) ? vin[gid] : 0.0f;\n}\n\n\n__kernel\nvoid "
"clkernel_diagvec_right(uint vsize, __global const float* vin, __global "
"float* out) {\n const uint gid = get_global_id(0);\n\n for (uint i = 0; "
"i < vsize; i++)\nout[gid * vsize + i] = (i == gid) ? vin[gid] : "
"0.0f;\n}\n";
} // namespace opencl
} // namespace singa
#endif