blob: ce298c0a23b02cd450015602c0106d488cd6413a [file] [log] [blame]
// This code is adapted from https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl
//Note: random generator has two parts
//first part: the open sourced threefy random generator kernel from DE Shaw Research
//second part. we wrap the kernel up to generate uniform, bernoulli and gaussion distribution generators.
//begin: the open sourced random generator from DE Shaw Research
//https://www.deshawresearch.com/resources_random123.html
typedef uint uint32_t;
struct r123array4x32 {
uint32_t v[4];
};
enum r123_enum_threefry32x4 {
R_32x4_0_0 = 10,
R_32x4_0_1 = 26,
R_32x4_1_0 = 11,
R_32x4_1_1 = 21,
R_32x4_2_0 = 13,
R_32x4_2_1 = 27,
R_32x4_3_0 = 23,
R_32x4_3_1 = 5,
R_32x4_4_0 = 6,
R_32x4_4_1 = 20,
R_32x4_5_0 = 17,
R_32x4_5_1 = 11,
R_32x4_6_0 = 25,
R_32x4_6_1 = 10,
R_32x4_7_0 = 18,
R_32x4_7_1 = 20
};
inline uint32_t RotL_32(uint32_t x, unsigned int N) {
return (x << (N & 31)) | (x >> ((32 - N) & 31));
}
typedef struct r123array4x32 threefry4x32_ctr_t;
typedef struct r123array4x32 threefry4x32_key_t;
typedef struct r123array4x32 threefry4x32_ukey_t;
inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) {
threefry4x32_ctr_t X;
uint32_t ks[4 + 1];
int i;
ks[4] = 0x1BD11BDA;
{
ks[0] = k.v[0];
X.v[0] = in.v[0];
ks[4] ^= k.v[0];
ks[1] = k.v[1];
X.v[1] = in.v[1];
ks[4] ^= k.v[1];
ks[2] = k.v[2];
X.v[2] = in.v[2];
ks[4] ^= k.v[2];
ks[3] = k.v[3];
X.v[3] = in.v[3];
ks[4] ^= k.v[3];
}
X.v[0] += ks[0];
X.v[1] += ks[1];
X.v[2] += ks[2];
X.v[3] += ks[3];
if (Nrounds > 0) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 1) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 2) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 3) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 3) {
X.v[0] += ks[1];
X.v[1] += ks[2];
X.v[2] += ks[3];
X.v[3] += ks[4];
X.v[4 - 1] += 1;
}
if (Nrounds > 4) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 5) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 6) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 7) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 7) {
X.v[0] += ks[2];
X.v[1] += ks[3];
X.v[2] += ks[4];
X.v[3] += ks[0];
X.v[4 - 1] += 2;
}
if (Nrounds > 8) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 9) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 10) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 11) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 11) {
X.v[0] += ks[3];
X.v[1] += ks[4];
X.v[2] += ks[0];
X.v[3] += ks[1];
X.v[4 - 1] += 3;
}
if (Nrounds > 12) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 13) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 14) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 15) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 15) {
X.v[0] += ks[4];
X.v[1] += ks[0];
X.v[2] += ks[1];
X.v[3] += ks[2];
X.v[4 - 1] += 4;
}
if (Nrounds > 16) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 17) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 18) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 19) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 19) {
X.v[0] += ks[0];
X.v[1] += ks[1];
X.v[2] += ks[2];
X.v[3] += ks[3];
X.v[4 - 1] += 5;
}
if (Nrounds > 20) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 21) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 22) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 23) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 23) {
X.v[0] += ks[1];
X.v[1] += ks[2];
X.v[2] += ks[3];
X.v[3] += ks[4];
X.v[4 - 1] += 6;
}
if (Nrounds > 24) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 25) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 26) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 27) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 27) {
X.v[0] += ks[2];
X.v[1] += ks[3];
X.v[2] += ks[4];
X.v[3] += ks[0];
X.v[4 - 1] += 7;
}
if (Nrounds > 28) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 29) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 30) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 31) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 31) {
X.v[0] += ks[3];
X.v[1] += ks[4];
X.v[2] += ks[0];
X.v[3] += ks[1];
X.v[4 - 1] += 8;
}
if (Nrounds > 32) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 33) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 34) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 35) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 35) {
X.v[0] += ks[4];
X.v[1] += ks[0];
X.v[2] += ks[1];
X.v[3] += ks[2];
X.v[4 - 1] += 9;
}
if (Nrounds > 36) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 37) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 38) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 39) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 39) {
X.v[0] += ks[0];
X.v[1] += ks[1];
X.v[2] += ks[2];
X.v[3] += ks[3];
X.v[4 - 1] += 10;
}
if (Nrounds > 40) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 41) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 42) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 43) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 43) {
X.v[0] += ks[1];
X.v[1] += ks[2];
X.v[2] += ks[3];
X.v[3] += ks[4];
X.v[4 - 1] += 11;
}
if (Nrounds > 44) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 45) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 46) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 47) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 47) {
X.v[0] += ks[2];
X.v[1] += ks[3];
X.v[2] += ks[4];
X.v[3] += ks[0];
X.v[4 - 1] += 12;
}
if (Nrounds > 48) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 49) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 50) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 51) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 51) {
X.v[0] += ks[3];
X.v[1] += ks[4];
X.v[2] += ks[0];
X.v[3] += ks[1];
X.v[4 - 1] += 13;
}
if (Nrounds > 52) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 53) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 54) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 55) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 55) {
X.v[0] += ks[4];
X.v[1] += ks[0];
X.v[2] += ks[1];
X.v[3] += ks[2];
X.v[4 - 1] += 14;
}
if (Nrounds > 56) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 57) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 58) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 59) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 59) {
X.v[0] += ks[0];
X.v[1] += ks[1];
X.v[2] += ks[2];
X.v[3] += ks[3];
X.v[4 - 1] += 15;
}
if (Nrounds > 60) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 61) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 62) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 63) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 63) {
X.v[0] += ks[1];
X.v[1] += ks[2];
X.v[2] += ks[3];
X.v[3] += ks[4];
X.v[4 - 1] += 16;
}
if (Nrounds > 64) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 65) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 66) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 67) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 67) {
X.v[0] += ks[2];
X.v[1] += ks[3];
X.v[2] += ks[4];
X.v[3] += ks[0];
X.v[4 - 1] += 17;
}
if (Nrounds > 68) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 69) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 70) {
X.v[0] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
X.v[1] ^= X.v[0];
X.v[2] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
X.v[3] ^= X.v[2];
}
if (Nrounds > 71) {
X.v[0] += X.v[3];
X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
X.v[3] ^= X.v[0];
X.v[2] += X.v[1];
X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
X.v[1] ^= X.v[2];
}
if (Nrounds > 71) {
X.v[0] += ks[3];
X.v[1] += ks[4];
X.v[2] += ks[0];
X.v[3] += ks[1];
X.v[4 - 1] += 18;
}
return X;
}
//end: the open sourced random generator from DE Shaw Research
// **************************
// BERNOULLI DISTRIBUTION
// **************************
__kernel void PRNG_threefry4x32_bernoulli(
__global float4 *randomnumber,
threefry4x32_ctr_t ctr_i,
float inf, float sup,
float threshold,
uint nrounds, uint numrandom) {
size_t gdx = get_global_id(0);
uint maxUint = 0;
maxUint--;
float r = (float)maxUint;
threefry4x32_ctr_t ctr = ctr_i;
threefry4x32_ukey_t ukey;
ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
threefry4x32_ctr_t random4;
if ( gdx < numrandom ) {
random4 = threefry4x32_R(nrounds, ctr, ukey);
float4 frnd;
frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
randomnumber[gdx] = frnd;
}
}
// **************************
// UNIFORM DISTRIBUTION (float)
// **************************
__kernel void PRNG_threefry4x32_uniform(
__global float4 *randomnumber,
threefry4x32_ctr_t ctr_i,
float inf, float sup,
uint nrounds, uint numrandom) {
size_t gdx = get_global_id(0);
uint maxUint = 0;
maxUint--;
float r = (float)maxUint;
threefry4x32_ctr_t ctr = ctr_i;
threefry4x32_ukey_t ukey;
ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
threefry4x32_ctr_t random4;
if ( gdx < numrandom ) {
random4 = threefry4x32_R(nrounds, ctr, ukey);
float4 frnd;
frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
randomnumber[gdx] = frnd;
}
}
// **************************
// UNIFORM DISTRIBUTION (uint)
// **************************
__kernel void PRNG_threefry4x32_uint_uniform(
__global uint4 *randomnumber,
threefry4x32_ctr_t ctr_i,
uint inf, uint sup,
uint nrounds, uint numrandom) {
size_t gdx = get_global_id(0);
threefry4x32_ctr_t ctr = ctr_i;
threefry4x32_ukey_t ukey;
ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
threefry4x32_ctr_t random4;
if ( gdx < numrandom ) {
random4 = threefry4x32_R(nrounds, ctr, ukey);
uint4 frnd;
frnd.x = random4.v[0] % (sup - inf) + inf;
frnd.y = random4.v[1] % (sup - inf) + inf;
frnd.z = random4.v[2] % (sup - inf) + inf;
frnd.w = random4.v[3] % (sup - inf) + inf;
randomnumber[gdx] = frnd;
}
}
// **************************
// GAUSSIAN DISTRIBUTION
// **************************
__kernel void PRNG_threefry4x32_gaussian(
__global float4 *randomnumber,
threefry4x32_ctr_t ctr_i,
float E, float V,
uint nrounds, uint numrandom) {
size_t gdx = get_global_id(0);
uint maxUint = 0;
maxUint--;
float r = (float)maxUint;
threefry4x32_ctr_t ctr = ctr_i;
threefry4x32_ukey_t ukey1, ukey2;
ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
threefry4x32_ctr_t random1, random2;
if ( gdx < numrandom ) {
random1 = threefry4x32_R(nrounds, ctr, ukey1);
random2 = threefry4x32_R(nrounds, ctr, ukey2);
float4 frnd1;
float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution
float r2 = (((float)random2.v[0]) / r);
float r3 = (((float)random1.v[1]) / r);
float r4 = (((float)random2.v[1]) / r);
float r5 = (((float)random1.v[2]) / r);
float r6 = (((float)random2.v[2]) / r);
float r7 = (((float)random1.v[3]) / r);
float r8 = (((float)random2.v[3]) / r);
if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {
r2 += 0.0001;
r4 += 0.0001;
r6 += 0.0001;
r8 += 0.0001;
}
frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
//frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
//frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
//frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
//frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
randomnumber[gdx] = frnd1;
}
}