test/singa/test_opencl.cc - singa - Git at Google

 /************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/

 #include "gtest/gtest.h"
 #include "singa/core/device.h"
 #include "singa/core/tensor.h"
 #include "singa/proto/core.pb.h"
 using singa::CppCPU;
 using singa::Block;
 using singa::Shape;
 using singa::Tensor;

 #ifdef USE_OPENCL
 using singa::OpenclDevice;
 class OpenCL_TensorMath : public ::testing::Test {
 protected:

   OpenCL_TensorMath() {
     for (int i = 0; i < 4; i++) {
       float4[i] = (float)i;
       float4zero[i] = 0.0f;
     }

     for (int i = 0; i < 16; i++) {
       float16[i] = (float)i;
       float16zero[i] = 0.0f;
     }

     auto ocl_dev = std::make_shared<OpenclDevice>();

     tf4in = Tensor(Shape{1, 4}, ocl_dev);
     tf4in.CopyDataFromHostPtr(float4, 4);

     tf4zin = Tensor(Shape{1, 4}, ocl_dev);
     tf4zin.CopyDataFromHostPtr(float4zero, 4);

     tf16in = Tensor(Shape{4, 4}, ocl_dev);
     tf16in.CopyDataFromHostPtr(float16, 16);

     tf16zin = Tensor(Shape{4, 4}, ocl_dev);
     tf16zin.CopyDataFromHostPtr(float16zero, 16);

     float empty[10000] = {};
     empty10k = Tensor(Shape{10000}, ocl_dev);
     empty10k.CopyDataFromHostPtr(empty, 10000);
   }

   float float4[4];
   float float4zero[4];
   float float16[16];
   float float16zero[16];

   Tensor tf4in, tf16in;
   Tensor tf4zin, tf16zin;
   Tensor empty10k;
 };


 // Makes a float array and fills it with increasing values from 0.
 float* MakeMatrix(const int size) {
   float* mat = new float[size];
   for (int i = 0; i < size; i++)
     mat[i] = i;
   return mat;
 }


 TEST(OpenclDevice, Constructor) {
   OpenclDevice dev;
   EXPECT_EQ(0, dev.id());
 }


 TEST(OpenclDevice, MemoryAllocFree) {
   OpenclDevice dev;
   Block* b = dev.NewBlock(4);
   EXPECT_NE(nullptr, b);
   EXPECT_EQ(4u, b->size());
   dev.FreeBlock(b);
 }

 // Tests for integrity of one round of data transfer to an OpenCL device and back.
 TEST(OpenclDevice, CopyDataToFrom) {
   OpenclDevice dev;
   CppCPU host;

   Block* a = host.NewBlock(4);
   Block* b = dev.NewBlock(4);
   Block* c = host.NewBlock(4);

   // Allocate the Block object on the host.
   char s[] = {'a', 'b', 'c', 'x'};
   host.CopyDataFromHostPtr(a, s, 4);

   // Copy back and forth.
   dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
   dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);

   const char* astr = static_cast<const char*>(c->data());
   EXPECT_EQ('a', astr[0]);
   EXPECT_EQ('b', astr[1]);
   EXPECT_EQ('x', astr[3]);
 }


 TEST(OpenclDevice, DuplicateDataOnDevice) {
   OpenclDevice dev;
   CppCPU host;

   Block* a = host.NewBlock(4);
   Block* b = dev.NewBlock(4);
   Block* c = dev.NewBlock(4);
   Block* d = host.NewBlock(4);

   // Allocate the Block object on the host.
   char s[] = {'a', 'b', 'c', 'x'};
   host.CopyDataFromHostPtr(a, s, 4);

   // Copy to device and duplicate.
   dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
   dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
   dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);

   const char* astr = static_cast<const char*>(d->data());
   EXPECT_EQ('a', astr[0]);
   EXPECT_EQ('b', astr[1]);
   EXPECT_EQ('x', astr[3]);
 }

 // Tensor tests, uses OpenCL_TensorMath class defined above.

 TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(3.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberAbs) {
   tf4in = Abs(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(2.0f, out[2]);
   EXPECT_EQ(3.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberExp) {
   tf4in = Exp(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(exp(0.0f), out[0], 1e-5);
   EXPECT_NEAR(exp(1.0f), out[1], 1e-5);
   EXPECT_NEAR(exp(2.0f), out[2], 1e-5);
   EXPECT_NEAR(exp(3.0f), out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberLog) {
   tf4in = Log(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

 //  EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity.
   EXPECT_NEAR(log(1.0f), out[1], 1e-5);
   EXPECT_NEAR(log(2.0f), out[2], 1e-5);
   EXPECT_NEAR(log(3.0f), out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberReLU) {
   tf4in -= 2.0f;
   Tensor result = ReLU(tf4in);

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(0.0f, out[1], 1e-5);
   EXPECT_NEAR(0.0f, out[2], 1e-5);
   EXPECT_NEAR(1.0f, out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberSigmoid) {
   tf4in = Sigmoid(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5);
   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5);
   EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberSign) {
   tf4in -= 1.0f;

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(-1.0f, out[0], 1e-5);
   EXPECT_NEAR(0.0f, out[1], 1e-5);
   EXPECT_NEAR(1.0f, out[2], 1e-5);
   EXPECT_NEAR(2.0f, out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberSqrt) {
   tf4in = Sqrt(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5);
   EXPECT_NEAR(sqrt(3.0f), out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberSquare) {
   tf4in = Square(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(4.0f, out[2], 1e-5);
   EXPECT_NEAR(9.0f, out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, MemberTanh) {
   tf4in = Tanh(tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(tanh(1.0f), out[1], 1e-5);
   EXPECT_NEAR(tanh(2.0f), out[2], 1e-5);
   EXPECT_NEAR(tanh(3.0f), out[3], 1e-5);
 }


 TEST_F(OpenCL_TensorMath, Sum) {
   Tensor result = Sum(tf4in, 0);

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_NEAR(0.0f, out[0], 1e-5);
   EXPECT_NEAR(1.0f, out[1], 1e-5);
   EXPECT_NEAR(2.0f, out[2], 1e-5);
   EXPECT_NEAR(3.0f, out[3], 1e-5);
 }

 TEST_F(OpenCL_TensorMath, MemberLT) {
   Tensor result = tf4in < 2.0f;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(1.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
   EXPECT_FLOAT_EQ(0.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberLE) {
   Tensor result = tf4in <= 2.0f;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(1.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
   EXPECT_FLOAT_EQ(0.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberGT) {
   Tensor result = tf4in > 2.0f;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
   EXPECT_FLOAT_EQ(1.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberGE) {
   Tensor result = tf4in >= 2.0f;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
   EXPECT_FLOAT_EQ(1.0f, out[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberPow) {
   Tensor result = Pow(tf4in, 2.0f);

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(4.0f, out[2]);
   EXPECT_FLOAT_EQ(9.0f, out[3]);

   result = Pow(tf4in, tf4in);

   result.ToHost();
   const float* out1 = result.data<float>();

   EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently.
   EXPECT_FLOAT_EQ(1.0f, out1[1]);
   EXPECT_FLOAT_EQ(4.0f, out1[2]);
   EXPECT_FLOAT_EQ(27.0f, out1[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberSub) {
   Tensor result = tf4in - tf4zin;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(2.0f, out[2]);
   EXPECT_FLOAT_EQ(3.0f, out[3]);

   result = tf4in - 0.0f;

   result.ToHost();
   const float* out1 = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(1.0f, out1[1]);
   EXPECT_FLOAT_EQ(2.0f, out1[2]);
   EXPECT_FLOAT_EQ(3.0f, out1[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberEltwiseMult) {
   Tensor result = tf4in * tf4zin;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out[0]);
   EXPECT_FLOAT_EQ(0.0f, out[1]);
   EXPECT_FLOAT_EQ(0.0f, out[2]);
   EXPECT_FLOAT_EQ(0.0f, out[3]);

   result = tf4in * 10.0f;

   result.ToHost();
   const float* out1 = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(10.0f, out1[1]);
   EXPECT_FLOAT_EQ(20.0f, out1[2]);
   EXPECT_FLOAT_EQ(30.0f, out1[3]);
 }


 TEST_F(OpenCL_TensorMath, MemberDiv) {
   Tensor result = tf4in / tf4in;

   result.ToHost();
   const float* out = result.data<float>();

 //  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero.
   EXPECT_FLOAT_EQ(1.0f, out[1]);
   EXPECT_FLOAT_EQ(1.0f, out[2]);
   EXPECT_FLOAT_EQ(1.0f, out[3]);

   result = tf4in / 10.0f;

   result.ToHost();
   const float* out1 = result.data<float>();

   EXPECT_FLOAT_EQ(0.0f, out1[0]);
   EXPECT_FLOAT_EQ(0.1f, out1[1]);
   EXPECT_FLOAT_EQ(0.2f, out1[2]);
   EXPECT_FLOAT_EQ(0.3f, out1[3]);

   result = Div(10.0f, tf4in);

   result.ToHost();
   const float* out2 = result.data<float>();

 //  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0.
   EXPECT_FLOAT_EQ(10.0f, out2[1]);
   EXPECT_FLOAT_EQ(5.0f, out2[2]);
   EXPECT_NEAR((10.0f / 3.0f), out2[3], 1e-5);
 }

 // **************************************
 // Random functions
 // **************************************

 TEST_F(OpenCL_TensorMath, Bernoulli) {
   const float p = 0.3f;

   Bernoulli(p, &empty10k);

   empty10k.ToHost();
   const float* out = empty10k.data<float>();

   float sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += out[i];

   float mean = sum / 10000;

   EXPECT_NEAR(mean, p, 1e-2);

   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;

   EXPECT_NEAR(variance, p * (1 - p), 1e-2);
 }


 TEST_F(OpenCL_TensorMath, Gaussian) {
   Gaussian(0.0f, 1.0f, &empty10k);

   empty10k.ToHost();
   const float* out = empty10k.data<float>();

   float sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += out[i];
   float mean = sum / 10000;

   EXPECT_NEAR(mean, 0.0f, 1e-2);

   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;

   EXPECT_NEAR(variance, 1.0f, 1e-2);
 }


 TEST_F(OpenCL_TensorMath, Uniform) {
   Uniform(0.1f, 0.2f, &empty10k);

   empty10k.ToHost();
   const float* out = empty10k.data<float>();

   float sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += out[i];
   float mean = sum / 10000;

   EXPECT_NEAR(mean, 0.15f, 1e-2);

   sum = 0.0f;
   for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
   float variance = sum / 9999;

   EXPECT_NEAR(variance, 0.01f, 1e-2);
 }

 // *********************************************************
 // BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // *********************************************************


 TEST_F(OpenCL_TensorMath, EltwiseAdd) {
   Tensor result = tf4in + tf4in;

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(2.0f, out[1]);
   EXPECT_EQ(4.0f, out[2]);
   EXPECT_EQ(6.0f, out[3]);

   result = tf4in + tf4zin;

   result.ToHost();
   const float* out1 = result.data<float>();

   EXPECT_EQ(0.0f, out1[0]);
   EXPECT_EQ(1.0f, out1[1]);
   EXPECT_EQ(2.0f, out1[2]);
   EXPECT_EQ(3.0f, out1[3]);

   result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type());
   Add(tf4in, tf4in, &result);

   result.ToHost();
   const float* out2 = result.data<float>();

   EXPECT_EQ(0.0f, out2[0]);
   EXPECT_EQ(2.0f, out2[1]);
   EXPECT_EQ(4.0f, out2[2]);
   EXPECT_EQ(6.0f, out2[3]);

   result = tf4in + 1.0f;

   result.ToHost();
   const float* out3 = result.data<float>();

   EXPECT_EQ(1.0f, out3[0]);
   EXPECT_EQ(2.0f, out3[1]);
   EXPECT_EQ(3.0f, out3[2]);
   EXPECT_EQ(4.0f, out3[3]);
 }


 TEST_F(OpenCL_TensorMath, SetValue) {
   const float one_third = 1.0f / 3.0f;
   empty10k.SetValue(one_third);

   empty10k.ToHost();
   const float* out = empty10k.data<float>();

   EXPECT_EQ(one_third, out[0]);
   EXPECT_EQ(one_third, out[1]);
   EXPECT_EQ(one_third, out[1024]);
   EXPECT_EQ(one_third, out[4096]);
   EXPECT_EQ(one_third, out[9998]);
   EXPECT_EQ(one_third, out[9999]);
 }


 TEST_F(OpenCL_TensorMath, Axpy) {
   Axpy(10.0f, tf4in, &tf4in);

   tf4in.ToHost();
   const float* out = tf4in.data<float>();

   EXPECT_EQ(0.0f, out[0]);  // 0 * 10 + 0 = 0
   EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11
   EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22
   EXPECT_EQ(33.0f, out[3]); // 3 * 10 + 3 = 33
 }

 TEST_F(OpenCL_TensorMath, Mult) {
   Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero.

   result.ToHost();
   const float* out = result.data<float>();

   EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1.

   result = Mult(tf4in, tf4in.T());

   result.ToHost();
   const float* out0 = result.data<float>();

   EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1.

   tf16zin.SetValue(10.0f); // Multiply with 10.0.
   result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4.

   result.ToHost();
   const float* out1 = result.data<float>();
   EXPECT_EQ(240.0f, out1[0]);
   EXPECT_EQ(280.0f, out1[1]);
   EXPECT_EQ(320.0f, out1[2]);
   EXPECT_EQ(360.0f, out1[3]);

   EXPECT_EQ(240.0f, out1[4]);
   EXPECT_EQ(280.0f, out1[5]);
   EXPECT_EQ(320.0f, out1[6]);
   EXPECT_EQ(360.0f, out1[7]);

   EXPECT_EQ(240.0f, out1[8]);
   EXPECT_EQ(280.0f, out1[9]);
   EXPECT_EQ(320.0f, out1[10]);
   EXPECT_EQ(360.0f, out1[11]);

   EXPECT_EQ(240.0f, out1[12]);
   EXPECT_EQ(280.0f, out1[13]);
   EXPECT_EQ(320.0f, out1[14]);
   EXPECT_EQ(360.0f, out1[15]);
 }


 // TODO: ComputeCrossEntropy, SoftmaxCrossEntropy
 //
 #endif  // USE_OPENCL
	/************************************************************
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*************************************************************/

	#include "gtest/gtest.h"
	#include "singa/core/device.h"
	#include "singa/core/tensor.h"
	#include "singa/proto/core.pb.h"
	using singa::CppCPU;
	using singa::Block;
	using singa::Shape;
	using singa::Tensor;

	#ifdef USE_OPENCL
	using singa::OpenclDevice;
	class OpenCL_TensorMath : public ::testing::Test {
	protected:

	OpenCL_TensorMath() {
	for (int i = 0; i < 4; i++) {
	float4[i] = (float)i;
	float4zero[i] = 0.0f;
	}

	for (int i = 0; i < 16; i++) {
	float16[i] = (float)i;
	float16zero[i] = 0.0f;
	}

	auto ocl_dev = std::make_shared<OpenclDevice>();

	tf4in = Tensor(Shape{1, 4}, ocl_dev);
	tf4in.CopyDataFromHostPtr(float4, 4);

	tf4zin = Tensor(Shape{1, 4}, ocl_dev);
	tf4zin.CopyDataFromHostPtr(float4zero, 4);

	tf16in = Tensor(Shape{4, 4}, ocl_dev);
	tf16in.CopyDataFromHostPtr(float16, 16);

	tf16zin = Tensor(Shape{4, 4}, ocl_dev);
	tf16zin.CopyDataFromHostPtr(float16zero, 16);

	float empty[10000] = {};
	empty10k = Tensor(Shape{10000}, ocl_dev);
	empty10k.CopyDataFromHostPtr(empty, 10000);
	}

	float float4[4];
	float float4zero[4];
	float float16[16];
	float float16zero[16];

	Tensor tf4in, tf16in;
	Tensor tf4zin, tf16zin;
	Tensor empty10k;
	};


	// Makes a float array and fills it with increasing values from 0.
	float* MakeMatrix(const int size) {
	float* mat = new float[size];
	for (int i = 0; i < size; i++)
	mat[i] = i;
	return mat;
	}


	TEST(OpenclDevice, Constructor) {
	OpenclDevice dev;
	EXPECT_EQ(0, dev.id());
	}


	TEST(OpenclDevice, MemoryAllocFree) {
	OpenclDevice dev;
	Block* b = dev.NewBlock(4);
	EXPECT_NE(nullptr, b);
	EXPECT_EQ(4u, b->size());
	dev.FreeBlock(b);
	}

	// Tests for integrity of one round of data transfer to an OpenCL device and back.
	TEST(OpenclDevice, CopyDataToFrom) {
	OpenclDevice dev;
	CppCPU host;

	Block* a = host.NewBlock(4);
	Block* b = dev.NewBlock(4);
	Block* c = host.NewBlock(4);

	// Allocate the Block object on the host.
	char s[] = {'a', 'b', 'c', 'x'};
	host.CopyDataFromHostPtr(a, s, 4);

	// Copy back and forth.
	dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
	dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);

	const char* astr = static_cast<const char*>(c->data());
	EXPECT_EQ('a', astr[0]);
	EXPECT_EQ('b', astr[1]);
	EXPECT_EQ('x', astr[3]);
	}


	TEST(OpenclDevice, DuplicateDataOnDevice) {
	OpenclDevice dev;
	CppCPU host;

	Block* a = host.NewBlock(4);
	Block* b = dev.NewBlock(4);
	Block* c = dev.NewBlock(4);
	Block* d = host.NewBlock(4);

	// Allocate the Block object on the host.
	char s[] = {'a', 'b', 'c', 'x'};
	host.CopyDataFromHostPtr(a, s, 4);

	// Copy to device and duplicate.
	dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
	dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
	dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);

	const char* astr = static_cast<const char*>(d->data());
	EXPECT_EQ('a', astr[0]);
	EXPECT_EQ('b', astr[1]);
	EXPECT_EQ('x', astr[3]);
	}

	// Tensor tests, uses OpenCL_TensorMath class defined above.

	TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_EQ(1.0f, out[1]);
	EXPECT_EQ(3.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberAbs) {
	tf4in = Abs(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_EQ(0.0f, out[0]);
	EXPECT_EQ(1.0f, out[1]);
	EXPECT_EQ(2.0f, out[2]);
	EXPECT_EQ(3.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberExp) {
	tf4in = Exp(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(exp(0.0f), out[0], 1e-5);
	EXPECT_NEAR(exp(1.0f), out[1], 1e-5);
	EXPECT_NEAR(exp(2.0f), out[2], 1e-5);
	EXPECT_NEAR(exp(3.0f), out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberLog) {
	tf4in = Log(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	// EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity.
	EXPECT_NEAR(log(1.0f), out[1], 1e-5);
	EXPECT_NEAR(log(2.0f), out[2], 1e-5);
	EXPECT_NEAR(log(3.0f), out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberReLU) {
	tf4in -= 2.0f;
	Tensor result = ReLU(tf4in);

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_NEAR(0.0f, out[0], 1e-5);
	EXPECT_NEAR(0.0f, out[1], 1e-5);
	EXPECT_NEAR(0.0f, out[2], 1e-5);
	EXPECT_NEAR(1.0f, out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberSigmoid) {
	tf4in = Sigmoid(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5);
	EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5);
	EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberSign) {
	tf4in -= 1.0f;

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(-1.0f, out[0], 1e-5);
	EXPECT_NEAR(0.0f, out[1], 1e-5);
	EXPECT_NEAR(1.0f, out[2], 1e-5);
	EXPECT_NEAR(2.0f, out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberSqrt) {
	tf4in = Sqrt(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(0.0f, out[0], 1e-5);
	EXPECT_NEAR(1.0f, out[1], 1e-5);
	EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5);
	EXPECT_NEAR(sqrt(3.0f), out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberSquare) {
	tf4in = Square(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(0.0f, out[0], 1e-5);
	EXPECT_NEAR(1.0f, out[1], 1e-5);
	EXPECT_NEAR(4.0f, out[2], 1e-5);
	EXPECT_NEAR(9.0f, out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, MemberTanh) {
	tf4in = Tanh(tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_NEAR(0.0f, out[0], 1e-5);
	EXPECT_NEAR(tanh(1.0f), out[1], 1e-5);
	EXPECT_NEAR(tanh(2.0f), out[2], 1e-5);
	EXPECT_NEAR(tanh(3.0f), out[3], 1e-5);
	}


	TEST_F(OpenCL_TensorMath, Sum) {
	Tensor result = Sum(tf4in, 0);

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_NEAR(0.0f, out[0], 1e-5);
	EXPECT_NEAR(1.0f, out[1], 1e-5);
	EXPECT_NEAR(2.0f, out[2], 1e-5);
	EXPECT_NEAR(3.0f, out[3], 1e-5);
	}

	TEST_F(OpenCL_TensorMath, MemberLT) {
	Tensor result = tf4in < 2.0f;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(1.0f, out[0]);
	EXPECT_FLOAT_EQ(1.0f, out[1]);
	EXPECT_FLOAT_EQ(0.0f, out[2]);
	EXPECT_FLOAT_EQ(0.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberLE) {
	Tensor result = tf4in <= 2.0f;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(1.0f, out[0]);
	EXPECT_FLOAT_EQ(1.0f, out[1]);
	EXPECT_FLOAT_EQ(1.0f, out[2]);
	EXPECT_FLOAT_EQ(0.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberGT) {
	Tensor result = tf4in > 2.0f;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out[0]);
	EXPECT_FLOAT_EQ(0.0f, out[1]);
	EXPECT_FLOAT_EQ(0.0f, out[2]);
	EXPECT_FLOAT_EQ(1.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberGE) {
	Tensor result = tf4in >= 2.0f;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out[0]);
	EXPECT_FLOAT_EQ(0.0f, out[1]);
	EXPECT_FLOAT_EQ(1.0f, out[2]);
	EXPECT_FLOAT_EQ(1.0f, out[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberPow) {
	Tensor result = Pow(tf4in, 2.0f);

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out[0]);
	EXPECT_FLOAT_EQ(1.0f, out[1]);
	EXPECT_FLOAT_EQ(4.0f, out[2]);
	EXPECT_FLOAT_EQ(9.0f, out[3]);

	result = Pow(tf4in, tf4in);

	result.ToHost();
	const float* out1 = result.data<float>();

	EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently.
	EXPECT_FLOAT_EQ(1.0f, out1[1]);
	EXPECT_FLOAT_EQ(4.0f, out1[2]);
	EXPECT_FLOAT_EQ(27.0f, out1[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberSub) {
	Tensor result = tf4in - tf4zin;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out[0]);
	EXPECT_FLOAT_EQ(1.0f, out[1]);
	EXPECT_FLOAT_EQ(2.0f, out[2]);
	EXPECT_FLOAT_EQ(3.0f, out[3]);

	result = tf4in - 0.0f;

	result.ToHost();
	const float* out1 = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out1[0]);
	EXPECT_FLOAT_EQ(1.0f, out1[1]);
	EXPECT_FLOAT_EQ(2.0f, out1[2]);
	EXPECT_FLOAT_EQ(3.0f, out1[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberEltwiseMult) {
	Tensor result = tf4in * tf4zin;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out[0]);
	EXPECT_FLOAT_EQ(0.0f, out[1]);
	EXPECT_FLOAT_EQ(0.0f, out[2]);
	EXPECT_FLOAT_EQ(0.0f, out[3]);

	result = tf4in * 10.0f;

	result.ToHost();
	const float* out1 = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out1[0]);
	EXPECT_FLOAT_EQ(10.0f, out1[1]);
	EXPECT_FLOAT_EQ(20.0f, out1[2]);
	EXPECT_FLOAT_EQ(30.0f, out1[3]);
	}


	TEST_F(OpenCL_TensorMath, MemberDiv) {
	Tensor result = tf4in / tf4in;

	result.ToHost();
	const float* out = result.data<float>();

	// EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero.
	EXPECT_FLOAT_EQ(1.0f, out[1]);
	EXPECT_FLOAT_EQ(1.0f, out[2]);
	EXPECT_FLOAT_EQ(1.0f, out[3]);

	result = tf4in / 10.0f;

	result.ToHost();
	const float* out1 = result.data<float>();

	EXPECT_FLOAT_EQ(0.0f, out1[0]);
	EXPECT_FLOAT_EQ(0.1f, out1[1]);
	EXPECT_FLOAT_EQ(0.2f, out1[2]);
	EXPECT_FLOAT_EQ(0.3f, out1[3]);

	result = Div(10.0f, tf4in);

	result.ToHost();
	const float* out2 = result.data<float>();

	// EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0.
	EXPECT_FLOAT_EQ(10.0f, out2[1]);
	EXPECT_FLOAT_EQ(5.0f, out2[2]);
	EXPECT_NEAR((10.0f / 3.0f), out2[3], 1e-5);
	}

	// **************************************
	// Random functions
	// **************************************

	TEST_F(OpenCL_TensorMath, Bernoulli) {
	const float p = 0.3f;

	Bernoulli(p, &empty10k);

	empty10k.ToHost();
	const float* out = empty10k.data<float>();

	float sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += out[i];

	float mean = sum / 10000;

	EXPECT_NEAR(mean, p, 1e-2);

	sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
	float variance = sum / 9999;

	EXPECT_NEAR(variance, p * (1 - p), 1e-2);
	}


	TEST_F(OpenCL_TensorMath, Gaussian) {
	Gaussian(0.0f, 1.0f, &empty10k);

	empty10k.ToHost();
	const float* out = empty10k.data<float>();

	float sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += out[i];
	float mean = sum / 10000;

	EXPECT_NEAR(mean, 0.0f, 1e-2);

	sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
	float variance = sum / 9999;

	EXPECT_NEAR(variance, 1.0f, 1e-2);
	}


	TEST_F(OpenCL_TensorMath, Uniform) {
	Uniform(0.1f, 0.2f, &empty10k);

	empty10k.ToHost();
	const float* out = empty10k.data<float>();

	float sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += out[i];
	float mean = sum / 10000;

	EXPECT_NEAR(mean, 0.15f, 1e-2);

	sum = 0.0f;
	for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
	float variance = sum / 9999;

	EXPECT_NEAR(variance, 0.01f, 1e-2);
	}

	// *********************************************************
	// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
	// *********************************************************


	TEST_F(OpenCL_TensorMath, EltwiseAdd) {
	Tensor result = tf4in + tf4in;

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_EQ(0.0f, out[0]);
	EXPECT_EQ(2.0f, out[1]);
	EXPECT_EQ(4.0f, out[2]);
	EXPECT_EQ(6.0f, out[3]);

	result = tf4in + tf4zin;

	result.ToHost();
	const float* out1 = result.data<float>();

	EXPECT_EQ(0.0f, out1[0]);
	EXPECT_EQ(1.0f, out1[1]);
	EXPECT_EQ(2.0f, out1[2]);
	EXPECT_EQ(3.0f, out1[3]);

	result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type());
	Add(tf4in, tf4in, &result);

	result.ToHost();
	const float* out2 = result.data<float>();

	EXPECT_EQ(0.0f, out2[0]);
	EXPECT_EQ(2.0f, out2[1]);
	EXPECT_EQ(4.0f, out2[2]);
	EXPECT_EQ(6.0f, out2[3]);

	result = tf4in + 1.0f;

	result.ToHost();
	const float* out3 = result.data<float>();

	EXPECT_EQ(1.0f, out3[0]);
	EXPECT_EQ(2.0f, out3[1]);
	EXPECT_EQ(3.0f, out3[2]);
	EXPECT_EQ(4.0f, out3[3]);
	}


	TEST_F(OpenCL_TensorMath, SetValue) {
	const float one_third = 1.0f / 3.0f;
	empty10k.SetValue(one_third);

	empty10k.ToHost();
	const float* out = empty10k.data<float>();

	EXPECT_EQ(one_third, out[0]);
	EXPECT_EQ(one_third, out[1]);
	EXPECT_EQ(one_third, out[1024]);
	EXPECT_EQ(one_third, out[4096]);
	EXPECT_EQ(one_third, out[9998]);
	EXPECT_EQ(one_third, out[9999]);
	}


	TEST_F(OpenCL_TensorMath, Axpy) {
	Axpy(10.0f, tf4in, &tf4in);

	tf4in.ToHost();
	const float* out = tf4in.data<float>();

	EXPECT_EQ(0.0f, out[0]); // 0 * 10 + 0 = 0
	EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11
	EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22
	EXPECT_EQ(33.0f, out[3]); // 3 * 10 + 3 = 33
	}

	TEST_F(OpenCL_TensorMath, Mult) {
	Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero.

	result.ToHost();
	const float* out = result.data<float>();

	EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1.

	result = Mult(tf4in, tf4in.T());

	result.ToHost();
	const float* out0 = result.data<float>();

	EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1.

	tf16zin.SetValue(10.0f); // Multiply with 10.0.
	result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4.

	result.ToHost();
	const float* out1 = result.data<float>();
	EXPECT_EQ(240.0f, out1[0]);
	EXPECT_EQ(280.0f, out1[1]);
	EXPECT_EQ(320.0f, out1[2]);
	EXPECT_EQ(360.0f, out1[3]);

	EXPECT_EQ(240.0f, out1[4]);
	EXPECT_EQ(280.0f, out1[5]);
	EXPECT_EQ(320.0f, out1[6]);
	EXPECT_EQ(360.0f, out1[7]);

	EXPECT_EQ(240.0f, out1[8]);
	EXPECT_EQ(280.0f, out1[9]);
	EXPECT_EQ(320.0f, out1[10]);
	EXPECT_EQ(360.0f, out1[11]);

	EXPECT_EQ(240.0f, out1[12]);
	EXPECT_EQ(280.0f, out1[13]);
	EXPECT_EQ(320.0f, out1[14]);
	EXPECT_EQ(360.0f, out1[15]);
	}



	// TODO: ComputeCrossEntropy, SoftmaxCrossEntropy
	//
	#endif // USE_OPENCL