| /************************************************************ |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| #ifndef SINGA_UTILS_MATH_BLOB_H_ |
| #define SINGA_UTILS_MATH_BLOB_H_ |
| |
| #include <vector> |
| #include <algorithm> |
| #include <thread> |
| #include "singa/utils/blob.h" |
| #include "singa/utils/singa_op.h" |
| #include "singa/utils/math_addr.h" |
| #include "singa/utils/singleton.h" |
| #include "singa/utils/context.h" |
| |
| namespace singa { |
| |
| #define NO_GPU LOG(FATAL) << "Not compiled with GPU"; |
/**
 * \file math_blob.h is not tested thoroughly.
 * Only GEMM(), MMDot(), MVSumRow() and MVAddRow() are used now.
 */
| /************* BLAS level 1 *****************/ |
/**
 * Scale each element of B in place, i.e., Bi = alpha*Bi.
 * Uses BLAS scal internally.
 *
 * Dispatches to CPU or GPU depending on the device the calling thread is
 * bound to in the global Context (device < 0 means CPU).
 *
 * @param[in] alpha scaling factor
 * @param[in, out] B blob scaled in place
 */
template<typename Dtype>
void Scale(Dtype alpha, Blob<Dtype> * B) {
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_scale(B->count(), alpha, B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_scale(context->cublas_handle(device), B->count(), alpha,
        B->mutable_gpu_data());
#else
    NO_GPU;
#endif
  }
}
| |
| /** |
| * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size |
| */ |
| template<typename Dtype> |
| void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { |
| CHECK_EQ(A.count(), B->count()); |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(), |
| B->mutable_gpu_data()); |
| #else |
| NO_GPU; |
| #endif |
| } |
| } |
| |
| /************* BLAS level 2 *****************/ |
/**
 * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
 * Shape checking:
 * - A must have exactly 2 dimensions (CHECK_EQ below)
 * - row of A is shape(0) (no transpose)
 * - columns of A(.T) == B.count()
 * - rows of A(.T) == C.count()
 *
 * @param[in] alpha
 * @param[in] beta
 * @param[in] A, matrix
 * @param[in] B, vector
 * @param[in, out] C, vector
 */
template<typename Dtype>
void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
    const Blob<Dtype>& B, Blob<Dtype>* C) {
  CHECK_EQ(A.shape().size(), 2);
  // a1/a2 are the rows/columns of op(A), i.e., of A after optional transpose.
  int a1, a2, m, n;
  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
  a2 = A.transpose() ? A.shape(0) : A.count() / A.shape(0);
  m = B.count();
  n = C->count();
  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";

  bool TranA = A.transpose();
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
        C->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
        alpha, beta, TranA, C->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
/**
 * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
 * Thin wrapper over GEMV with alpha = 1 and beta = 0; all shape checking is
 * delegated to GEMV (A must be a 2-D matrix).
 *
 * @param[in] A input matrix
 * @param[in] B input vector
 * @param[out] C output vector (overwritten)
 */
template <typename Dtype>
void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
    Blob<Dtype>* C) {
  GEMV(Dtype(1), Dtype(0), A, B, C);
}
| |
| /************* BLAS level 3 *****************/ |
/**
 * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
 *
 * Transpose is considered for A and B (not for C).
 * Loose shape checking:
 * - the first dimension is row (no transpose) or col (with transpose) size
 * - shapes match for matrix multiplication
 *
 * @param[in] alpha
 * @param[in] beta
 * @param[in] A, matrix
 * @param[in] B, matrix
 * @param[in, out] C, matrix (must not be transposed)
 */
template <typename Dtype>
void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
    Blob<Dtype> * C) {
  CHECK_GE(A.shape().size(), 2);
  CHECK_GE(B.shape().size(), 2);
  CHECK_GE(C->shape().size(), 2);
  // a1/a2 (b1/b2) are the rows/columns of op(A) (op(B)) after transpose.
  int a1, a2, b1, b2, m, n;
  CHECK(!C->transpose());
  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
  a2 = A.count() / a1;
  b1 = B.transpose() ? B.count() /B.shape(0) : B.shape(0);
  b2 = B.count() / b1;
  m = C->shape(0);
  n = C->count() / m;
  // op(A) is m x k, op(B) is k x n, C is m x n.
  CHECK_EQ(a2, b1);
  CHECK_EQ(a1, m);
  CHECK_EQ(b2, n);

  int k = a2;
  bool TranA = A.transpose();
  bool TranB = B.transpose();
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB,
        C->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
        m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
/**
 * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
 * Thin wrapper over GEMM with alpha = 1 and beta = 0; all shape checking
 * (matrix dimensions and multiplication compatibility) is done by GEMM.
 *
 * @param[in] A input matrix
 * @param[in] B input matrix
 * @param[out] C output matrix (overwritten)
 */
template <typename Dtype>
void MMDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
    Blob<Dtype>* C) {
  GEMM(Dtype(1), Dtype(0), A, B, C);
}
| |
| |
| /*********************** Inner and Outer product****************************/ |
| /** |
| * Inner product for two vectors. |
| * Loose shape checking, A.count() == B.count. |
| * |
| * @param[in] A, input vector (shape checking using A.count()). |
| * @param[in] B, input vector (shape checking using B.count()). |
| * @return inner product value. |
| */ |
| template <typename Dtype> |
| Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) { |
| Dtype res = 0; |
| CHECK_EQ(A.count(), B.count()); |
| int n = A.count(); |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| res = cpu_dot(n, A.cpu_data(), B.cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| res = gpu_dot(context->cublas_handle(device), n, A.gpu_data(), |
| B.gpu_data()); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| return res; |
| } |
| |
/**
 * Outer product, C = A ** B; transpose of C is not supported.
 * Loose shape checking: A.count() * B.count() == C.count().
 * Implemented as a GEMM with inner dimension k = 1.
 *
 * @param[in] A, input vector (length m)
 * @param[in] B, input vector (length n)
 * @param[out] C, output m x n matrix (overwritten)
 */
template <typename Dtype>
void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
  CHECK(!C->transpose());  // do not support C.T now.

  int m = A.count();
  int n = B.count();
  CHECK_EQ(C->count(), m * n);
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0), false,
        false, C->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
        m, n, 1, Dtype(1), Dtype(0), false, false, C->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
| /*********************** Element-wise functions ***********************/ |
| /** |
| * Apply the function from Op for each element in A and put the result into B, |
| * i.e., Bi = Op(Ai). |
| * Loose shape checking, A.count() == B.count(). |
| */ |
| template<typename Op, typename Dtype> |
| void Map(const Blob<Dtype> & A, Blob<Dtype> * B) { |
| CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| } |
| |
| /** |
| * Apply the function from Op for each element in A and B, and put the result |
| * into C, i.e., Ci = Op(Ai, Bi). |
| * Loose shape checking, A, B and C are of the same size. |
| */ |
| template<typename Op, typename Dtype> |
| void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { |
| CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size"; |
| CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size"; |
| // cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data()); |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data()); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| } |
| |
/**
 * Bi = Op(alpha, Ai), i.e., apply Op with the scalar alpha to each element.
 * Loose shape checking: A.count() == B.count().
 */
template<typename Op, typename Dtype>
void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_e_f<Op>(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
| /** |
| * Ci = Op(alpha, Ai, Bi) |
| * Loose shape checking, A, B and C are of the same size. |
| */ |
| template<typename Op, typename Dtype> |
| void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B, |
| Blob<Dtype>* C) { |
| CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(), |
| C->mutable_cpu_data()); |
| } else { |
| // TODO(wangwei) implement gpu version. |
| NO_GPU; |
| } |
| } |
| |
/**
 * Copy the contents of A into B.
 *
 * Currently uses std::copy on CPU, which has shown better performance than
 * memcpy (http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy).
 * TODO(wangwei) test blas copy vs std::copy.
 * On GPU uses cudaMemcpy with cudaMemcpyDefault, letting the CUDA runtime
 * infer the transfer direction from the pointers.
 *
 * Loose shape checking: A.count() == B.count().
 */
template<typename Dtype>
void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(B->mutable_gpu_data()),
        A.gpu_data(), sizeof(Dtype) * A.count(), cudaMemcpyDefault));
#else
    NO_GPU;
#endif
  }
}
| |
| |
/**
 * B = alpha + A (element-wise).
 * Implemented using Map with singa::op::Add (not Copy/AXPY).
 */
template<typename Dtype>
void Add(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
  Map<singa::op::Add<Dtype>, Dtype>(alpha, A, B);
}
| |
/**
 * C = A + B (element-wise).
 * Implemented using Copy (C = A) followed by AXPY (C += 1 * B).
 */
template<typename Dtype>
void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
    Blob<Dtype> * C) {
  Copy(A, C);
  AXPY(Dtype(1), B, C);
}
| |
/**
 * B = alpha - A (element-wise).
 * Implemented using Map with singa::op::Sub (not Copy/AXPY).
 * NOTE(review): the exact operand order is defined by singa::op::Sub in
 * singa_op.h -- confirm it computes alpha - Ai rather than Ai - alpha.
 */
template<typename Dtype>
void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) {
  Map<singa::op::Sub<Dtype>, Dtype>(alpha, A, B);
}
| |
/**
 * C = A - B (element-wise).
 * Implemented using Copy (C = A) followed by AXPY (C += -1 * B).
 */
template<typename Dtype>
void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
    Blob<Dtype> * C) {
  Copy(A, C);
  AXPY(Dtype(-1), B, C);
}
| |
/**
 * C = A * B (element-wise), implemented using
 * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
 */
template<typename Dtype>
void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
    Blob<Dtype> * C) {
  Map<singa::op::Mult<Dtype>, Dtype>(A, B, C);
  // TODO(wangwei) use MKL's vector func
}
| |
/**
 * C = A / B (element-wise), implemented using
 * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
 * No check for zero elements in B is performed here.
 */
template<typename Dtype>
void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
    Blob<Dtype> * C) {
  Map<singa::op::Div<Dtype>, Dtype>(A, B, C);
  // TODO(wangwei) use MKL's vector func
}
/**
 * B = sqrt(A), element-wise via Map with singa::op::Sqrt.
 */
template<typename Dtype>
void Sqrt(const Blob<Dtype> & A, Blob<Dtype>* B) {
  Map<singa::op::Sqrt<Dtype>, Dtype>(A, B);
}
/**
 * B = square(A), element-wise via Map with singa::op::Square.
 */
template<typename Dtype>
void Square(const Blob<Dtype> & A, Blob<Dtype>* B) {
  Map<singa::op::Square<Dtype>, Dtype>(A, B);
}
/**
 * B = exp(A), element-wise via Map with singa::op::Exp.
 */
template<typename Dtype>
void Exp(const Blob<Dtype> & A, Blob<Dtype>* B) {
  Map<singa::op::Exp<Dtype>, Dtype>(A, B);
}
/**
 * B = log(A), element-wise via Map with singa::op::Log.
 */
template<typename Dtype>
void Log(const Blob<Dtype>& A, Blob<Dtype>* B) {
  Map<singa::op::Log<Dtype>, Dtype>(A, B);
}
/**
 * B = tanh(A), element-wise via Map with singa::op::Tanh.
 */
template<typename Dtype>
void Tanh(const Blob<Dtype>& A, Blob<Dtype>* B) {
  Map<singa::op::Tanh<Dtype>, Dtype>(A, B);
}
| /*************************1D<-->2D op/transform***************************/ |
| /** |
| * Add A to each column of B, i.e., Bij = alpha*Ai + beta*Bij |
| * Loose shape checking, B.count() % A.count() == 0. |
| * # columns of B = B.count() / A.count(). |
| */ |
| template<typename Dtype> |
| void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { |
| if (B->transpose()) { |
| B->set_transpose(false); |
| MVAddRow(alpha, beta, A, B); |
| B->set_transpose(true); |
| } else { |
| CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A"; |
| int m = A.count(), n = B->count() / m; |
| Blob<Dtype> one(n); |
| one.SetValue(1); |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, false, false, |
| B->mutable_cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m, |
| n, 1, alpha, beta, false, false, B->mutable_gpu_data()); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| } |
| } |
/**
 * Add A to each column of B, i.e., Bij = Ai + Bij.
 * Convenience wrapper over MVAddCol with alpha = beta = 1.
 * Loose shape checking: B.count() % A.count() == 0.
 * # columns of B = B.count() / A.count().
 */
template<typename Dtype>
void MVAddCol(const Blob<Dtype> & A, Blob<Dtype>* B) {
  MVAddCol(Dtype(1), Dtype(1), A, B);
}
| |
/**
 * Add A to each row of B, i.e., Bij = alpha*Aj + beta*Bij.
 * Loose shape checking: B.count() % A.count() == 0.
 * Hence A.count() is the number of columns of B and
 * # rows of B = B.count() / A.count().
 *
 * NOTE(review): the GPU path calls singa_gpu_add_vec_row without alpha or
 * beta, so it appears to support only alpha = beta = 1 -- confirm against
 * the kernel before relying on other coefficients on GPU.
 */
template<typename Dtype>
void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
  if (B->transpose()) {
    // A transposed B swaps rows and columns; temporarily clear the flag and
    // reuse the column-wise variant, then restore it.
    B->set_transpose(false);
    MVAddCol(alpha, beta, A, B);
    B->set_transpose(true);
  } else {
    CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
    int n = A.count(), m = B->count() / n;
    auto context = Singleton<Context>::Instance();
    // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
    int device = context->device_id(std::this_thread::get_id());
    if (device < 0) {
      // Outer product ones(m, 1) * A broadcasts A across the m rows.
      Blob<Dtype> one(m);
      one.SetValue(1);
      cpu_gemm(one.cpu_data(), A.cpu_data(), m, n, 1, alpha, beta,
          false, false, B->mutable_cpu_data());
    } else {
#ifdef USE_GPU
      singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
          m, n, n);
#else
      NO_GPU;
#endif  // USE_GPU
    }
  }
}
/**
 * Add A to each row of B, i.e., Bij = Aj + Bij.
 * Convenience wrapper over MVAddRow with alpha = beta = 1.
 * Loose shape checking: B.count() % A.count() == 0.
 * # rows of B = B.count() / A.count().
 */
template<typename Dtype>
void MVAddRow(const Blob<Dtype> & A, Blob<Dtype>* B) {
  MVAddRow(Dtype(1), Dtype(1), A, B);
}
| |
/**
 * Copy A into each column of B, i.e., Bij = Ai.
 * Implemented as MVAddCol with alpha = 1, beta = 0 (overwrites B).
 * Loose shape checking: B.count() % A.count() == 0,
 * # columns of B = B.count() / A.count().
 */
template<typename Dtype>
void RepmatCol(const Blob<Dtype> & A, Blob<Dtype> * B) {
  MVAddCol(Dtype(1), Dtype(0), A, B);
}

/**
 * Copy A into each row of B, i.e., Bij = Aj.
 * Implemented as MVAddRow with alpha = 1, beta = 0 (overwrites B).
 * Loose shape checking: B.count() % A.count() == 0,
 * # rows of B = B.count() / A.count().
 */
template<typename Dtype>
void RepmatRow(const Blob<Dtype> & A, Blob<Dtype> * B) {
  MVAddRow(Dtype(1), Dtype(0), A, B);
}
| |
| /** |
| * Sum all columns of matrix A to a column vector B, |
| * i.e., Bi = \sum_j {alpha*Aij}+beta*Bi |
| * Loose shape checking, A.count() % B.count() == 0. |
| * # columns of A = A.count() / B.count(). |
| */ |
| template<typename Dtype> |
| void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { |
| CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; |
| int m = B->count(), n = A.count() / m; |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| Blob<Dtype> one(n); |
| one.SetValue(1); |
| cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta, |
| A.transpose(), false, B->mutable_cpu_data()); |
| } else { |
| #ifdef USE_GPU |
| singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| } |
| |
/**
 * Sum all rows of matrix A into a row vector B,
 * i.e., Bj = \sum_i {alpha*Aij} + beta*Bj.
 * Loose shape checking: A.count() % B.count() == 0.
 * B.count() is the number of columns of A;
 * # rows of A = A.count() / B.count().
 *
 * NOTE(review): the GPU path calls singa_gpu_sum_row without alpha or beta;
 * it appears to assume alpha = 1 and beta = 0 -- confirm against the kernel.
 */
template<typename Dtype>
void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
  int n = B->count(), m = A.count() / n;
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    // Column sums via GEMM: ones(1, m) * (m x n) -> (1 x n).
    Blob<Dtype> one(m);
    one.SetValue(1);
    cpu_gemm(one.cpu_data(), A.cpu_data(), 1, n, m, alpha, beta, false,
        A.transpose(), B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    singa_gpu_sum_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
| |
/**
 * Reduce each row of A to a single element of B using Op.
 * Loose shape checking: A.count() % B.count() == 0.
 * B.count() is the number of rows; # columns of A = A.count() / B.count().
 */
template<typename Op, typename Dtype>
void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
  CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length";
  int m = B->count(), n = A.count() / m;
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
/**
 * Duplicate each element of A into a row of B using Op.
 * Loose shape checking: B.count() % A.count() == 0.
 * A.count() is the number of rows; # columns of B = B.count() / A.count().
 */
template<typename Op, typename Dtype>
void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
  CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
  int m = A.count(), n = B->count() / m;
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
#else
    NO_GPU;
#endif  // USE_GPU
  }
}
| |
/**
 * Average of the absolute values of A's elements.
 * Note: despite the BLAS-style name, this divides the asum result by
 * A.count(), returning the mean absolute value, and returns 0 for an
 * empty blob.
 */
template<typename Dtype>
Dtype Asum(const Blob<Dtype>& A) {
  if (A.count() == 0) return Dtype(0);  // avoid division by zero below
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  Dtype ret = Dtype(0);
  if (device < 0) {
    ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
  } else {
#ifdef USE_GPU
    ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
        / A.count();
#else
    NO_GPU;
#endif
  }
  return ret;
}
| |
| |
| /*************Random Sample***************/ |
/**
 * Fill A with samples drawn uniformly from [low, high], using the
 * per-thread random generator registered in the global Context
 * (host RNG on CPU, cuRAND generator on GPU).
 *
 * @param[in] low lower bound of the uniform range
 * @param[in] high upper bound of the uniform range
 * @param[out] A blob to fill (overwritten)
 */
template<typename Dtype>
void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
  auto context = Singleton<Context>::Instance();
  const auto& thread = std::this_thread::get_id();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(thread);
  if (device < 0) {
    cpu_sample_uniform(*context->rand_generator(thread), A->count(), low, high,
        A->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
        A->mutable_gpu_data());
#else
    NO_GPU;
#endif
  }
}
| |
/**
 * Fill A with samples drawn from a Gaussian with the given mean and
 * standard deviation, using the per-thread random generator registered in
 * the global Context (host RNG on CPU, cuRAND generator on GPU).
 *
 * @param[in] mean mean of the distribution
 * @param[in] std standard deviation of the distribution
 * @param[out] A blob to fill (overwritten)
 */
template<typename Dtype>
void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
  auto context = Singleton<Context>::Instance();
  const auto& thread = std::this_thread::get_id();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(thread);
  if (device < 0) {
    cpu_sample_gaussian(*context->rand_generator(thread), A->count(), mean, std,
        A->mutable_cpu_data());
  } else {
#ifdef USE_GPU
    gpu_sample_gaussian(context->curand_generator(thread), A->count(),
        mean, std, A->mutable_gpu_data());
#else
    NO_GPU;
#endif
  }
}
| |
| /************** Other functions ****************/ |
/**
 * Row-wise softmax: A is treated as an (nb_rows x A.count()/nb_rows)
 * matrix and softmax is applied over each row, writing the result into B.
 *
 * Only the CPU path is implemented; the GPU path aborts via NO_GPU.
 *
 * @param[in] nb_rows number of rows (must be > 0 and divide A.count())
 * @param[in] A input matrix
 * @param[out] B output matrix, same size as A
 */
template<typename Dtype>
void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
  CHECK_GT(nb_rows, 0);
  CHECK_EQ(A.count() % nb_rows, 0);
  CHECK_EQ(A.count(), B->count());
  auto context = Singleton<Context>::Instance();
  // device id < 0 denotes CPU; >= 0 is a GPU ordinal.
  int device = context->device_id(std::this_thread::get_id());
  if (device < 0) {
    cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
        B->mutable_cpu_data());
  } else {
    // TODO(wangwei) implement the GPU version.
    NO_GPU;
  }
}
| |
| template<typename Dtype> |
| void Zero(Blob<Dtype>* B) { |
| auto context = Singleton<Context>::Instance(); |
| int device = context->device_id(std::this_thread::get_id()); |
| if (device < 0) { |
| B->SetValue(0); |
| } else { |
| #ifdef USE_GPU |
| cudaMemset(B->mutable_gpu_data(), 0, B->count() * sizeof(float)); |
| #else |
| NO_GPU; |
| #endif // USE_GPU |
| } |
| } |
| } // end of namespace singa |
| |
| #endif // SINGA_UTILS_MATH_BLOB_H_ |