| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include "libmatrixmult.h" |
| #include "libmatrixdnn.h" |
| #include <cstdlib> |
| #include <iostream> |
| #include <cstdio> |
| #include <cmath> |
| #include <cstring> |
| #ifdef USE_INTEL_MKL |
| #include "mkl_dnn.h" |
| #else |
| #include "omp.h" |
| #endif |
| |
| template<class FP> int computeNNZ(FP* arr, int limit) { |
| int nnz = 0; |
| #ifndef USE_INTEL_MKL |
| #pragma omp parallel for reduction(+: nnz) |
| #endif |
| for(int i=0; i<limit; i++) |
| nnz += (arr[i]!=0) ? 1 : 0; |
| return nnz; |
| } |
| |
| template<class FP> void biasAdd(FP* bias, FP* output, int K, int PQ) { |
| for(int k = 0, index=0; k < K; k++) |
| for(int pq = 0; pq < PQ; pq++, index++) |
| output[index] += bias[k]; |
| } |
| |
| void rotate180(double* inputArray, double* outputArray, int N, int C, int H, int W, |
| int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) { |
| int PQ = P*Q; |
| int KQ = K*Q; |
| for (int k = 0; k < K; k++) { |
| for (int p = 0; p < P; p++) { |
| for (int q = 0; q < Q; q++) { |
| outputArray[p*KQ + q*K + k] = inputArray[k*PQ + p*Q + q]; |
| } |
| } |
| } |
| } |
| |
| void col2im(double* inputArray, double* outputArray, int N, int C, int H, int W, |
| int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) { |
| for (int p = 0; p < P; p++) { |
| // h = p*stride_h + r - pad_h |
| // = r + hOffset |
| // Based on restrictions: h >= 0 and r >= 0 and h < H and r < R, we get |
| // max(0, - hOffset) <= r < min(R, H - hOffset) |
| int hOffset = p*stride_h - pad_h; |
| int rStart = MAX(0, - hOffset); |
| int rEnd = MIN(R, H - hOffset); |
| for (int q = 0; q < Q; q++) { |
| // Using the same logic as above on following: |
| // w = q*stride_w + s - pad_w |
| int wOffset = q*stride_w - pad_w; |
| int sStart = MAX(0, - wOffset); |
| int sEnd = MIN(S, W - wOffset); |
| int tempOffset = (p*Q + q)*C*R*S; |
| for (int c = 0; c < C; c++) { |
| int outOffset = c*H*W; |
| int inputOffset = tempOffset + c*R*S; |
| for (int r = rStart; r < rEnd; r++) { |
| for (int s = sStart; s < sEnd; s++) { |
| int inputIndex = inputOffset + r*S + s; |
| int outIndex = outOffset + (hOffset + r)*W + wOffset + s; |
| outputArray[outIndex] += inputArray[inputIndex]; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| template<class FP> void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W, |
| int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) { |
| int CRS = C * R * S; |
| std::size_t size = Q * sizeof(FP); |
| if (stride_h == 1 && stride_w == 1 && pad_h == 0 && pad_w == 0) { |
| for (int c = 0; c < CRS; ++c) { |
| int wOffset = c % S; |
| int hOffset = (c / S) % R; |
| int cInput = c / R / S; |
| for (int h = 0; h < P; ++h) { |
| int hPadded = h + hOffset; |
| int outOffset = (c * P + h) * Q; |
| int inputOffset = (cInput * H + hPadded) * W; |
| std::memcpy(outputArray + outOffset, inputArray + inputOffset + wOffset, size); |
| int w = Q - 1; |
| int wPadded = w + wOffset; |
| if (hPadded < H && wPadded < W) |
| outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; |
| else |
| outputArray[outOffset + w] = 0; |
| } |
| } |
| } else { |
| for (int c = 0; c < CRS; ++c) { |
| int wOffset = c % S; |
| int hOffset = (c / S) % R; |
| int cInput = c / R / S; |
| for (int h = 0; h < P; ++h) { |
| int outOffset = (c * P + h) * Q; |
| int hPadded = h * stride_h - pad_h + hOffset; |
| int inputOffset = (cInput * H + hPadded) * W; |
| if (hPadded < 0 || hPadded >= H) { |
| std::memset(outputArray + outOffset, 0, size); |
| } else { |
| for (int w = 0; w < Q; ++w) { |
| int wPadded = w * stride_w - pad_w + wOffset; |
| if (wPadded >= 0 && wPadded < W) |
| outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; |
| else |
| outputArray[outOffset + w] = 0; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| #ifdef USE_INTEL_MKL |
| // Returns true if error |
| bool MKL_DNN_ERROR(dnnError_t code) { |
| if(code == E_SUCCESS) return false; |
| else if(code == E_INCORRECT_INPUT_PARAMETER) std::cerr << "MKL ERROR: Incorrect input parameter.\n"; |
| else if(code == E_MEMORY_ERROR) std::cerr << "MKL ERROR: Memory error.\n"; |
| else if(code == E_UNSUPPORTED_DIMENSION) std::cerr << "MKL ERROR: Unsupported dimensions.\n"; |
| else if(code == E_UNIMPLEMENTED) std::cerr << "MKL ERROR: Unimplemented operation.\n"; |
| else std::cerr << "MKL ERROR: Unhandled error code = " << code << ".\n"; |
| return true; |
| } |
| #endif |
| |
| int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, |
| int N, int C, int H, int W, int K, int R, int S, |
| int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { |
| int CRS = C*R*S; |
| #ifdef USE_INTEL_MKL |
| setNumThreadsForBLAS(numThreads); |
| // Step 1: Create a description of a DNN operation |
| dnnPrimitive_t pConvolution; |
| size_t dimension = 4; |
| size_t srcSize[4] = {W, H, C, N}; |
| size_t dstSize[4] = {Q, P, K, N}; |
| size_t filterSize[4] = {S, R, C, K}; |
| size_t convolutionStrides[2] = {stride_w, stride_h}; |
| int pads[2] = {-pad_w, -pad_h}; |
| void* resources[dnnResourceNumber] = {0}; |
| resources[dnnResourceDiffDst] = doutPtr; |
| resources[dnnResourceSrc] = inputPtr; |
| resources[dnnResourceDiffFilter] = retPtr; |
| dnnConvolutionCreateBackwardFilter_F64(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| |
| // Step 2: Perform the DNN operation |
| if(MKL_DNN_ERROR(dnnExecute_F64(pConvolution, resources))) { |
| return -1; // nnz == -1 indicates error. |
| } |
| |
| // Step 3: Destroy the description of the operation |
| dnnDelete_F64(pConvolution); |
| #else |
| // First step: Avoids oversubscription and other openmp/internal blas threading issues |
| setNumThreadsForBLAS(1); |
| |
| int CHW = C * H * W; |
| int PQ = P*Q; |
| int KPQ = K*PQ; |
| int numRotatedElem = KPQ; |
| int numIm2ColElem = CRS * PQ; |
| int numTempElem = CRS * K; |
| |
| int m1 = CRS; |
| int n1 = K; |
| int k1 = PQ; |
| |
| // Allocate temporary data structures used in parallel for |
| int numOpenMPThreads = MIN(numThreads, N); |
| double* temp = new double[numTempElem*numOpenMPThreads]; |
| std::memset(temp, 0, numTempElem*numOpenMPThreads*sizeof(double)); |
| double* rotatedDoutPtrArrays = new double[numRotatedElem*numOpenMPThreads]; |
| double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads]; |
| |
| #pragma omp parallel for num_threads(numOpenMPThreads) |
| for (int n = 0; n < N; n++) { |
| int threadID = omp_get_thread_num(); |
| double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; |
| |
| // Step 1: Perform im2col |
| im2col<double>(inputPtr + n * CHW, loweredMat, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| |
| // Step 2: Rotate dout |
| double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID; |
| rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| |
| // Multiply to get tmp1 = CRS X K |
| double* temp1 = temp + numTempElem*threadID; |
| // Step 3: temp1 = alpha * (loweredMat (CRS X PQ) %*% rotated_dout (PQ X K)) + beta*temp1 |
| int m1rlen = C * R * S; int m1clen = P * Q; int m2clen = K; |
| double* m1Ptr = loweredMat; double* m2Ptr = rotatedDoutPtr; double alpha = 1; double beta = 1; |
| cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m1rlen, m2clen, m1clen, alpha, m1Ptr, m1clen, m2Ptr, m2clen, beta, temp1, m2clen); |
| } // end omp parallel for |
| |
| delete [] loweredMatArrays; |
| delete [] rotatedDoutPtrArrays; |
| |
| // Inplace transpose addition |
| int numRow = CRS; |
| for(int t = 0; t < numOpenMPThreads; t++) { |
| int iter = 0; |
| double* temp1 = temp + numTempElem*t; |
| for(int i = 0; i < CRS; i++) { |
| for(int j = 0; j < K; j++, iter++) { |
| int index = j*numRow+i; |
| retPtr[index] += temp1[iter]; |
| } |
| } |
| } |
| |
| delete [] temp; |
| #endif |
| return computeNNZ<double>(retPtr, K*CRS); |
| } |
| |
| int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, |
| int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { |
| int CHW = C * H * W; |
| #ifdef USE_INTEL_MKL |
| setNumThreadsForBLAS(numThreads); |
| // Step 1: Create a description of a DNN operation |
| dnnPrimitive_t pConvolution; |
| size_t dimension = 4; |
| size_t srcSize[4] = {W, H, C, N}; |
| size_t dstSize[4] = {Q, P, K, N}; |
| size_t filterSize[4] = {S, R, C, K}; |
| size_t convolutionStrides[2] = {stride_w, stride_h}; |
| int pads[2] = {-pad_w, -pad_h}; |
| void* resources[dnnResourceNumber] = {0}; |
| resources[dnnResourceDiffDst] = doutPtr; |
| resources[dnnResourceFilter] = filterPtr; |
| resources[dnnResourceDiffSrc] = retPtr; |
| dnnConvolutionCreateBackwardData_F64(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| |
| // Step 2: Perform the DNN operation |
| if(MKL_DNN_ERROR(dnnExecute_F64(pConvolution, resources))) { |
| return -1; // nnz == -1 indicates error. |
| } |
| |
| // Step 3: Destroy the description of the operation |
| dnnDelete_F64(pConvolution); |
| #else |
| // First step: Avoids oversubscription and other openmp/internal blas threading issues |
| setNumThreadsForBLAS(1); |
| |
| int CRS = C * R * S; |
| int PQ = P * Q; |
| int KPQ = K * PQ; |
| int numRotatedElem = PQ * K; |
| int numCol2ImElem = PQ * CRS; |
| |
| // Allocate temporary data structures used in parallel for |
| int numOpenMPThreads = MIN(numThreads, N); |
| double* rotatedDoutPtrArrays = new double[numRotatedElem*numOpenMPThreads]; |
| double* col2imInputArrays = new double[numCol2ImElem*numOpenMPThreads]; |
| |
| #pragma omp parallel for num_threads(numOpenMPThreads) |
| for (int n = 0; n < N; n++) { |
| int threadID = omp_get_thread_num(); |
| // Step 1: Rotate dout |
| double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID; |
| rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, |
| P, Q); |
| |
| // Step 2: t(rotatedDout (PQ X K) %*% filter (K X CRS)) |
| double* col2imInput = col2imInputArrays + numCol2ImElem*threadID; |
| dmatmult(rotatedDoutPtr, filterPtr, col2imInput, PQ, K, CRS, 1); |
| |
| // Step 3: Perform col2im |
| double* outputArr = retPtr + n * CHW; |
| col2im(col2imInput, outputArr, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| } // end omp parallel for |
| |
| delete [] rotatedDoutPtrArrays; |
| delete [] col2imInputArrays; |
| #endif |
| return computeNNZ<double>(retPtr, N*CHW); |
| } |
| |
| void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filterPtr, double* retPtr, int N, int C, int H, int W, |
| int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { |
| setNumThreadsForBLAS(1); |
| double* loweredMat = new double[C * R * S * P * Q]; |
| |
| // Step 1: Perform im2col |
| double* temp = new double[C * H * W]; |
| std::size_t size = C * H * W * sizeof(double); |
| std::memset(temp, 0, size); |
| for(int j=apos; j<apos+alen; j++) |
| temp[ aix[j] ] = avals[j]; |
| im2col<double>(temp, loweredMat, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| delete [] temp; |
| |
| // Step 2: filter (K X CRS) %*% loweredMat (CRS X PQ) |
| dmatmult(filterPtr, loweredMat, retPtr, K, C * R * S, P * Q, 1); |
| |
| delete [] loweredMat; |
| } |
| |
| void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* retPtr, int N, int C, int H, int W, |
| int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { |
| setNumThreadsForBLAS(1); |
| int CHW = C * H * W; |
| int CRS = C*R*S; |
| int PQ = P*Q; |
| int KPQ = K*PQ; |
| int m1 = CRS; |
| int n1 = K; |
| int k1 = PQ; |
| |
| double* loweredMat = new double[CRS * PQ]; |
| |
| // Step 1: Perform im2col |
| double* temp = new double[C * H * W]; |
| std::size_t size = C * H * W * sizeof(double); |
| std::memset(temp, 0, size); |
| for(int j=apos; j<apos+alen; j++) |
| temp[ aix[j] ] = avals[j]; |
| im2col(temp, loweredMat, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, |
| P, Q); |
| delete [] temp; |
| |
| // Multiply to get CRS X K |
| double* temp1 = new double[CRS * K]; |
| // Step 3: loweredMat (CRS X PQ) %*% rotatedDoutPtr (PQ X K) |
| dmatmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1); |
| delete [] loweredMat; |
| |
| // Inplace addition |
| for(int iter = 0; iter<K*CRS; iter++) { |
| retPtr[iter] += temp1[iter]; |
| } |
| |
| delete [] temp1; |
| } |
| |
| int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, |
| int N, int C, int H, int W, int K, int R, int S, |
| int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) { |
| int KPQ = K * P * Q; |
| |
| #ifdef USE_INTEL_MKL |
| setNumThreadsForBLAS(numThreads); |
| // Step 1: Create a description of a DNN operation |
| dnnPrimitive_t pConvolution; |
| size_t dimension = 4; |
| size_t srcSize[4] = {W, H, C, N}; |
| size_t dstSize[4] = {Q, P, K, N}; |
| size_t filterSize[4] = {S, R, C, K}; |
| size_t convolutionStrides[2] = {stride_w, stride_h}; |
| int pads[2] = {-pad_w, -pad_h}; |
| void* resources[dnnResourceNumber] = {0}; |
| resources[dnnResourceSrc] = inputPtr; |
| resources[dnnResourceFilter] = filterPtr; |
| resources[dnnResourceDst] = retPtr; |
| if(addBias) { |
| dnnConvolutionCreateForwardBias_F64(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| resources[dnnResourceBias] = biasPtr; |
| } |
| else { |
| dnnConvolutionCreateForward_F64(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| } |
| |
| // Step 2: Perform the DNN operation |
| if(MKL_DNN_ERROR(dnnExecute_F64(pConvolution, resources))) { |
| return -1; // nnz == -1 indicates error. |
| } |
| |
| // Step 3: Destroy the description of the operation |
| dnnDelete_F64(pConvolution); |
| return computeNNZ<double>(retPtr, N*KPQ); |
| #else |
| // First step: Avoids oversubscription and other openmp/internal blas threading issues |
| setNumThreadsForBLAS(1); |
| |
| int CHW = C * H * W; |
| int PQ = P * Q; |
| int numIm2ColElem = C * R * S * P * Q; |
| |
| // Allocate temporary data structures used in parallel for |
| int numOpenMPThreads = MIN(numThreads, N); |
| double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads]; |
| int nnz = 0; |
| |
| #pragma omp parallel for reduction(+: nnz) num_threads(numOpenMPThreads) |
| for (int n = 0; n < N; n++) { |
| int threadID = omp_get_thread_num(); |
| double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; |
| |
| // Step 1: Perform im2col |
| im2col<double>(inputPtr + n * CHW, loweredMat, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| |
| // Step 2: filter (K X CRS) %*% loweredMat (CRS X PQ) |
| dmatmult(filterPtr, loweredMat, retPtr + n * KPQ, K, |
| C * R * S, P * Q, 1); |
| |
| // Step 3: Add bias |
| double* outputArr = retPtr + n*KPQ; |
| if( addBias ) |
| biasAdd<double>(biasPtr, outputArr, K, PQ); |
| |
| // Step 4: thread-local nnz maintenance |
| nnz += computeNNZ<double>(retPtr + n*KPQ, KPQ); |
| } |
| delete [] loweredMatArrays; |
| return nnz; |
| #endif |
| } |
| |
| int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, |
| int N, int C, int H, int W, int K, int R, int S, |
| int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) { |
| int KPQ = K * P * Q; |
| |
| #ifdef USE_INTEL_MKL |
| setNumThreadsForBLAS(numThreads); |
| // Step 1: Create a description of a DNN operation |
| dnnPrimitive_t pConvolution; |
| size_t dimension = 4; |
| size_t srcSize[4] = {W, H, C, N}; |
| size_t dstSize[4] = {Q, P, K, N}; |
| size_t filterSize[4] = {S, R, C, K}; |
| size_t convolutionStrides[2] = {stride_w, stride_h}; |
| int pads[2] = {-pad_w, -pad_h}; |
| void* resources[dnnResourceNumber] = {0}; |
| resources[dnnResourceSrc] = inputPtr; |
| resources[dnnResourceFilter] = filterPtr; |
| resources[dnnResourceDst] = retPtr; |
| if(addBias) { |
| dnnConvolutionCreateForwardBias_F32(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| resources[dnnResourceBias] = biasPtr; |
| } |
| else { |
| dnnConvolutionCreateForward_F32(&pConvolution, NULL, dnnAlgorithmConvolutionDirect, dimension, |
| srcSize, dstSize, filterSize, convolutionStrides, pads, dnnBorderZeros); |
| } |
| |
| // Step 2: Perform the DNN operation |
| if(MKL_DNN_ERROR(dnnExecute_F32(pConvolution, resources))) { |
| return -1; // nnz == -1 indicates error. |
| } |
| |
| // Step 3: Destroy the description of the operation |
| dnnDelete_F32(pConvolution); |
| return computeNNZ<float>(retPtr, N*KPQ); |
| #else |
| // First step: Avoids oversubscription and other openmp/internal blas threading issues |
| setNumThreadsForBLAS(1); |
| |
| int CHW = C * H * W; |
| int PQ = P * Q; |
| int numIm2ColElem = C * R * S * P * Q; |
| |
| // Allocate temporary data structures used in parallel for |
| int numOpenMPThreads = MIN(numThreads, N); |
| float* loweredMatArrays = new float[numIm2ColElem*numOpenMPThreads]; |
| int nnz = 0; |
| |
| #pragma omp parallel for reduction(+: nnz) num_threads(numOpenMPThreads) |
| for (int n = 0; n < N; n++) { |
| int threadID = omp_get_thread_num(); |
| float* loweredMat = loweredMatArrays + numIm2ColElem*threadID; |
| |
| // Step 1: Perform im2col |
| im2col<float>(inputPtr + n * CHW, loweredMat, 1, C, H, W, K, |
| R, S, stride_h, stride_w, pad_h, pad_w, P, Q); |
| |
| // Step 2: filter (K X CRS) %*% loweredMat (CRS X PQ) |
| smatmult(filterPtr, loweredMat, retPtr + n * KPQ, K, |
| C * R * S, P * Q, 1); |
| |
| // Step 3: Add bias |
| float* outputArr = retPtr + n*KPQ; |
| if( addBias ) |
| biasAdd<float>(biasPtr, outputArr, K, PQ); |
| |
| // Step 4: thread-local nnz maintenance |
| nnz += computeNNZ<float>(retPtr + n*KPQ, KPQ); |
| } |
| delete [] loweredMatArrays; |
| return nnz; |
| #endif |
| |
| } |