| /********************************************************* |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| ************************************************************/ |
| |
| #include "rnn.h" |
| |
| #include <map> |
| namespace singa { |
| #ifdef USE_CUDNN |
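
// Builds a cuDNN RNN handle from a sample input x of shape
// {seq_length, batch_size, feature_size}. mode selects the cell type
// (0: relu, 1: tanh, 2: lstm, 3: gru). The descriptors, workspace and
// parameter mapping are initialized eagerly from x's dimensions.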
| CudnnRNNHandle::CudnnRNNHandle(const Tensor &x, const int hidden_size, |
| const int mode, const int num_layers, |
| const int bias, const float dropout, |
| const int bidirectional) |
| : bias(bias), |
| dropout(dropout), |
| bidirectional(bidirectional), |
| hidden_size(hidden_size), |
| mode(mode), |
| num_layers(num_layers) { |
  // Disabling the bias is not supported in cuDNN v7.4.5 (no such option in
  // cudnn.h).
  CHECK_EQ(bias, 1) << "The current implementation always includes bias";
  CHECK(bidirectional == 0 || bidirectional == 1)
      << "bidirectional should be 0 or 1, not " << bidirectional;
| |
| dev = x.device(); |
| ctx = x.device()->context(0); |
| |
  // TODO: batch-first mode fails in cuDNN, so the sequence-major layout is
  // always used.
| batch_first = 0; |
| |
  // x has shape {seq_length, batch_size, feature_size}
| seq_length = x.shape(0); |
| batch_size = x.shape(1); |
| feature_size = x.shape(2); |
| |
| cudnnRNNAlgo = CUDNN_RNN_ALGO_STANDARD; |
| cudnnDataType = CUDNN_DATA_FLOAT; |
| |
| cudnnTensorDescriptor_t *xDesc = new cudnnTensorDescriptor_t[seq_length]; |
| init_xDesc(xDesc, *this); |
| |
| init_dropout_desc(); |
| init_rnn_desc(); |
| init_parameters_desc(xDesc); |
| init_workspace(xDesc); |
| init_param_mapping(xDesc); |
  // Destroy the per-step descriptors; delete[] alone would leak the
  // underlying cuDNN objects.
  for (int i = 0; i < seq_length; i++)
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(xDesc[i]));
  delete[] xDesc;
| } |
| |
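// Queries cuDNN for the workspace and reserve-space sizes required by the
// current RNN configuration and allocates both as float tensors on the
// device.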
| void CudnnRNNHandle::init_workspace(cudnnTensorDescriptor_t *xDesc) { |
  /* workspace data */
  // Needed for every pass.
| CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx->cudnn_handle, rnnDesc, seq_length, |
| xDesc, &workspace_size_bytes)); |
| // Only needed in training, shouldn't be touched between passes. |
| CUDNN_CHECK(cudnnGetRNNTrainingReserveSize( |
| ctx->cudnn_handle, rnnDesc, seq_length, xDesc, &reserve_size_bytes)); |
| |
  // Round up so the float-typed tensors cover the full byte requirement.
  workspace_size = (workspace_size_bytes + sizeof(float) - 1) / sizeof(float);
  reserve_size = (reserve_size_bytes + sizeof(float) - 1) / sizeof(float);
| workspace = Tensor(Shape{workspace_size}, dev); |
| reserve_space = Tensor(Shape{reserve_size}, dev); |
| } |
| |
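// Queries the total packed-weights size for this RNN and creates matching
// filter descriptors wDesc/dwDesc, laid out as a {size, 1, 1} float filter.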
| void CudnnRNNHandle::init_parameters_desc(cudnnTensorDescriptor_t *xDesc) { |
| /* weights size |
| * depends on rnn desc */ |
| CUDNN_CHECK(cudnnGetRNNParamsSize(ctx->cudnn_handle, rnnDesc, xDesc[0], |
| &weights_size_bytes, cudnnDataType)); |
| /* weights desc |
| * depends on weights size */ |
| CUDNN_CHECK(cudnnCreateFilterDescriptor(&wDesc)); |
| CUDNN_CHECK(cudnnCreateFilterDescriptor(&dwDesc)); |
| |
  // TODO: support data types other than float
  weights_size = weights_size_bytes / sizeof(float);
  int dimW[3];
  dimW[0] = weights_size;
| dimW[1] = 1; |
| dimW[2] = 1; |
| CUDNN_CHECK(cudnnSetFilterNdDescriptor(wDesc, cudnnDataType, |
| CUDNN_TENSOR_NCHW, 3, dimW)); |
| CUDNN_CHECK(cudnnSetFilterNdDescriptor(dwDesc, cudnnDataType, |
| CUDNN_TENSOR_NCHW, 3, dimW)); |
| } |
| |
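// Creates and configures the RNN descriptor: cell type, hidden size, number
// of layers, dropout, input mode and directionality.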
| void CudnnRNNHandle::init_rnn_desc() { |
| /* rnn desc */ |
| CUDNN_CHECK(cudnnCreateRNNDescriptor(&rnnDesc)); |
| if (mode == 0) |
| RNNMode = CUDNN_RNN_RELU; |
| else if (mode == 1) |
| RNNMode = CUDNN_RNN_TANH; |
| else if (mode == 2) |
| RNNMode = CUDNN_LSTM; |
  else if (mode == 3)
    RNNMode = CUDNN_GRU;
  else
    LOG(FATAL) << "Invalid RNN mode: " << mode
               << " (expected 0 relu, 1 tanh, 2 lstm, 3 gru)";
| CUDNN_CHECK(cudnnSetRNNDescriptor( |
| ctx->cudnn_handle, rnnDesc, hidden_size, num_layers, dropoutDesc, |
| CUDNN_LINEAR_INPUT, |
| bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, RNNMode, |
| cudnnRNNAlgo, // CUDNN_RNN_ALGO_STANDARD, |
| cudnnDataType)); |
| } |
| void CudnnRNNHandle::init_dropout_desc() { |
  /* dropout */
  size_t seed = 0x1234567;  // fixed seed so dropout masks are reproducible
| CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropoutDesc)); |
| size_t stateSize; |
| CUDNN_CHECK(cudnnDropoutGetStatesSize(ctx->cudnn_handle, &stateSize)); |
| CUDA_CHECK(cudaMalloc(&states, stateSize)); |
| CUDNN_CHECK(cudnnSetDropoutDescriptor(dropoutDesc, ctx->cudnn_handle, dropout, |
| states, stateSize, seed)); |
| } |
| |
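// Fills yDesc with one 3-D descriptor per time step, shaped
// {batch_size, hidden_size * num_directions, 1}.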
| void init_yDesc(cudnnTensorDescriptor_t *yDesc, CudnnRNNHandle &h) { |
| int dimA[] = {h.batch_size, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, 1}; |
| int strideA[] = {dimA[1] * dimA[2], dimA[2], 1}; |
| for (int i = 0; i < h.seq_length; i++) { |
| CUDNN_CHECK(cudnnCreateTensorDescriptor(&yDesc[i])); |
| CUDNN_CHECK(cudnnSetTensorNdDescriptor(yDesc[i], h.cudnnDataType, 3, dimA, |
| strideA)); |
| } |
| } |
| |
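// Fills xDesc with one 3-D descriptor per time step, shaped
// {batch_size, feature_size, 1}.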
| void init_xDesc(cudnnTensorDescriptor_t *xDesc, CudnnRNNHandle &h) { |
| int dimA[] = {h.batch_size, h.feature_size, 1}; |
| int strideA[] = {dimA[1] * dimA[2], dimA[2], 1}; |
| for (int i = 0; i < h.seq_length; i++) { |
| CUDNN_CHECK(cudnnCreateTensorDescriptor(&xDesc[i])); |
| CUDNN_CHECK(cudnnSetTensorNdDescriptor(xDesc[i], h.cudnnDataType, 3, dimA, |
| strideA)); |
| } |
| } |
| |
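// Initializes a descriptor for a hidden/cell state tensor of shape
// {num_layers * num_directions, batch_size, hidden_size}.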
| void init_hc_Desc(cudnnTensorDescriptor_t &hxDesc, CudnnRNNHandle &h) { |
| /* If direction is CUDNN_BIDIRECTIONAL then the first dimension should match |
| double the numLayers argument passed to cudnnSetRNNDescriptor(). */ |
| /* The second dimension must match the batchSize parameter in xDesc */ |
| /* the third dimension must match the hiddenSize argument passed to the |
| cudnnSetRNNDescriptor() call used to initialize rnnDesc. */ |
| int dimA[] = {h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}; |
| int strideA[] = {dimA[2] * dimA[1], dimA[2], 1}; |
| CUDNN_CHECK(cudnnCreateTensorDescriptor(&hxDesc)); |
| CUDNN_CHECK( |
| cudnnSetTensorNdDescriptor(hxDesc, h.cudnnDataType, 3, dimA, strideA)); |
| } |
| |
| /* |
| vector<Tensor> GpuRNNForwardTraining(); |
| vector<Tensor> GpuRNNForwardInference(); |
| vector<Tensor> GpuRNNBackwardx(); |
| Tensor GpuRNNBackwardW(); |
| */ |
| |
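// Runs the RNN forward pass without saving the intermediate results needed
// by backprop; returns {y, hy, cy}. cx and cy are only meaningful in LSTM
// mode.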
| vector<Tensor> GpuRNNForwardInference(const Tensor &x, const Tensor &hx, |
| const Tensor &cx, const Tensor &W, |
| CudnnRNNHandle &h) { |
| CHECK_EQ(h.feature_size, x.shape(2)) << "feature size should not change"; |
| |
  // Input x has shape {seq_length, batch_size, feature_size}; output y has
  // shape {seq_length, batch_size, hidden_size * num_directions}.

  // Update the cached sizes to accommodate a changed batch size or sequence
  // length.
  h.batch_size = x.shape(1);
  h.seq_length = x.shape(0);
| |
| Tensor y(Shape{h.seq_length, h.batch_size, |
| h.hidden_size * (h.bidirectional ? 2 : 1)}, |
| x.device()); |
| Tensor hy(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| x.device()); |
| Tensor cy(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| x.device()); |
| y.SetValue(0.0f); |
| hy.SetValue(0.0f); |
| cy.SetValue(0.0f); |
| h.workspace.SetValue(0.0f); |
| y.device()->Exec( |
| [y, hy, cy, x, hx, cx, &W, &h](Context *ctx) { |
        // Create the descriptors cuDNN needs: per-step x and y arrays, plus
        // hx, cx, hy and cy.
| cudnnTensorDescriptor_t *xDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| cudnnTensorDescriptor_t *yDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| init_xDesc(xDesc, h); |
| init_yDesc(yDesc, h); |
| cudnnTensorDescriptor_t hxDesc; |
| cudnnTensorDescriptor_t cxDesc; |
| cudnnTensorDescriptor_t hyDesc; |
| cudnnTensorDescriptor_t cyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(hyDesc, h); |
| init_hc_Desc(cyDesc, h); |
| |
| auto x_con = Contiguous(x); |
| |
| auto xptr = x_con.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto cxptr = cx.block()->data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y.block()->mutable_data(); |
| auto hyptr = hy.block()->mutable_data(); |
| auto cyptr = cy.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| |
| CUDNN_CHECK(cudnnRNNForwardInference( |
| ctx->cudnn_handle, h.rnnDesc, h.seq_length, xDesc, xptr, hxDesc, |
| hxptr, cxDesc, cxptr, h.wDesc, Wptr, yDesc, yptr, hyDesc, hyptr, |
| cyDesc, cyptr, wsptr, h.workspace_size_bytes)); |
| |
        // Destroy the temporary descriptors; delete[] alone would leak the
        // underlying cuDNN objects on every forward pass.
        for (int i = 0; i < h.seq_length; i++) {
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(xDesc[i]));
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(yDesc[i]));
        }
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(cxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hyDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(cyDesc));
        delete[] xDesc;
        delete[] yDesc;
| }, |
      {x.block(), hx.block(), cx.block(), W.block()},
      {y.block(), hy.block(), cy.block(), h.workspace.block()},
      "cudnnRNNForwardInference");
| return {y, hy, cy}; |
| } |
| |
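// Runs the RNN forward pass for training: intermediate results required by
// the backward passes are written to h.reserve_space. Returns {y, hy, cy}.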
| vector<Tensor> GpuRNNForwardTraining(const Tensor &x, const Tensor &hx, |
| const Tensor &cx, const Tensor &W, |
| CudnnRNNHandle &h) { |
| CHECK_EQ(h.feature_size, x.shape(2)) << "feature size should not change"; |
| |
  // Input x has shape {seq_length, batch_size, feature_size}; output y has
  // shape {seq_length, batch_size, hidden_size * num_directions}.

  // Update the cached sizes to accommodate a changed batch size or sequence
  // length.
  h.batch_size = x.shape(1);
  h.seq_length = x.shape(0);
| |
| Tensor y(Shape{h.seq_length, h.batch_size, |
| h.hidden_size * (h.bidirectional ? 2 : 1)}, |
| x.device()); |
| Tensor hy(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| x.device()); |
| Tensor cy(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| x.device()); |
| y.SetValue(0.0f); |
| hy.SetValue(0.0f); |
| cy.SetValue(0.0f); |
| h.workspace.SetValue(0.0f); |
| h.reserve_space.SetValue(0.0f); |
| |
| y.device()->Exec( |
| [y, hy, cy, x, hx, cx, &W, &h](Context *ctx) { |
        // Create the descriptors cuDNN needs: per-step x and y arrays, plus
        // hx, cx, hy and cy.
| cudnnTensorDescriptor_t *xDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| cudnnTensorDescriptor_t *yDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| init_xDesc(xDesc, h); |
| init_yDesc(yDesc, h); |
| cudnnTensorDescriptor_t hxDesc; |
| cudnnTensorDescriptor_t cxDesc; |
| cudnnTensorDescriptor_t hyDesc; |
| cudnnTensorDescriptor_t cyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(hyDesc, h); |
| init_hc_Desc(cyDesc, h); |
| |
| auto x_con = Contiguous(x); |
| |
| auto xptr = x_con.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto cxptr = cx.block()->data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y.block()->mutable_data(); |
| auto hyptr = hy.block()->mutable_data(); |
| auto cyptr = cy.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| CUDNN_CHECK(cudnnRNNForwardTraining( |
| ctx->cudnn_handle, h.rnnDesc, h.seq_length, xDesc, xptr, hxDesc, |
| hxptr, cxDesc, cxptr, h.wDesc, Wptr, yDesc, yptr, hyDesc, hyptr, |
| cyDesc, cyptr, wsptr, h.workspace_size_bytes, rsptr, |
| h.reserve_size_bytes)); |
        // Destroy the temporary descriptors; delete[] alone would leak the
        // underlying cuDNN objects on every forward pass.
        for (int i = 0; i < h.seq_length; i++) {
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(xDesc[i]));
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(yDesc[i]));
        }
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(cxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hyDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(cyDesc));
        delete[] xDesc;
        delete[] yDesc;
| }, |
| {x.block(), hx.block(), cx.block(), W.block()}, |
| {y.block(), hy.block(), cy.block(), h.workspace.block(), |
| h.reserve_space.block()}, |
| "cudnnRNNForwardTraining"); |
| |
| return {y, hy, cy}; |
| } |
| vector<Tensor> GpuRNNBackwardx(const Tensor &y, const Tensor &dy, |
| const Tensor &dhy, const Tensor &dcy, |
| const Tensor &W, const Tensor &hx, |
| const Tensor &cx, CudnnRNNHandle &h) { |
  // y and dy have shape {seq_length, batch_size, hidden_size *
  // num_directions}.
| Tensor dx(Shape{h.seq_length, h.batch_size, h.feature_size}, y.device()); |
| Tensor dhx(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| y.device()); |
| Tensor dcx(Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}, |
| y.device()); |
| dx.SetValue(0.0f); |
| dhx.SetValue(0.0f); |
| dcx.SetValue(0.0f); |
| h.workspace.SetValue(0.0f); |
| dx.device()->Exec( |
| [dx, dhx, dcx, y, dy, dhy, dcy, &W, hx, cx, &h](Context *ctx) { |
| // require desc: |
| // [dx], hx, dhx, cx, dcx, w, |
| // [y], [dy], dhy, dcy |
| cudnnTensorDescriptor_t *dxDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| cudnnTensorDescriptor_t *yDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| cudnnTensorDescriptor_t *dyDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| init_yDesc(yDesc, h); |
| init_xDesc(dxDesc, h); |
| init_yDesc(dyDesc, h); |
| cudnnTensorDescriptor_t hxDesc; |
| cudnnTensorDescriptor_t cxDesc; |
| cudnnTensorDescriptor_t dhxDesc; |
| cudnnTensorDescriptor_t dcxDesc; |
| cudnnTensorDescriptor_t dhyDesc; |
| cudnnTensorDescriptor_t dcyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(dhxDesc, h); |
| init_hc_Desc(dcxDesc, h); |
| init_hc_Desc(dhyDesc, h); |
| init_hc_Desc(dcyDesc, h); |
| |
| auto y_con = Contiguous(y); |
| auto dy_con = Contiguous(dy); |
| |
| auto dxptr = dx.block()->mutable_data(); |
| auto hxptr = hx.block()->data(); |
| auto dhxptr = dhx.block()->mutable_data(); |
| auto cxptr = cx.block()->data(); |
| auto dcxptr = dcx.block()->mutable_data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y_con.block()->data(); |
| auto dyptr = dy_con.block()->data(); |
| auto dhyptr = dhy.block()->data(); |
| auto dcyptr = dcy.block()->data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| |
| CUDNN_CHECK(cudnnRNNBackwardData( |
| ctx->cudnn_handle, h.rnnDesc, h.seq_length, yDesc, yptr, dyDesc, |
| dyptr, dhyDesc, dhyptr, dcyDesc, dcyptr, h.wDesc, Wptr, hxDesc, |
| hxptr, cxDesc, cxptr, dxDesc, dxptr, dhxDesc, dhxptr, dcxDesc, |
| dcxptr, wsptr, h.workspace_size_bytes, rsptr, |
| h.reserve_size_bytes)); |
        // Destroy the temporary descriptors; delete[] alone would leak the
        // underlying cuDNN objects on every backward pass.
        for (int i = 0; i < h.seq_length; i++) {
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(dxDesc[i]));
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(yDesc[i]));
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(dyDesc[i]));
        }
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(cxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcxDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhyDesc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcyDesc));
        delete[] dxDesc;
        delete[] yDesc;
        delete[] dyDesc;
| }, |
| {y.block(), dy.block(), dhy.block(), dcy.block(), hx.block(), cx.block(), |
| W.block()}, |
| {dx.block(), dhx.block(), dcx.block(), h.workspace.block(), |
| h.reserve_space.block()}, |
| "cudnnRNNBackwardx"); |
| return {dx, dhx, dcx}; |
| } |
| |
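// Backward pass w.r.t. the weights. Computes the gradient of the packed
// weights into a zero-initialized tensor dW and returns it. Also consumes
// h.reserve_space.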
| Tensor GpuRNNBackwardW(const Tensor &x, const Tensor &hx, const Tensor &y, |
| CudnnRNNHandle &h) { |
| Tensor dW(Shape{h.weights_size}, x.device()); |
  // x has shape {seq_length, batch_size, feature_size}; y has shape
  // {seq_length, batch_size, hidden_size * num_directions}.
| dW.SetValue(0.0f); |
| h.workspace.SetValue(0.0f); |
| dW.device()->Exec( |
| [dW, x, hx, y, &h](Context *ctx) { |
| cudnnTensorDescriptor_t *xDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| cudnnTensorDescriptor_t *yDesc = |
| new cudnnTensorDescriptor_t[h.seq_length]; |
| init_xDesc(xDesc, h); |
| init_yDesc(yDesc, h); |
| cudnnTensorDescriptor_t hxDesc; |
| init_hc_Desc(hxDesc, h); |
| |
| auto y_con = Contiguous(y); |
| auto x_con = Contiguous(x); |
| |
| auto xptr = x_con.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto yptr = y_con.block()->data(); |
| auto dWptr = dW.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| |
| CUDNN_CHECK(cudnnRNNBackwardWeights( |
| ctx->cudnn_handle, h.rnnDesc, h.seq_length, xDesc, xptr, hxDesc, |
| hxptr, yDesc, yptr, wsptr, h.workspace_size_bytes, h.dwDesc, dWptr, |
| rsptr, h.reserve_size_bytes)); |
        // Destroy the temporary descriptors; delete[] alone would leak the
        // underlying cuDNN objects.
        for (int i = 0; i < h.seq_length; i++) {
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(xDesc[i]));
          CUDNN_CHECK(cudnnDestroyTensorDescriptor(yDesc[i]));
        }
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(hxDesc));
        delete[] xDesc;
        delete[] yDesc;
| }, |
      {x.block(), y.block(), hx.block()},
      {dW.block(), h.workspace.block(), h.reserve_space.block()},
      "cudnnRNNBackwardW");
| return dW; |
| } |
| |
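// Builds weights_mapping, which maps (linLayerID, pseudoLayer, is_bias) to
// the (offset, size) of that parameter within the packed weights tensor, by
// probing cuDNN with a dummy zero-filled weights buffer.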
| void CudnnRNNHandle::init_param_mapping(cudnnTensorDescriptor_t *xDesc) { |
| int linLayerIDRange = 2; |
| if (mode == 0 || mode == 1) { |
| // vanilla relu/tanh |
| linLayerIDRange = 2; |
| } else if (mode == 2) { |
| // lstm |
| linLayerIDRange = 8; |
| } else if (mode == 3) { |
| // gru |
| linLayerIDRange = 6; |
| } |
| int pseudoLayerRange = (bidirectional ? 2 : 1) * num_layers; |
| |
  // A dummy weights tensor, used only to query parameter offsets from cuDNN.
  Tensor weights(Shape{weights_size}, dev);
  weights.SetValue(0.0f);
| const void *W_ptr = weights.block()->data(); |
| |
| void *param_ptr = nullptr; |
| int dims[] = {1, 1, 1}; |
| cudnnDataType_t data_type; |
| cudnnTensorFormat_t tensor_format; |
| int n_dims; |
| cudnnFilterDescriptor_t paramDesc; |
| CUDNN_CHECK(cudnnCreateFilterDescriptor(¶mDesc)); |
| |
| vector<bool> paramTypes{false, true}; |
| for (int linLayerID = 0; linLayerID < linLayerIDRange; linLayerID++) { |
| for (int pseudoLayer = 0; pseudoLayer < pseudoLayerRange; pseudoLayer++) { |
| for (const bool &is_bias : paramTypes) { |
| // get param ptr |
| if (is_bias) { |
| CUDNN_CHECK(cudnnGetRNNLinLayerBiasParams( |
| ctx->cudnn_handle, rnnDesc, pseudoLayer, xDesc[0], wDesc, W_ptr, |
| linLayerID, paramDesc, ¶m_ptr)); |
| } else { |
| CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( |
| ctx->cudnn_handle, rnnDesc, pseudoLayer, xDesc[0], wDesc, W_ptr, |
| linLayerID, paramDesc, ¶m_ptr)); |
| } |
| |
| // get param dims |
| CUDNN_CHECK(cudnnGetFilterNdDescriptor(paramDesc, 3, &data_type, |
| &tensor_format, &n_dims, dims)); |
| |
        // offset (in floats) of this parameter from the start of the packed
        // weights
| size_t offset = (float *)param_ptr - (float *)W_ptr; |
| |
| // save in map |
| weights_mapping[std::make_tuple(linLayerID, pseudoLayer, is_bias)] = |
| std::make_tuple(offset, dims[0] * dims[1] * dims[2]); |
| } |
| } |
| } |
| } |
| |
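// Copies paramValues into the slice of the packed weights tensor identified
// by (linLayerID, pseudoLayer, is_bias).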
| void GpuRNNSetParam(int linLayerID, int pseudoLayer, Tensor &weights, |
| Tensor ¶mValues, bool is_bias, CudnnRNNHandle &h) { |
| size_t offset, size; |
| std::tie(offset, size) = |
| h.weights_mapping[std::make_tuple(linLayerID, pseudoLayer, is_bias)]; |
  CHECK_EQ(size, paramValues.size()) << "unexpected param size";
| CopyDataToFrom(&weights, paramValues, size, offset, 0); |
| } |
| |
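// Returns a copy of the parameter slice identified by
// (linLayerID, pseudoLayer, is_bias) from the packed weights tensor.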
| Tensor GpuRNNGetParamCopy(int linLayerID, int pseudoLayer, Tensor &weights, |
| bool is_bias, CudnnRNNHandle &h) { |
| size_t offset, size; |
| std::tie(offset, size) = |
| h.weights_mapping[std::make_tuple(linLayerID, pseudoLayer, is_bias)]; |
  Tensor paramCopy(Shape{size}, weights.device());
| CopyDataToFrom(¶mCopy, weights, size, 0, offset); |
| return paramCopy; |
| } |
| |
| /* |
| vector<Tensor> GpuRNNForwardTrainingEx(); |
| vector<Tensor> GpuRNNForwardInferenceEx(); |
| vector<Tensor> GpuRNNBackwardxEx(); |
| Tensor GpuRNNBackwardWEx(); |
| */ |
| |
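// Initializes an RNN data descriptor for the extended (Ex) API. seq_lengths
// holds the per-sample sequence lengths; it is copied to the host and cast
// to int before being passed to cuDNN.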
| void init_data_desc(cudnnRNNDataDescriptor_t &desc, int data_size, |
| const Tensor seq_lengths, CudnnRNNHandle &h) { |
| /* cudnnRNNDataDescriptor_t is a pointer to an opaque structure holding |
| the description of an RNN data set. The function |
| cudnnCreateRNNDataDescriptor() is used to create one instance, and |
| cudnnSetRNNDataDescriptor() must be used to initialize this instance. |
| */ |
| CUDNN_CHECK(cudnnCreateRNNDataDescriptor(&desc)); |
  /* CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED:
       Data layout is padded, with outer stride from one time-step to the
       next.
     CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED:
       The sequence lengths are sorted and packed as in the basic RNN API.
     CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED:
       Data layout is padded, with outer stride from one batch to the next.
  */
| cudnnRNNDataLayout_t layout; |
| if (h.batch_first) { |
| layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; |
| } else { |
| layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED; |
| } |
| |
| /* This is only effective when the descriptor is describing the RNN |
| output, and the unpacked layout is specified.*/ |
| float paddingFill = 0.0f; |
| |
| /* Input. An integer array with batchSize number of elements. |
| Describes the length (number of time-steps) of each sequence. Each |
| element in seqLengthArray must be greater than 0 but less than or |
| equal to maxSeqLength. */ |
| Tensor tmp = seq_lengths.Clone(); |
| tmp.ToHost(); |
| tmp = tmp.AsType(singa::kInt); |
| const int *seq_lengths_ptr = static_cast<const int *>(tmp.block()->data()); |
| |
| CUDNN_CHECK(cudnnSetRNNDataDescriptor(desc, h.cudnnDataType, layout, |
| h.seq_length, h.batch_size, data_size, |
| seq_lengths_ptr, (void *)&paddingFill)); |
| } |
| |
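// Variable-sequence-length version of GpuRNNForwardInference, built on
// cudnnRNNForwardInferenceEx. Returns {y, hy, cy}.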
| vector<Tensor> GpuRNNForwardInferenceEx(const Tensor &x, const Tensor &hx, |
| const Tensor &cx, const Tensor &W, |
| const Tensor &seq_lengths, |
| CudnnRNNHandle &h) { |
| CHECK_EQ(h.feature_size, x.shape(2)) << "feature size should not change"; |
| |
| Tensor y, hy, cy; |
| Shape yshape, states_shape; |
| |
  if (h.batch_first) {
    LOG(FATAL) << "batch_first not implemented for GpuRNNForwardInferenceEx";
| } else { |
| h.seq_length = x.shape(0); |
| h.batch_size = x.shape(1); |
| yshape = Shape{h.seq_length, h.batch_size, |
| h.hidden_size * (h.bidirectional ? 2 : 1)}; |
| states_shape = Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}; |
| } |
| |
| y = Tensor(yshape, x.device()); |
| hy = Tensor(states_shape, x.device()); |
| cy = Tensor(states_shape, x.device()); |
| |
| y.device()->Exec( |
| [y, hy, cy, x, seq_lengths, hx, cx, &W, &h](Context *ctx) { |
| // data descriptor |
| cudnnRNNDataDescriptor_t xDesc, yDesc; |
| init_data_desc(xDesc, h.feature_size, seq_lengths, h); |
| init_data_desc(yDesc, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, |
| seq_lengths, h); |
| |
| // hidden cell states descriptor |
| cudnnTensorDescriptor_t hxDesc, cxDesc, hyDesc, cyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(hyDesc, h); |
| init_hc_Desc(cyDesc, h); |
| |
| auto xptr = x.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto cxptr = cx.block()->data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y.block()->mutable_data(); |
| auto hyptr = hy.block()->mutable_data(); |
| auto cyptr = cy.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| |
        /* cudnnRNNForwardInferenceEx() is the extended version of
           cudnnRNNForwardInference(); it allows the user to use the
           unpacked (padded) layout for input x and output y.
        */
| CUDNN_CHECK(cudnnRNNForwardInferenceEx( |
| ctx->cudnn_handle, h.rnnDesc, xDesc, xptr, hxDesc, hxptr, cxDesc, |
| cxptr, h.wDesc, Wptr, yDesc, yptr, hyDesc, hyptr, cyDesc, cyptr, |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, wsptr, |
| h.workspace_size_bytes)); |
| }, |
| {x.block(), hx.block(), cx.block(), W.block()}, |
      {y.block(), hy.block(), cy.block(), h.workspace.block()},
      "cudnnRNNForwardInferenceEx");
| return {y, hy, cy}; |
| } |
| |
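// Variable-sequence-length version of GpuRNNForwardTraining, built on
// cudnnRNNForwardTrainingEx. Fills h.reserve_space for the backward passes
// and returns {y, hy, cy}.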
| vector<Tensor> GpuRNNForwardTrainingEx(const Tensor &x, const Tensor &hx, |
| const Tensor &cx, const Tensor &W, |
| const Tensor &seq_lengths, |
| CudnnRNNHandle &h) { |
| CHECK_EQ(h.feature_size, x.shape(2)) << "feature size should not change"; |
| |
| Tensor y, hy, cy; |
| Shape yshape, states_shape; |
| |
| if (h.batch_first) { |
| LOG(FATAL) << "batch_first not implemented for GpuRNNForwardTrainingEx"; |
| } else { |
| h.seq_length = x.shape(0); |
| h.batch_size = x.shape(1); |
| yshape = Shape{h.seq_length, h.batch_size, |
| h.hidden_size * (h.bidirectional ? 2 : 1)}; |
| states_shape = Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}; |
| } |
| |
| y = Tensor(yshape, x.device()); |
| hy = Tensor(states_shape, x.device()); |
| cy = Tensor(states_shape, x.device()); |
| |
| y.device()->Exec( |
| [y, hy, cy, x, seq_lengths, hx, cx, &W, &h](Context *ctx) { |
| // data descriptor |
| cudnnRNNDataDescriptor_t xDesc, yDesc; |
| init_data_desc(xDesc, h.feature_size, seq_lengths, h); |
| init_data_desc(yDesc, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, |
| seq_lengths, h); |
| |
| // hidden cell states descriptor |
| cudnnTensorDescriptor_t hxDesc, cxDesc, hyDesc, cyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(hyDesc, h); |
| init_hc_Desc(cyDesc, h); |
| |
| auto xptr = x.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto cxptr = cx.block()->data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y.block()->mutable_data(); |
| auto hyptr = hy.block()->mutable_data(); |
| auto cyptr = cy.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| |
| /* This routine is the extended version of the cudnnRNNForwardTraining() |
| function. The cudnnRNNForwardTrainingEx() allows the user to use |
| unpacked (padded) layout for input x and output y. |
| */ |
| CUDNN_CHECK(cudnnRNNForwardTrainingEx( |
| ctx->cudnn_handle, h.rnnDesc, xDesc, xptr, hxDesc, hxptr, cxDesc, |
| cxptr, h.wDesc, Wptr, yDesc, yptr, hyDesc, hyptr, cyDesc, cyptr, |
| NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, wsptr, |
| h.workspace_size_bytes, rsptr, h.reserve_size_bytes)); |
| }, |
| {x.block(), hx.block(), cx.block(), W.block()}, |
| {y.block(), hy.block(), cy.block(), h.workspace.block(), |
       h.reserve_space.block()},
      "cudnnRNNForwardTrainingEx");
| return {y, hy, cy}; |
| } |
| |
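// Variable-sequence-length backward pass w.r.t. the data; returns
// {dx, dhx, dcx}.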
| vector<Tensor> GpuRNNBackwardxEx(const Tensor &y, const Tensor &dy, |
| const Tensor &dhy, const Tensor &dcy, |
| const Tensor &W, const Tensor &hx, |
| const Tensor &cx, const Tensor &seq_lengths, |
| CudnnRNNHandle &h) { |
  // With the sequence-major layout, y and dy have shape
  // {seq_length, batch_size, hidden_size * num_directions}, and dx has shape
  // {seq_length, batch_size, feature_size}.
  Shape xshape, states_shape;
  if (h.batch_first) {
    LOG(FATAL) << "batch_first not implemented for GpuRNNBackwardxEx";
  } else {
    xshape = Shape{h.seq_length, h.batch_size, h.feature_size};
| states_shape = Shape{h.num_layers * (h.bidirectional ? 2 : 1), h.batch_size, |
| h.hidden_size}; |
| } |
| Tensor dx(xshape, y.device()); |
| Tensor dhx(states_shape, y.device()); |
| Tensor dcx(states_shape, y.device()); |
| |
| dx.SetValue(0.0f); |
| dhx.SetValue(0.0f); |
| dcx.SetValue(0.0f); |
| h.workspace.SetValue(0.0f); |
| |
| dx.device()->Exec( |
| [dx, dhx, dcx, y, dy, dhy, dcy, &W, hx, cx, seq_lengths, |
| &h](Context *ctx) { |
| cudnnRNNDataDescriptor_t yDesc, dyDesc, dxDesc; |
| init_data_desc(yDesc, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, |
| seq_lengths, h); |
| init_data_desc(dyDesc, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, |
| seq_lengths, h); |
| init_data_desc(dxDesc, h.feature_size, seq_lengths, h); |
| |
| /* other tensors desc*/ |
| cudnnTensorDescriptor_t hxDesc, cxDesc, dhxDesc, dcxDesc, dhyDesc, |
| dcyDesc; |
| init_hc_Desc(hxDesc, h); |
| init_hc_Desc(cxDesc, h); |
| init_hc_Desc(dhxDesc, h); |
| init_hc_Desc(dcxDesc, h); |
| init_hc_Desc(dhyDesc, h); |
| init_hc_Desc(dcyDesc, h); |
| |
| auto dxptr = dx.block()->mutable_data(); |
| auto hxptr = hx.block()->data(); |
| auto dhxptr = dhx.block()->mutable_data(); |
| auto cxptr = cx.block()->data(); |
| auto dcxptr = dcx.block()->mutable_data(); |
| auto Wptr = W.block()->data(); |
| auto yptr = y.block()->data(); |
| auto dyptr = dy.block()->data(); |
| auto dhyptr = dhy.block()->data(); |
| auto dcyptr = dcy.block()->data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| |
| CUDNN_CHECK(cudnnRNNBackwardDataEx( |
| ctx->cudnn_handle, h.rnnDesc, yDesc, yptr, dyDesc, dyptr, NULL, |
| NULL, dhyDesc, dhyptr, dcyDesc, dcyptr, h.wDesc, Wptr, hxDesc, |
| hxptr, cxDesc, cxptr, dxDesc, dxptr, dhxDesc, dhxptr, dcxDesc, |
| dcxptr, NULL, NULL, wsptr, h.workspace_size_bytes, rsptr, |
| h.reserve_size_bytes)); |
| }, |
| {y.block(), dy.block(), dhy.block(), dcy.block(), hx.block(), cx.block(), |
| W.block()}, |
| {dx.block(), dhx.block(), dcx.block(), h.workspace.block(), |
       h.reserve_space.block()},
      "cudnnRNNBackwardxEx");
| return {dx, dhx, dcx}; |
| } |
| |
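// Variable-sequence-length backward pass w.r.t. the weights; returns dW.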
| Tensor GpuRNNBackwardWEx(const Tensor &x, const Tensor &hx, const Tensor &y, |
| const Tensor &seq_lengths, CudnnRNNHandle &h) { |
| Tensor dW(Shape{h.weights_size}, x.device()); |
| dW.SetValue(0.0f); |
| |
| dW.device()->Exec( |
| [dW, x, hx, y, seq_lengths, &h](Context *ctx) { |
| cudnnRNNDataDescriptor_t xDesc, yDesc; |
| init_data_desc(xDesc, h.feature_size, seq_lengths, h); |
| init_data_desc(yDesc, |
| h.bidirectional ? h.hidden_size * 2 : h.hidden_size, |
| seq_lengths, h); |
| |
| /* other tensor desc */ |
| cudnnTensorDescriptor_t hxDesc; |
| init_hc_Desc(hxDesc, h); |
| |
| auto xptr = x.block()->data(); |
| auto hxptr = hx.block()->data(); |
| auto yptr = y.block()->data(); |
| auto dWptr = dW.block()->mutable_data(); |
| auto wsptr = h.workspace.block()->mutable_data(); |
| auto rsptr = h.reserve_space.block()->mutable_data(); |
| |
| CUDNN_CHECK(cudnnRNNBackwardWeightsEx( |
| ctx->cudnn_handle, h.rnnDesc, xDesc, xptr, hxDesc, hxptr, yDesc, |
| yptr, wsptr, h.workspace_size_bytes, h.dwDesc, dWptr, rsptr, |
| h.reserve_size_bytes)); |
| }, |
| {x.block(), y.block(), hx.block()}, |
      {dW.block(), h.workspace.block(), h.reserve_space.block()},
      "cudnnRNNBackwardWEx");
| return dW; |
| } |
| |
| #endif // USE_CUDNN |
| } // namespace singa |