blob: 2c521b5abb5d0badb30f407bc3855a04c8ed0932 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
namespace mxnet_warpctc {
// GPU implementation of the CTC (Connectionist Temporal Classification)
// loss and gradient. All temporary device memory is carved out of a single
// caller-provided workspace buffer; all GPU work is issued on the stream
// given at construction.
template <typename ProbT>
class GpuCTC {
 public:
    // alphabet_size: number of characters including the blank.
    // minibatch:     number of utterances processed per call.
    // workspace:     caller-allocated device buffer for all temporaries.
    // stream:        CUDA stream all copies/kernels are issued on.
    // blank_label:   index of the blank symbol in the alphabet.
    GpuCTC(int alphabet_size,
           int minibatch,
           void *workspace,
           CUstream stream,
           int blank_label) :
        out_dim_(alphabet_size), minibatch_(minibatch),
        gpu_workspace_(workspace), stream_(stream),
        blank_label_(blank_label) {};

    // Noncopyable: owns raw pointers into the shared workspace.
    GpuCTC(const GpuCTC&) = delete;
    GpuCTC& operator=(const GpuCTC&) = delete;

    // Computes per-utterance negative log-likelihoods (costs) AND gradients
    // w.r.t. the activations.
    ctcStatus_t
    cost_and_grad(const ProbT* const activations,
                  ProbT* grads,
                  ProbT* costs,
                  const int* const flat_labels,
                  const int* const label_lengths,
                  const int* const input_lengths);

    // Forward-only scoring: computes costs without gradients.
    ctcStatus_t
    score_forward(const ProbT* const activations,
                  ProbT* costs,
                  const int* const flat_labels,
                  const int* const label_lengths,
                  const int* const input_lengths);

 private:
    // Launches the alpha and/or beta dynamic-programming kernels with a
    // fixed (NT threads, VT items-per-thread) configuration.
    template<int NT, int VT>
    ctcStatus_t launch_alpha_beta_kernels(const ProbT* const log_probs,
                                          ProbT *grads,
                                          bool compute_alpha,
                                          bool compute_beta);

    // Dispatches to the template instantiation selected by `config`.
    ctcStatus_t
    launch_gpu_kernels(const ProbT* const log_probs,
                       ProbT *grads,
                       size_t config,
                       bool launch_alpha,
                       bool launch_beta);

    // Copies per-utterance metadata (lengths, repeats, offsets, labels)
    // to the device and lays out the workspace.
    ctcStatus_t
    setup_gpu_metadata(const int* const flat_labels,
                       const int* const label_lengths,
                       const int* const input_lengths);

    // NOTE: parameter names previously read (label_lengths, flat_labels, ...)
    // here, the reverse of the definition and of every call site; the names
    // below now match the actual argument order.
    ctcStatus_t
    create_metadata_and_choose_config(const int* const flat_labels,
                                      const int* const label_lengths,
                                      const int* const input_lengths,
                                      size_t& best_config);

    // Computes the stable log-softmax of the activations into log_probs_.
    ctcStatus_t
    compute_log_probs(const ProbT* const activations);

    // Shared driver behind cost_and_grad / score_forward.
    ctcStatus_t
    compute_cost_and_score(const ProbT* const activations,
                           ProbT* grads,
                           ProbT* costs,
                           const int* const flat_labels,
                           const int* const label_lengths,
                           const int* const input_lengths,
                           bool compute_alpha,
                           bool compute_betas_and_grad);

    int out_dim_; // Number of characters plus blank
    int minibatch_;
    int S_; // Max blank-expanded label length (2L+1) over valid utterances
    int T_; // Max input length over valid utterances
    int activation_cols_; // Number of columns in activations
    void *gpu_workspace_; // Buffer for all temporary GPU memory
    CUstream stream_;
    int blank_label_;
    int *utt_length_; // T
    int *label_sizes_; // L
    int *repeats_; // repeats_
    int *label_offsets_;
    int *labels_without_blanks_;
    int *labels_with_blanks_;
    ProbT *alphas_;
    ProbT *nll_forward_;
    ProbT *nll_backward_;
    ProbT *denoms_; // Temporary storage for denoms for softmax
    ProbT *log_probs_; // Temporary storage for probabilities (log softmax output)
};
// Lays out all temporaries inside gpu_workspace_ (by accumulating byte
// offsets into gpu_bytes_used) and asynchronously copies the per-utterance
// metadata (input lengths, label lengths, repeat counts, label offsets and
// the labels themselves) to the device. Also computes S_ (max blank-expanded
// label length) and T_ (max input length) over *valid* utterances.
// NOTE(review): the caller must have sized gpu_workspace_ to hold every
// region carved out below — this function does not bounds-check.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths)
{
size_t gpu_bytes_used = 0;
// Per-utterance forward negative log-likelihoods.
nll_forward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
// Per-utterance backward negative log-likelihoods.
nll_backward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
// Per-utterance count of adjacent repeated labels.
repeats_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// Per-utterance start offset into the flat label array.
label_offsets_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
// Metadata is computed on the host in fixed-size chunks and uploaded one
// chunk at a time, so only 2 * 64 ints of host staging are needed.
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass+1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
// Count adjacent equal labels: each repeat forces an extra blank step,
// so an utterance needs at least L + repeats timesteps to be decodable.
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i-1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
// NOTE(review): these async copies read from pageable stack buffers that
// are overwritten on the next pass; this relies on cudaMemcpyAsync being
// synchronous w.r.t. the host for pageable memory — confirm if the
// staging buffers are ever made pinned.
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
// Expand the label length to the CTC state count: blanks interleaved
// between labels and at both ends give 2L + 1 states.
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Region is sized for the worst case (Lmax per utterance) even though only
// total_label_length ints are copied.
labels_without_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Filled in later by a kernel; no host copy here.
labels_with_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
// Forward (alpha) DP table: S_ states x T_ timesteps per utterance.
alphas_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
log_probs_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
// Launches the forward (alpha) and/or backward (beta + gradient) CTC
// kernels with NT threads per block and VT items per thread.
// Grid layout: one thread block per utterance (grid_size = minibatch_).
// Returns CTC_STATUS_EXECUTION_FAILED if a launch or the stream
// synchronization after the beta kernel fails.
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const log_probs,
                                                     ProbT* grads,
                                                     bool compute_alpha,
                                                     bool compute_beta) {
    // One thread block per utterance
    const int grid_size = minibatch_;
    // The data is laid out so that the next timestep is minibatch entries
    // away
    const int stride = minibatch_;
    if (compute_alpha)
        compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (log_probs, label_sizes_, utt_length_,
             repeats_, labels_without_blanks_, label_offsets_,
             labels_with_blanks_, alphas_, nll_forward_,
             stride, out_dim_, S_, T_, blank_label_);
    if (compute_beta) {
        compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (log_probs, label_sizes_, utt_length_, repeats_,
             labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
             grads, stride, out_dim_, S_, T_, blank_label_);
        // Fix: the synchronization result was previously ignored; an async
        // execution error surfacing here would have gone unreported.
        if (cudaStreamSynchronize(stream_) != cudaSuccess)
            return CTC_STATUS_EXECUTION_FAILED;
    }
    // Catch launch-configuration errors from the kernel launches above.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        return CTC_STATUS_EXECUTION_FAILED;
    return CTC_STATUS_SUCCESS;
}
// Uploads per-utterance metadata to the GPU, then selects the smallest
// kernel configuration (NT threads/block x VT items/thread) whose capacity
// NT*VT covers S_, the max blank-expanded label length. Writes the chosen
// index into best_config.
// Returns CTC_STATUS_UNKNOWN_ERROR when no configuration is large enough
// (S_ > 128 * 10 = 1280 states).
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
                                                 const int* const label_lengths,
                                                 const int* const input_lengths,
                                                 size_t& best_config) {
    // Setup the metadata for GPU
    ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    // Candidate configurations, ordered by increasing capacity NT*VT.
    // static constexpr: built once instead of re-initialized on every call.
    constexpr int num_configs = 12;
    static constexpr int config_NT[num_configs] =
        {32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
    static constexpr int config_VT[num_configs] =
        { 1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
    // Advance past every configuration too small to hold S_ states.
    best_config = 0;
    while (best_config < num_configs &&
           (config_NT[best_config] * config_VT[best_config]) < S_)
        ++best_config;
    if (best_config >= num_configs)
        return CTC_STATUS_UNKNOWN_ERROR;
    return CTC_STATUS_SUCCESS;
}
// Dispatches the runtime-selected configuration index to the matching
// compile-time <NT, VT> instantiation of launch_alpha_beta_kernels.
// The case order must stay in sync with the config_NT/config_VT tables in
// create_metadata_and_choose_config.
// Renamed l_a/l_b to launch_alpha/launch_beta to match the in-class
// declaration (internal names only; no caller impact).
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const log_probs,
                                  ProbT* grads,
                                  size_t config,
                                  bool launch_alpha,
                                  bool launch_beta) {
    switch (config) {
        case  0: return launch_alpha_beta_kernels< 32,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  1: return launch_alpha_beta_kernels< 64,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  2: return launch_alpha_beta_kernels<128,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  3: return launch_alpha_beta_kernels< 64,  3>(log_probs, grads, launch_alpha, launch_beta);
        case  4: return launch_alpha_beta_kernels<128,  2>(log_probs, grads, launch_alpha, launch_beta);
        case  5: return launch_alpha_beta_kernels< 32,  9>(log_probs, grads, launch_alpha, launch_beta);
        case  6: return launch_alpha_beta_kernels< 64,  6>(log_probs, grads, launch_alpha, launch_beta);
        case  7: return launch_alpha_beta_kernels<128,  4>(log_probs, grads, launch_alpha, launch_beta);
        case  8: return launch_alpha_beta_kernels< 64,  9>(log_probs, grads, launch_alpha, launch_beta);
        case  9: return launch_alpha_beta_kernels<128,  6>(log_probs, grads, launch_alpha, launch_beta);
        case 10: return launch_alpha_beta_kernels<128,  9>(log_probs, grads, launch_alpha, launch_beta);
        case 11: return launch_alpha_beta_kernels<128, 10>(log_probs, grads, launch_alpha, launch_beta);
    }
    // config out of range: no instantiation available.
    return CTC_STATUS_EXECUTION_FAILED;
}
// Computes a numerically stable log-softmax of the activations into
// log_probs_ using mshadow expression templates interleaved with raw
// kernels, all on stream_:
//   1. copy activations -> log_probs_
//   2. denoms_ = rowwise max (for stability)
//   3. kernel: subtract the max from each element
//   4. denoms_ = rowwise sum of exp(shifted values)
//   5. kernel: finish the log-softmax using denoms_
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_log_probs(const ProbT* const activations) {
cudaError_t cuda_status;
// Work on a device-side copy so the caller's activations stay untouched.
cuda_status =
cudaMemcpyAsync(log_probs_, activations,
activation_cols_ * out_dim_ *sizeof(ProbT),
cudaMemcpyDeviceToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// create mshadow handles to data
using namespace mshadow;
using namespace mshadow::expr;
// Wrap the raw CUDA stream so mshadow issues its work on the same stream,
// keeping the copy/kernel/reduction sequence ordered without extra syncs.
Stream<mxnet::gpu> mxstream;
mxstream.stream_ = stream_;
Tensor<mxnet::gpu, 2, ProbT> log_probs_handle(log_probs_, mshadow::Shape2(activation_cols_, out_dim_), &mxstream);
Tensor<mxnet::gpu, 1, ProbT> denoms_handle(denoms_, mshadow::Shape1(activation_cols_), &mxstream);
// Row-wise maximum over the alphabet axis, for the stable-softmax shift.
denoms_handle = reduce_with_axis<red::maximum, false>(log_probs_handle, 1);
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_LSM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), log_probs_,
denoms_, out_dim_, num_elements);
// compute denominators for softmax
denoms_handle = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::exp>(log_probs_handle), 1);
// Kernel launch to calculate probabilities
compute_log_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), log_probs_,
denoms_, out_dim_, num_elements);
// NOTE(review): this only catches errors still pending at this point; a
// launch error from the first kernel may be overwritten by the mshadow
// reduction in between — consider checking after each launch.
cuda_status = cudaGetLastError();
if (cuda_status != cudaSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
// Shared driver for cost_and_grad / score_forward: stages metadata, picks a
// kernel configuration, computes log-softmax, runs the requested DP passes,
// and copies the per-utterance costs (nll_forward_) back to the host.
// Blocks on stream_ before returning, so `costs` is valid on exit.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
                                      ProbT* grads,
                                      ProbT* costs,
                                      const int* const flat_labels,
                                      const int* const label_lengths,
                                      const int* const input_lengths,
                                      bool compute_alpha,
                                      bool compute_betas_and_grad) {
    size_t best_config;
    ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
                                                           label_lengths,
                                                           input_lengths,
                                                           best_config);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    status = compute_log_probs(activations);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    // Fix: the return status was previously discarded, so a failed kernel
    // launch would fall through and copy back garbage costs.
    status = launch_gpu_kernels(log_probs_, grads, best_config,
                                compute_alpha, compute_betas_and_grad);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    cudaError_t cuda_status_mem, cuda_status_sync;
    cuda_status_mem = cudaMemcpyAsync(costs, nll_forward_,
                                      sizeof(ProbT) * minibatch_,
                                      cudaMemcpyDeviceToHost, stream_);
    // Synchronize so the costs are readable on the host when we return.
    cuda_status_sync = cudaStreamSynchronize(stream_);
    if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
        return CTC_STATUS_MEMOPS_FAILED;
    return CTC_STATUS_SUCCESS;
}
// Computes per-utterance CTC costs and the gradients w.r.t. the
// activations. Validates that no pointer argument is null, then delegates
// to the shared driver with both the alpha and beta passes enabled.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                             ProbT* grads,
                             ProbT* costs,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths) {
    const bool args_ok = activations != nullptr &&
                         grads != nullptr &&
                         costs != nullptr &&
                         flat_labels != nullptr &&
                         label_lengths != nullptr &&
                         input_lengths != nullptr;
    if (!args_ok)
        return CTC_STATUS_INVALID_VALUE;
    // compute_alpha = true, compute_betas_and_grad = true.
    return compute_cost_and_score(activations, grads, costs, flat_labels,
                                  label_lengths, input_lengths, true, true);
}
// Computes per-utterance CTC costs only (no gradients). Validates the
// pointer arguments, then delegates to the shared driver with the beta /
// gradient pass disabled and a null gradient buffer.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT* const activations,
                             ProbT* costs,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths) {
    const bool args_ok = activations != nullptr &&
                         costs != nullptr &&
                         flat_labels != nullptr &&
                         label_lengths != nullptr &&
                         input_lengths != nullptr;
    if (!args_ok)
        return CTC_STATUS_INVALID_VALUE;
    // compute_alpha = true, compute_betas_and_grad = false; grads = nullptr.
    return compute_cost_and_score(activations, nullptr, costs, flat_labels,
                                  label_lengths, input_lengths, true, false);
}
} // mxnet_warpctc