blob: 2c521b5abb5d0badb30f407bc3855a04c8ed0932 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
namespace mxnet_warpctc {
// GPU implementation of the CTC (Connectionist Temporal Classification)
// loss and gradient. All temporary device memory is carved out of a single
// caller-provided workspace buffer; all GPU work is issued on the stream
// given at construction.
template <typename ProbT>
class GpuCTC {
 public:
    // alphabet_size: number of characters including the blank.
    // minibatch:     number of utterances processed per call.
    // workspace:     caller-allocated device buffer for all temporaries.
    // stream:        CUDA stream all copies/kernels are issued on.
    // blank_label:   index of the blank symbol in the alphabet.
    GpuCTC(int alphabet_size,
           int minibatch,
           void *workspace,
           CUstream stream,
           int blank_label) :
        out_dim_(alphabet_size), minibatch_(minibatch),
        gpu_workspace_(workspace), stream_(stream),
        blank_label_(blank_label) {};

    // Noncopyable: owns raw pointers into the shared workspace.
    GpuCTC(const GpuCTC&) = delete;
    GpuCTC& operator=(const GpuCTC&) = delete;

    // Computes per-utterance negative log-likelihoods (costs) AND gradients
    // w.r.t. the activations.
    ctcStatus_t
    cost_and_grad(const ProbT* const activations,
                  ProbT* grads,
                  ProbT* costs,
                  const int* const flat_labels,
                  const int* const label_lengths,
                  const int* const input_lengths);

    // Forward-only scoring: computes costs without gradients.
    ctcStatus_t
    score_forward(const ProbT* const activations,
                  ProbT* costs,
                  const int* const flat_labels,
                  const int* const label_lengths,
                  const int* const input_lengths);

 private:
    // Launches the alpha and/or beta dynamic-programming kernels with a
    // fixed (NT threads, VT items-per-thread) configuration.
    template<int NT, int VT>
    ctcStatus_t launch_alpha_beta_kernels(const ProbT* const log_probs,
                                          ProbT *grads,
                                          bool compute_alpha,
                                          bool compute_beta);

    // Dispatches to the template instantiation selected by `config`.
    ctcStatus_t
    launch_gpu_kernels(const ProbT* const log_probs,
                       ProbT *grads,
                       size_t config,
                       bool launch_alpha,
                       bool launch_beta);

    // Copies per-utterance metadata (lengths, repeats, offsets, labels)
    // to the device and lays out the workspace.
    ctcStatus_t
    setup_gpu_metadata(const int* const flat_labels,
                       const int* const label_lengths,
                       const int* const input_lengths);

    // NOTE: parameter names previously read (label_lengths, flat_labels, ...)
    // here, the reverse of the definition and of every call site; the names
    // below now match the actual argument order.
    ctcStatus_t
    create_metadata_and_choose_config(const int* const flat_labels,
                                      const int* const label_lengths,
                                      const int* const input_lengths,
                                      size_t& best_config);

    // Computes the stable log-softmax of the activations into log_probs_.
    ctcStatus_t
    compute_log_probs(const ProbT* const activations);

    // Shared driver behind cost_and_grad / score_forward.
    ctcStatus_t
    compute_cost_and_score(const ProbT* const activations,
                           ProbT* grads,
                           ProbT* costs,
                           const int* const flat_labels,
                           const int* const label_lengths,
                           const int* const input_lengths,
                           bool compute_alpha,
                           bool compute_betas_and_grad);

    int out_dim_; // Number of characters plus blank
    int minibatch_;
    int S_; // Max blank-expanded label length (2L+1) over valid utterances
    int T_; // Max input length over valid utterances
    int activation_cols_; // Number of columns in activations
    void *gpu_workspace_; // Buffer for all temporary GPU memory
    CUstream stream_;
    int blank_label_;
    int *utt_length_; // T
    int *label_sizes_; // L
    int *repeats_; // repeats_
    int *label_offsets_;
    int *labels_without_blanks_;
    int *labels_with_blanks_;
    ProbT *alphas_;
    ProbT *nll_forward_;
    ProbT *nll_backward_;
    ProbT *denoms_; // Temporary storage for denoms for softmax
    ProbT *log_probs_; // Temporary storage for probabilities (log softmax output)
};
// Lays out all temporaries inside gpu_workspace_ (by accumulating byte
// offsets into gpu_bytes_used) and asynchronously copies the per-utterance
// metadata (input lengths, label lengths, repeat counts, label offsets and
// the labels themselves) to the device. Also computes S_ (max blank-expanded
// label length) and T_ (max input length) over *valid* utterances.
// NOTE(review): the caller must have sized gpu_workspace_ to hold every
// region carved out below — this function does not bounds-check.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths)
{
size_t gpu_bytes_used = 0;
// Per-utterance forward negative log-likelihoods.
nll_forward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
// Per-utterance backward negative log-likelihoods.
nll_backward_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
// Per-utterance count of adjacent repeated labels.
repeats_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// Per-utterance start offset into the flat label array.
label_offsets_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
// Metadata is computed on the host in fixed-size chunks and uploaded one
// chunk at a time, so only 2 * 64 ints of host staging are needed.
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass+1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int *label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
// Count adjacent equal labels: each repeat forces an extra blank step,
// so an utterance needs at least L + repeats timesteps to be decodable.
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i-1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
// NOTE(review): these async copies read from pageable stack buffers that
// are overwritten on the next pass; this relies on cudaMemcpyAsync being
// synchronous w.r.t. the host for pageable memory — confirm if the
// staging buffers are ever made pinned.
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
// Expand the label length to the CTC state count: blanks interleaved
// between labels and at both ends give 2L + 1 states.
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_, input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Region is sized for the worst case (Lmax per utterance) even though only
// total_label_length ints are copied.
labels_without_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// Filled in later by a kernel; no host copy here.
labels_with_blanks_ =
reinterpret_cast<int *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
// Forward (alpha) DP table: S_ states x T_ timesteps per utterance.
alphas_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
log_probs_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
// Launches the forward (alpha) and/or backward (beta + gradient) CTC
// kernels with NT threads per block and VT items per thread.
// Grid layout: one thread block per utterance (grid_size = minibatch_).
// Returns CTC_STATUS_EXECUTION_FAILED if a launch or the stream
// synchronization after the beta kernel fails.
template<typename ProbT>
template<int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const log_probs,
                                                     ProbT* grads,
                                                     bool compute_alpha,
                                                     bool compute_beta) {
    // One thread block per utterance
    const int grid_size = minibatch_;
    // The data is laid out so that the next timestep is minibatch entries
    // away
    const int stride = minibatch_;
    if (compute_alpha)
        compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (log_probs, label_sizes_, utt_length_,
             repeats_, labels_without_blanks_, label_offsets_,
             labels_with_blanks_, alphas_, nll_forward_,
             stride, out_dim_, S_, T_, blank_label_);
    if (compute_beta) {
        compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
            (log_probs, label_sizes_, utt_length_, repeats_,
             labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
             grads, stride, out_dim_, S_, T_, blank_label_);
        // Fix: the synchronization result was previously ignored; an async
        // execution error surfacing here would have gone unreported.
        if (cudaStreamSynchronize(stream_) != cudaSuccess)
            return CTC_STATUS_EXECUTION_FAILED;
    }
    // Catch launch-configuration errors from the kernel launches above.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        return CTC_STATUS_EXECUTION_FAILED;
    return CTC_STATUS_SUCCESS;
}
// Uploads per-utterance metadata to the GPU, then selects the smallest
// kernel configuration (NT threads/block x VT items/thread) whose capacity
// NT*VT covers S_, the max blank-expanded label length. Writes the chosen
// index into best_config.
// Returns CTC_STATUS_UNKNOWN_ERROR when no configuration is large enough
// (S_ > 128 * 10 = 1280 states).
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::create_metadata_and_choose_config(const int* const flat_labels,
                                                 const int* const label_lengths,
                                                 const int* const input_lengths,
                                                 size_t& best_config) {
    // Setup the metadata for GPU
    ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    // Candidate configurations, ordered by increasing capacity NT*VT.
    // static constexpr: built once instead of re-initialized on every call.
    constexpr int num_configs = 12;
    static constexpr int config_NT[num_configs] =
        {32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
    static constexpr int config_VT[num_configs] =
        { 1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
    // Advance past every configuration too small to hold S_ states.
    best_config = 0;
    while (best_config < num_configs &&
           (config_NT[best_config] * config_VT[best_config]) < S_)
        ++best_config;
    if (best_config >= num_configs)
        return CTC_STATUS_UNKNOWN_ERROR;
    return CTC_STATUS_SUCCESS;
}
// Dispatches the runtime-selected configuration index to the matching
// compile-time <NT, VT> instantiation of launch_alpha_beta_kernels.
// The case order must stay in sync with the config_NT/config_VT tables in
// create_metadata_and_choose_config.
// Renamed l_a/l_b to launch_alpha/launch_beta to match the in-class
// declaration (internal names only; no caller impact).
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::launch_gpu_kernels(const ProbT* const log_probs,
                                  ProbT* grads,
                                  size_t config,
                                  bool launch_alpha,
                                  bool launch_beta) {
    switch (config) {
        case  0: return launch_alpha_beta_kernels< 32,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  1: return launch_alpha_beta_kernels< 64,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  2: return launch_alpha_beta_kernels<128,  1>(log_probs, grads, launch_alpha, launch_beta);
        case  3: return launch_alpha_beta_kernels< 64,  3>(log_probs, grads, launch_alpha, launch_beta);
        case  4: return launch_alpha_beta_kernels<128,  2>(log_probs, grads, launch_alpha, launch_beta);
        case  5: return launch_alpha_beta_kernels< 32,  9>(log_probs, grads, launch_alpha, launch_beta);
        case  6: return launch_alpha_beta_kernels< 64,  6>(log_probs, grads, launch_alpha, launch_beta);
        case  7: return launch_alpha_beta_kernels<128,  4>(log_probs, grads, launch_alpha, launch_beta);
        case  8: return launch_alpha_beta_kernels< 64,  9>(log_probs, grads, launch_alpha, launch_beta);
        case  9: return launch_alpha_beta_kernels<128,  6>(log_probs, grads, launch_alpha, launch_beta);
        case 10: return launch_alpha_beta_kernels<128,  9>(log_probs, grads, launch_alpha, launch_beta);
        case 11: return launch_alpha_beta_kernels<128, 10>(log_probs, grads, launch_alpha, launch_beta);
    }
    // config out of range: no instantiation available.
    return CTC_STATUS_EXECUTION_FAILED;
}
// Computes a numerically stable log-softmax of the activations into
// log_probs_ using mshadow expression templates interleaved with raw
// kernels, all on stream_:
//   1. copy activations -> log_probs_
//   2. denoms_ = rowwise max (for stability)
//   3. kernel: subtract the max from each element
//   4. denoms_ = rowwise sum of exp(shifted values)
//   5. kernel: finish the log-softmax using denoms_
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_log_probs(const ProbT* const activations) {
cudaError_t cuda_status;
// Work on a device-side copy so the caller's activations stay untouched.
cuda_status =
cudaMemcpyAsync(log_probs_, activations,
activation_cols_ * out_dim_ *sizeof(ProbT),
cudaMemcpyDeviceToDevice, stream_);
if (cuda_status != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
// create mshadow handles to data
using namespace mshadow;
using namespace mshadow::expr;
// Wrap the raw CUDA stream so mshadow issues its work on the same stream,
// keeping the copy/kernel/reduction sequence ordered without extra syncs.
Stream<mxnet::gpu> mxstream;
mxstream.stream_ = stream_;
Tensor<mxnet::gpu, 2, ProbT> log_probs_handle(log_probs_, mshadow::Shape2(activation_cols_, out_dim_), &mxstream);
Tensor<mxnet::gpu, 1, ProbT> denoms_handle(denoms_, mshadow::Shape1(activation_cols_), &mxstream);
// Row-wise maximum over the alphabet axis, for the stable-softmax shift.
denoms_handle = reduce_with_axis<red::maximum, false>(log_probs_handle, 1);
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_LSM_kernel<ProbT, VT> <<< grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), log_probs_,
denoms_, out_dim_, num_elements);
// compute denominators for softmax
denoms_handle = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::exp>(log_probs_handle), 1);
// Kernel launch to calculate probabilities
compute_log_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
(ctc_helper::identity<ProbT>(), log_probs_,
denoms_, out_dim_, num_elements);
// NOTE(review): this only catches errors still pending at this point; a
// launch error from the first kernel may be overwritten by the mshadow
// reduction in between — consider checking after each launch.
cuda_status = cudaGetLastError();
if (cuda_status != cudaSuccess)
return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
// Shared driver for cost_and_grad / score_forward: stages metadata, picks a
// kernel configuration, computes log-softmax, runs the requested DP passes,
// and copies the per-utterance costs (nll_forward_) back to the host.
// Blocks on stream_ before returning, so `costs` is valid on exit.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::compute_cost_and_score(const ProbT* const activations,
                                      ProbT* grads,
                                      ProbT* costs,
                                      const int* const flat_labels,
                                      const int* const label_lengths,
                                      const int* const input_lengths,
                                      bool compute_alpha,
                                      bool compute_betas_and_grad) {
    size_t best_config;
    ctcStatus_t status = create_metadata_and_choose_config(flat_labels,
                                                           label_lengths,
                                                           input_lengths,
                                                           best_config);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    status = compute_log_probs(activations);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    // Fix: the return status was previously discarded, so a failed kernel
    // launch would fall through and copy back garbage costs.
    status = launch_gpu_kernels(log_probs_, grads, best_config,
                                compute_alpha, compute_betas_and_grad);
    if (status != CTC_STATUS_SUCCESS)
        return status;
    cudaError_t cuda_status_mem, cuda_status_sync;
    cuda_status_mem = cudaMemcpyAsync(costs, nll_forward_,
                                      sizeof(ProbT) * minibatch_,
                                      cudaMemcpyDeviceToHost, stream_);
    // Synchronize so the costs are readable on the host when we return.
    cuda_status_sync = cudaStreamSynchronize(stream_);
    if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
        return CTC_STATUS_MEMOPS_FAILED;
    return CTC_STATUS_SUCCESS;
}
// Computes per-utterance CTC costs and the gradients w.r.t. the
// activations. Validates that no pointer argument is null, then delegates
// to the shared driver with both the alpha and beta passes enabled.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                             ProbT* grads,
                             ProbT* costs,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths) {
    const bool args_ok = activations != nullptr &&
                         grads != nullptr &&
                         costs != nullptr &&
                         flat_labels != nullptr &&
                         label_lengths != nullptr &&
                         input_lengths != nullptr;
    if (!args_ok)
        return CTC_STATUS_INVALID_VALUE;
    // compute_alpha = true, compute_betas_and_grad = true.
    return compute_cost_and_score(activations, grads, costs, flat_labels,
                                  label_lengths, input_lengths, true, true);
}
// Computes per-utterance CTC costs only (no gradients). Validates the
// pointer arguments, then delegates to the shared driver with the beta /
// gradient pass disabled and a null gradient buffer.
template<typename ProbT>
ctcStatus_t
GpuCTC<ProbT>::score_forward(const ProbT* const activations,
                             ProbT* costs,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths) {
    const bool args_ok = activations != nullptr &&
                         costs != nullptr &&
                         flat_labels != nullptr &&
                         label_lengths != nullptr &&
                         input_lengths != nullptr;
    if (!args_ok)
        return CTC_STATUS_INVALID_VALUE;
    // compute_alpha = true, compute_betas_and_grad = false; grads = nullptr.
    return compute_cost_and_score(activations, nullptr, costs, flat_labels,
                                  label_lengths, input_lengths, true, false);
}
} // mxnet_warpctc