// 3rdparty/ctc_include/detail/cpu_ctc.h - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #pragma once

 #include <tuple>
 #include <cmath>
 #include <limits>
 #include <algorithm>
 #include <numeric>

 #include <dmlc/omp.h>

 #include "ctc_helper.h"

 namespace mxnet_warpctc {

 // CPU implementation of the CTC cost and gradient for scalar type ProbT.
 //
 // The object stores the problem dimensions and a pointer to caller-provided
 // scratch memory; the constructor allocates nothing. cost_and_grad() and
 // score_forward() place a log-probability buffer at the start of that memory
 // and one CpuCTC_metadata region per minibatch entry after it.
 template<typename ProbT>
 class CpuCTC {
 public:
     // alphabet_size: number of characters plus blank.
     // minibatch:     number of sequences processed per call.
     // workspace:     scratch memory; only the pointer is stored.
     // blank_label:   alphabet index used for the blank symbol.
     CpuCTC(int alphabet_size, int minibatch, void* workspace, int blank_label)
         : alphabet_size_(alphabet_size),
           minibatch_(minibatch),
           workspace_(workspace),
           blank_label_(blank_label) {}

     // Noncopyable
     CpuCTC(const CpuCTC&) = delete;
     CpuCTC& operator=(const CpuCTC&) = delete;

     // Writes one cost per sequence into `costs` and the gradient into `grads`.
     ctcStatus_t cost_and_grad(const ProbT* const activations,
                               ProbT *grads,
                               ProbT* costs,
                               const int* const flat_labels,
                               const int* const label_lengths,
                               const int* const input_lengths);

     // Writes one cost per sequence into `costs`; no gradient is produced.
     ctcStatus_t score_forward(const ProbT* const activations,
                               ProbT* costs,
                               const int* const flat_labels,
                               const int* const label_lengths,
                               const int* const input_lengths);

 private:

     // Pointers into the workspace slice that belongs to a single sequence,
     // plus the repeat count derived from its labels.
     class CpuCTC_metadata {

     public:
         CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
                         void* workspace, size_t bytes_used, int blank_label,
                         const int* const labels);

         ProbT* alphas;          // S * T entries
         ProbT* betas;           // S entries
         int* labels_w_blanks;   // S entries: labels interleaved with blanks
         int* e_inc;             // increments applied to the window end
         int* s_inc;             // increments applied to the window start
         ProbT* output;          // alphabet_size entries
         int repeats;            // number of adjacent equal labels

     private:
         int setup_labels(const int* const labels, int blank_label, int L, int S);
     };

     int alphabet_size_; // Number of characters plus blank
     int minibatch_;
     void* workspace_;
     int blank_label_;

     // Log-softmax over the alphabet for every valid (time, sequence) pair.
     void log_softmax(const ProbT* const activations, ProbT* log_probs,
                      const int* const input_lengths);

     // Cost and gradient for one sequence; the bool reports whether the
     // forward and backward log-likelihoods differ by more than the threshold.
     std::tuple<ProbT, bool>
             cost_and_grad_kernel(ProbT *grad, const ProbT* const log_probs,
                                  const int* const labels, int T, int L,
                                  int mb, size_t bytes_used);

     // Forward pass; returns the log-likelihood.
     ProbT compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
                          const int* const e_inc,
                          const int* const s_inc,
                          const int* const labels,
                          ProbT* alphas);

     // Backward pass fused with the gradient; returns the log-likelihood.
     ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
                                  ProbT log_partition, int repeats,
                                  int S, int T, const int* const e_inc,
                                  const int* const s_inc,
                                  const int* const labels,
                                  ProbT* alphas,
                                  ProbT* betas,
                                  ProbT* output);
 };

 // Carves the scratch arrays for one sequence out of `workspace`, starting
 // `bytes_used` bytes from its beginning, then builds the label tables.
 //
 // Layout, in order: alphas (S * T ProbT, filled with -inf), betas (S ProbT,
 // filled with -inf), labels_w_blanks, e_inc, s_inc (S ints each) and output
 // (alphabet_size ProbT, not initialised here). `mb` is not used.
 template<typename ProbT>
 CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
                                                 int alphabet_size,
                                                 void* workspace, size_t bytes_used,
                                                 int blank_label,
                                                 const int* const labels) {
     char* const base = static_cast<char *>(workspace);
     const ProbT minus_inf = ctc_helper::neg_inf<ProbT>();

     alphas = reinterpret_cast<ProbT *>(base + bytes_used);
     std::fill(alphas, alphas + S * T, minus_inf);
     bytes_used += sizeof(ProbT) * S * T;

     betas = reinterpret_cast<ProbT *>(base + bytes_used);
     std::fill(betas, betas + S, minus_inf);
     bytes_used += sizeof(ProbT) * S;

     labels_w_blanks = reinterpret_cast<int *>(base + bytes_used);
     bytes_used += sizeof(int) * S;

     e_inc = reinterpret_cast<int *>(base + bytes_used);
     bytes_used += sizeof(int) * S;

     s_inc = reinterpret_cast<int *>(base + bytes_used);
     bytes_used += sizeof(int) * S;

     // Last array in the slice; nothing is placed after it.
     output = reinterpret_cast<ProbT *>(base + bytes_used);

     repeats = setup_labels(labels, blank_label, L, S);
 }

 // Fills s_inc, e_inc and labels_w_blanks for one label sequence and returns
 // the number of adjacent equal labels.
 //
 // labels:      the L labels of the sequence.
 // blank_label: value stored at every even index of labels_w_blanks.
 // L:           number of labels.
 // S:           length of labels_w_blanks; index S - 1 receives the blank.
 template<typename ProbT>
 int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
                                                  int blank_label, int L, int S) {
     int* s_out = s_inc;
     int* e_out = e_inc;
     int repeat_count = 0;

     // s_inc always opens with a step of 1.
     *s_out++ = 1;

     for (int i = 1; i < L; ++i) {
         const bool same_as_previous = (labels[i] == labels[i - 1]);
         if (!same_as_previous) {
             // Distinct neighbours: a single step of 2 in both tables.
             *s_out++ = 2;
             *e_out++ = 2;
             continue;
         }
         // Repeated label: two steps of 1 in both tables.
         *s_out++ = 1;
         *s_out++ = 1;
         *e_out++ = 1;
         *e_out++ = 1;
         ++repeat_count;
     }

     // e_inc always closes with a step of 1.
     *e_out = 1;

     // Interleave: blank, labels[0], blank, labels[1], ..., blank.
     int* dst = labels_w_blanks;
     for (int i = 0; i < L; ++i) {
         *dst++ = blank_label;
         *dst++ = labels[i];
     }
     labels_w_blanks[S - 1] = blank_label;

     return repeat_count;
 }

 // Writes the log-softmax of `activations` over the alphabet into `log_probs`.
 //
 // For sequence mb and time step c the alphabet_size_ values start at
 // (mb + minibatch_ * c) * alphabet_size_ in both arrays. Only time steps
 // c < input_lengths[mb] are processed; other entries of log_probs are left
 // untouched. The maximum is subtracted before exponentiating.
 template<typename ProbT>
 void
 CpuCTC<ProbT>::log_softmax(const ProbT* const activations, ProbT* log_probs,
                            const int* const input_lengths) {
 #pragma omp parallel for
     for (int mb = 0; mb < minibatch_; ++mb) {
         for (int c = 0; c < input_lengths[mb]; ++c) {
             const int col_offset = (mb + minibatch_ * c) * alphabet_size_;
             const ProbT* const in = activations + col_offset;
             ProbT* const out = log_probs + col_offset;

             ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
             for (int r = 0; r < alphabet_size_; ++r)
                 max_activation = std::max(max_activation, in[r]);

             ProbT denom = ProbT(0.);
             for (int r = 0; r < alphabet_size_; ++r) {
                 denom += std::exp(in[r] - max_activation);
             }

             // The normaliser is the same for the whole column: compute the
             // logarithm once instead of once per alphabet entry.
             const ProbT log_denom = std::log(denom);

             for (int r = 0; r < alphabet_size_; ++r) {
                 out[r] = in[r] - max_activation - log_denom;
             }
         }
     }
 }

 // Cost and gradient for a single sequence.
 //
 // grad / log_probs: already offset to this sequence's first column.
 // labels:           the L labels of this sequence.
 // T, L:             number of time steps and number of labels.
 // mb, bytes_used:   forwarded to CpuCTC_metadata; bytes_used is the byte
 //                   offset of this sequence's slice inside workspace_.
 //
 // Returns (cost, over_threshold). The cost is the negated forward
 // log-likelihood. over_threshold is true when the forward and backward
 // log-likelihoods differ by more than ctc_helper::threshold. When
 // L + repeats > T the function returns (0, false) and leaves grad untouched.
 template<typename ProbT>
 std::tuple<ProbT, bool>
 CpuCTC<ProbT>::cost_and_grad_kernel(ProbT *grad, const ProbT* const log_probs,
                                     const int* const labels,
                                     int T, int L, int mb, size_t bytes_used) {

     const int S = 2 * L + 1; // Number of labels with blanks

     CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);

     if (L + ctcm.repeats > T) {
         return std::make_tuple(ProbT(0), false); // TODO, not right to return 0
     }

     const ProbT llForward = compute_alphas(log_probs, ctcm.repeats, S, T,
                                            ctcm.e_inc, ctcm.s_inc,
                                            ctcm.labels_w_blanks, ctcm.alphas);

     const ProbT llBackward = compute_betas_and_grad(grad, log_probs, llForward,
                                                     ctcm.repeats, S, T,
                                                     ctcm.e_inc, ctcm.s_inc,
                                                     ctcm.labels_w_blanks,
                                                     ctcm.alphas, ctcm.betas,
                                                     ctcm.output);

     // Both passes compute the same quantity; a large gap is reported.
     const bool over_threshold =
             std::abs(llForward - llBackward) > ctc_helper::threshold;

     return std::make_tuple(-llForward, over_threshold);
 }

 // Computes forward probabilities
 //
 // log_probs: log-probabilities for this sequence; the value for label k at
 //            time t is read at k + t * (alphabet_size_ * minibatch_).
 // repeats:   number of adjacent equal labels in the original sequence.
 // S, T:      length of `labels` and number of time steps.
 // e_inc / s_inc: increments applied to the end / start of the active window.
 // labels:    label sequence with blanks inserted (S entries).
 // alphas:    S * T table, column t stored at offset t * S; only the entries
 //            inside the active window of each column are written.
 //
 // Returns the log-sum of the last column over its active window [start, end).
 template<typename ProbT>
 ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
                                     const int* const e_inc,
                                     const int* const s_inc,
                                     const int* const labels,
                                     ProbT* alphas) {

     // Active window [start, end) of the first column.
     int start =  (((S /2) + repeats - T) < 0) ? 0 : 1,
             end = S > 1 ? 2 : 1;

     // Column 0 takes the log-probabilities of the window's labels directly.
     for (int i = start; i < end; ++i) {
         alphas[i] = log_probs[labels[i]];
     }

     for(int t = 1; t < T; ++t) {
         // Move the window for this column using the increment tables.
         int remain = (S / 2) + repeats - (T - t);
         if(remain >= 0)
             start += s_inc[remain];
         if(t <= (S / 2) + repeats)
             end += e_inc[t - 1];
         int startloop = start;
         // idx1: current alpha column, idx2: previous alpha column,
         // idx3: offset of time step t inside log_probs.
         int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

         // Position 0 has a single predecessor (itself in the previous column).
         if (start == 0) {
             alphas[idx1] = alphas[idx2] + log_probs[blank_label_ + idx3];
             startloop += 1;
         }

         for(int i = startloop; i < end; ++i) {
             // Predecessors i and i - 1 of the previous column.
             ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i-1) + idx2]);

             // Skip two if not on blank and not on repeat.
             if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i-2])
                 prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i-2) + idx2]);

             alphas[i + idx1] = prev_sum + log_probs[labels[i] + idx3];
         }
     }

     // Accumulate the final column over its active window.
     ProbT loglike = ctc_helper::neg_inf<ProbT>();
     for(int i = start; i < end; ++i) {
         loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
     }

     return loglike;
 }

 // Starting from T, we sweep backward over the alpha array computing one column
 // of betas as we go.  At each position we can update product alpha * beta and then
 // sum into the gradient associated with each label.
 // NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
 // Assumed passed in grads are already zeroed!
 //
 // grad / log_probs: entry for label k at time t is at
 //                   k + t * (alphabet_size_ * minibatch_).
 // log_partition:    log-likelihood used to normalise the gradient (the caller
 //                   passes the result of compute_alphas).
 // repeats, S, T, e_inc, s_inc, labels: same meaning as in compute_alphas.
 // alphas:           S * T table from the forward pass; overwritten in place
 //                   with alpha + beta (log space) inside the active window.
 // betas:            a single column of S entries, updated in place per step.
 // output:           alphabet_size_ scratch entries, reset for every time step.
 //
 // Returns the log-sum of the betas in the active window after the last step.
 template<typename ProbT>
 ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
                                             ProbT log_partition, int repeats,
                                             int S, int T, const int* const e_inc,
                                             const int* const s_inc,
                                             const int* const labels,
                                             ProbT* alphas,
                                             ProbT* betas,
                                             ProbT* output) {
     // Active window [start, end) of the last column.
     int start = S > 1 ? (S - 2) : 0,
             end = (T > (S / 2) + repeats) ? S : S-1;

     std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

     //set the starting values in the beta column at the very right edge
     for (int i = start; i < end; ++i) {
         betas[i] = log_probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)];

         //compute alpha * beta in log space at this position in (S, T) space
         alphas[i + (T - 1) * S] += betas[i];

         //update the gradient associated with this label
         //essentially performing a reduce-by-key in a sequential manner
         output[labels[i]] =
                 ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
     }

     //update the gradient w.r.t. each unique label
     for (int i = 0; i < alphabet_size_; ++i) {
         int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

         // Labels with no accumulated mass (or -inf log-probability) get the
         // plain softmax probability as gradient.
         if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
             log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
             grad[idx3] = std::exp(log_probs[idx3]);
         } else {
             grad[idx3] = std::exp(log_probs[idx3])
                          - std::exp(output[i] - log_probs[idx3] - log_partition);
         }
     }

     //loop from the second to last column all the way to the left
     for(int t = T - 2; t >= 0; --t) {
         // Move the window backwards using the increment tables.
         int remain = (S / 2) + repeats - (T - t);
         if(remain >= -1)
             start -= s_inc[remain + 1];
         if(t < (S / 2) + repeats)
             end -= e_inc[t];

         // Position S - 1 has no successor at i + 1; it is handled separately
         // after the loop.
         int endloop = end == S ? end - 1 : end;
         // idx1: alpha column of time t, idx3: offset of time t in log_probs/grad.
         int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

         std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

         // betas[i + 1] and betas[i + 2] still hold the values of time t + 1
         // when betas[i] is overwritten, because i increases.
         for(int i = start; i < endloop; ++i) {
             ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i+1)]);
             // Skip two if not on blank and not on repeat.
             if (labels[i] != blank_label_ && i != (S-2) && labels[i] != labels[i+2]){
                 next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i+2)]);
             }
             betas[i] = next_sum + log_probs[labels[i] + idx3];

             //compute alpha * beta in log space
             alphas[i + idx1] += betas[i];

             //update the gradient associated with this label
             output[labels[i]] =
                     ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
         }

         // Last position: its only successor is itself at time t + 1.
         if (end == S) {
             betas[(S-1)] = betas[(S-1)] + log_probs[blank_label_ + idx3];
             alphas[(S-1) + idx1] += betas[(S-1)];

             output[labels[S-1]] =
                     ctc_helper::log_plus<ProbT>()(alphas[S-1 + idx1], output[labels[S-1]]);
         }

         //go over the unique labels and compute the final grad
         // w.r.t. each one at this time step
         for (int i = 0; i < alphabet_size_; ++i) {

             if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
                 log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
                 grad[idx3] = std::exp(log_probs[idx3]);
             } else {
                 grad[idx3] = std::exp(log_probs[idx3])
                              - std::exp(output[i] - log_probs[idx3] - log_partition);
             }
             ++idx3;
         }
     }

     // Accumulate the betas left in the window after processing t = 0.
     ProbT loglike = ctc_helper::neg_inf<ProbT>();
     for(int i = start; i < end; ++i) {
         loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
     }

     return loglike;
 }

 // Computes the CTC cost and gradient for every sequence of the minibatch.
 //
 // activations:   unnormalised scores; the alphabet_size_ values of sequence
 //                mb at time t start at (mb + minibatch_ * t) * alphabet_size_.
 // grads:         gradient output, indexed like activations.
 // costs:         one cost per sequence (minibatch_ entries).
 // flat_labels:   labels of all sequences, concatenated in minibatch order.
 // label_lengths: number of labels per sequence.
 // input_lengths: number of time steps per sequence.
 //
 // Workspace layout: the log-probabilities come first
 // (sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT bytes), followed by one
 // region of per_minibatch_bytes for each sequence. The workspace handed to
 // the constructor must be large enough for this layout.
 //
 // Returns CTC_STATUS_INVALID_VALUE when a pointer argument or the workspace
 // is null or when minibatch_ is not positive; CTC_STATUS_SUCCESS otherwise.
 template<typename ProbT>
 ctcStatus_t
 CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                              ProbT *grads,
                              ProbT *costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths) {
     if (activations == nullptr ||
         grads == nullptr ||
         costs == nullptr ||
         flat_labels == nullptr ||
         label_lengths == nullptr ||
         input_lengths == nullptr ||
         workspace_ == nullptr
         )
         return CTC_STATUS_INVALID_VALUE;

     // std::max_element on an empty range returns the end iterator, which
     // must not be dereferenced.
     if (minibatch_ <= 0)
         return CTC_STATUS_INVALID_VALUE;

     ProbT* log_probs = static_cast<ProbT *>(workspace_);

     const int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
     const int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
     const int maxS = 2 * maxL + 1;

     const size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

     //per minibatch memory
     // The ProbT arrays are sized with sizeof(ProbT): using sizeof(float) here
     // makes neighbouring regions overlap whenever ProbT is wider than float.
     size_t per_minibatch_bytes = 0;

     //output
     per_minibatch_bytes += sizeof(ProbT) * alphabet_size_;

     //alphas
     per_minibatch_bytes += sizeof(ProbT) * maxS * maxT;

     //betas
     per_minibatch_bytes += sizeof(ProbT) * maxS;

     //labels w/blanks, e_inc, s_inc
     per_minibatch_bytes += 3 * sizeof(int) * maxS;

     log_softmax(activations, log_probs, input_lengths);

 #pragma omp parallel for
     for (int mb = 0; mb < minibatch_; ++mb) {
         const int T = input_lengths[mb]; // Length of utterance (time)
         const int L = label_lengths[mb]; // Number of labels in transcription

         // The over-threshold flag from the kernel is not used here.
         bool mb_status;

         std::tie(costs[mb], mb_status) =
                 cost_and_grad_kernel(grads + mb * alphabet_size_,
                                      log_probs + mb * alphabet_size_,
                                      // labels of sequence mb start after all earlier ones
                                      flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
                                      T, L, mb,
                                      bytes_used + mb * per_minibatch_bytes);
     }

     return CTC_STATUS_SUCCESS;
 }

 // Computes the CTC cost of every sequence of the minibatch (forward pass
 // only, no gradient).
 //
 // activations:   unnormalised scores; the alphabet_size_ values of sequence
 //                mb at time t start at (mb + minibatch_ * t) * alphabet_size_.
 // costs:         one cost per sequence (minibatch_ entries); 0 is stored for
 //                a sequence with L + repeats > T.
 // flat_labels:   labels of all sequences, concatenated in minibatch order.
 // label_lengths: number of labels per sequence.
 // input_lengths: number of time steps per sequence.
 //
 // Workspace layout: the log-probabilities come first
 // (sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT bytes), followed by one
 // region of per_minibatch_bytes for each sequence.
 //
 // Returns CTC_STATUS_INVALID_VALUE when a pointer argument or the workspace
 // is null or when minibatch_ is not positive; CTC_STATUS_SUCCESS otherwise.
 template<typename ProbT>
 ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                          ProbT* costs,
                                          const int* const flat_labels,
                                          const int* const label_lengths,
                                          const int* const input_lengths) {
     if (activations == nullptr ||
         costs == nullptr ||
         flat_labels == nullptr ||
         label_lengths == nullptr ||
         input_lengths == nullptr ||
         workspace_ == nullptr
         )
         return CTC_STATUS_INVALID_VALUE;

     // std::max_element on an empty range returns the end iterator, which
     // must not be dereferenced.
     if (minibatch_ <= 0)
         return CTC_STATUS_INVALID_VALUE;

     ProbT* log_probs = static_cast<ProbT *>(workspace_);

     const int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
     const int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
     const int maxS = 2 * maxL + 1;

     const size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

     //per minibatch memory
     // The ProbT arrays are sized with sizeof(ProbT): using sizeof(float) here
     // makes neighbouring regions overlap whenever ProbT is wider than float.
     size_t per_minibatch_bytes = 0;

     //output
     per_minibatch_bytes += sizeof(ProbT) * alphabet_size_;

     //alphas
     per_minibatch_bytes += sizeof(ProbT) * maxS * maxT;

     //betas
     per_minibatch_bytes += sizeof(ProbT) * maxS;

     //labels w/blanks, e_inc, s_inc
     per_minibatch_bytes += 3 * sizeof(int) * maxS;

     log_softmax(activations, log_probs, input_lengths);

 #pragma omp parallel for
     for (int mb = 0; mb < minibatch_; ++mb) {
         const int T = input_lengths[mb]; // Length of utterance (time)
         const int L = label_lengths[mb]; // Number of labels in transcription
         const int S = 2*L + 1; // Number of labels with blanks

         // Labels of sequence mb start after those of all earlier sequences.
         CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
                              bytes_used + mb * per_minibatch_bytes, blank_label_,
                              flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));

         if (L + ctcm.repeats > T) {
             costs[mb] = ProbT(0);
         } else {
             costs[mb] = -compute_alphas(log_probs + mb * alphabet_size_, ctcm.repeats, S, T,
                                         ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks,
                                         ctcm.alphas);
         }
     }

     return CTC_STATUS_SUCCESS;
 }

 } // mxnet_warpctc
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#pragma once

	#include <tuple>
	#include <cmath>
	#include <limits>
	#include <algorithm>
	#include <numeric>

	#include <dmlc/omp.h>

	#include "ctc_helper.h"

	namespace mxnet_warpctc {

	// CPU implementation of the CTC cost and gradient for scalar type ProbT.
	//
	// The object stores the problem dimensions and a pointer to caller-provided
	// scratch memory; the constructor allocates nothing. cost_and_grad() and
	// score_forward() place a log-probability buffer at the start of that memory
	// and one CpuCTC_metadata region per minibatch entry after it.
	template<typename ProbT>
	class CpuCTC {
	public:
	    // alphabet_size: number of characters plus blank.
	    // minibatch:     number of sequences processed per call.
	    // workspace:     scratch memory; only the pointer is stored.
	    // blank_label:   alphabet index used for the blank symbol.
	    CpuCTC(int alphabet_size, int minibatch, void* workspace, int blank_label)
	        : alphabet_size_(alphabet_size),
	          minibatch_(minibatch),
	          workspace_(workspace),
	          blank_label_(blank_label) {}

	    // Noncopyable
	    CpuCTC(const CpuCTC&) = delete;
	    CpuCTC& operator=(const CpuCTC&) = delete;

	    // Writes one cost per sequence into `costs` and the gradient into `grads`.
	    ctcStatus_t cost_and_grad(const ProbT* const activations,
	                              ProbT *grads,
	                              ProbT* costs,
	                              const int* const flat_labels,
	                              const int* const label_lengths,
	                              const int* const input_lengths);

	    // Writes one cost per sequence into `costs`; no gradient is produced.
	    ctcStatus_t score_forward(const ProbT* const activations,
	                              ProbT* costs,
	                              const int* const flat_labels,
	                              const int* const label_lengths,
	                              const int* const input_lengths);

	private:

	    // Pointers into the workspace slice that belongs to a single sequence,
	    // plus the repeat count derived from its labels.
	    class CpuCTC_metadata {

	    public:
	        CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
	                        void* workspace, size_t bytes_used, int blank_label,
	                        const int* const labels);

	        ProbT* alphas;          // S * T entries
	        ProbT* betas;           // S entries
	        int* labels_w_blanks;   // S entries: labels interleaved with blanks
	        int* e_inc;             // increments applied to the window end
	        int* s_inc;             // increments applied to the window start
	        ProbT* output;          // alphabet_size entries
	        int repeats;            // number of adjacent equal labels

	    private:
	        int setup_labels(const int* const labels, int blank_label, int L, int S);
	    };

	    int alphabet_size_; // Number of characters plus blank
	    int minibatch_;
	    void* workspace_;
	    int blank_label_;

	    // Log-softmax over the alphabet for every valid (time, sequence) pair.
	    void log_softmax(const ProbT* const activations, ProbT* log_probs,
	                     const int* const input_lengths);

	    // Cost and gradient for one sequence. grad and log_probs are pointers:
	    // the kernel indexes both and writes through grad.
	    std::tuple<ProbT, bool>
	            cost_and_grad_kernel(ProbT *grad, const ProbT* const log_probs,
	                                 const int* const labels, int T, int L,
	                                 int mb, size_t bytes_used);

	    // Forward pass; returns the log-likelihood.
	    ProbT compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
	                         const int* const e_inc,
	                         const int* const s_inc,
	                         const int* const labels,
	                         ProbT* alphas);

	    // Backward pass fused with the gradient; returns the log-likelihood.
	    ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
	                                 ProbT log_partition, int repeats,
	                                 int S, int T, const int* const e_inc,
	                                 const int* const s_inc,
	                                 const int* const labels,
	                                 ProbT* alphas,
	                                 ProbT* betas,
	                                 ProbT* output);
	};

	// Carves the scratch arrays for one sequence out of `workspace`, starting
	// `bytes_used` bytes from its beginning, then builds the label tables.
	//
	// Layout, in order: alphas (S * T ProbT, filled with -inf), betas (S ProbT,
	// filled with -inf), labels_w_blanks, e_inc, s_inc (S ints each) and output
	// (alphabet_size ProbT, not initialised here). `mb` is not used.
	template<typename ProbT>
	CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
	                                                int alphabet_size,
	                                                void* workspace, size_t bytes_used,
	                                                int blank_label,
	                                                const int* const labels) {
	    // Byte-wise pointer arithmetic needs a char*; each array is then
	    // reinterpreted as a pointer to its element type.
	    char* const base = static_cast<char *>(workspace);
	    const ProbT minus_inf = ctc_helper::neg_inf<ProbT>();

	    alphas = reinterpret_cast<ProbT *>(base + bytes_used);
	    std::fill(alphas, alphas + S * T, minus_inf);
	    bytes_used += sizeof(ProbT) * S * T;

	    betas = reinterpret_cast<ProbT *>(base + bytes_used);
	    std::fill(betas, betas + S, minus_inf);
	    bytes_used += sizeof(ProbT) * S;

	    labels_w_blanks = reinterpret_cast<int *>(base + bytes_used);
	    bytes_used += sizeof(int) * S;

	    e_inc = reinterpret_cast<int *>(base + bytes_used);
	    bytes_used += sizeof(int) * S;

	    s_inc = reinterpret_cast<int *>(base + bytes_used);
	    bytes_used += sizeof(int) * S;

	    // Last array in the slice; nothing is placed after it.
	    output = reinterpret_cast<ProbT *>(base + bytes_used);

	    repeats = setup_labels(labels, blank_label, L, S);
	}

	// Fills s_inc, e_inc and labels_w_blanks for one label sequence and returns
	// the number of adjacent equal labels.
	//
	// labels:      the L labels of the sequence.
	// blank_label: value stored at every even index of labels_w_blanks.
	// L:           number of labels.
	// S:           length of labels_w_blanks; index S - 1 receives the blank.
	template<typename ProbT>
	int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
	                                                 int blank_label, int L, int S) {
	    int* s_out = s_inc;
	    int* e_out = e_inc;
	    int repeat_count = 0;

	    // s_inc always opens with a step of 1.
	    *s_out++ = 1;

	    for (int i = 1; i < L; ++i) {
	        const bool same_as_previous = (labels[i] == labels[i - 1]);
	        if (!same_as_previous) {
	            // Distinct neighbours: a single step of 2 in both tables.
	            *s_out++ = 2;
	            *e_out++ = 2;
	            continue;
	        }
	        // Repeated label: two steps of 1 in both tables.
	        *s_out++ = 1;
	        *s_out++ = 1;
	        *e_out++ = 1;
	        *e_out++ = 1;
	        ++repeat_count;
	    }

	    // e_inc always closes with a step of 1.
	    *e_out = 1;

	    // Interleave: blank, labels[0], blank, labels[1], ..., blank.
	    int* dst = labels_w_blanks;
	    for (int i = 0; i < L; ++i) {
	        *dst++ = blank_label;
	        *dst++ = labels[i];
	    }
	    labels_w_blanks[S - 1] = blank_label;

	    return repeat_count;
	}

	// Writes the log-softmax of `activations` over the alphabet into `log_probs`.
	//
	// For sequence mb and time step c the alphabet_size_ values start at
	// (mb + minibatch_ * c) * alphabet_size_ in both arrays. Only time steps
	// c < input_lengths[mb] are processed; other entries of log_probs are left
	// untouched. The maximum is subtracted before exponentiating.
	template<typename ProbT>
	void
	CpuCTC<ProbT>::log_softmax(const ProbT* const activations, ProbT* log_probs,
	                           const int* const input_lengths) {
	#pragma omp parallel for
	    for (int mb = 0; mb < minibatch_; ++mb) {
	        for (int c = 0; c < input_lengths[mb]; ++c) {
	            const int col_offset = (mb + minibatch_ * c) * alphabet_size_;
	            const ProbT* const in = activations + col_offset;
	            ProbT* const out = log_probs + col_offset;

	            ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
	            for (int r = 0; r < alphabet_size_; ++r)
	                max_activation = std::max(max_activation, in[r]);

	            ProbT denom = ProbT(0.);
	            for (int r = 0; r < alphabet_size_; ++r) {
	                denom += std::exp(in[r] - max_activation);
	            }

	            // The normaliser is the same for the whole column: compute the
	            // logarithm once instead of once per alphabet entry.
	            const ProbT log_denom = std::log(denom);

	            for (int r = 0; r < alphabet_size_; ++r) {
	                out[r] = in[r] - max_activation - log_denom;
	            }
	        }
	    }
	}

	// Cost and gradient for a single sequence.
	//
	// grad / log_probs: pointers already offset to this sequence's first column.
	// labels:           the L labels of this sequence.
	// T, L:             number of time steps and number of labels.
	// mb, bytes_used:   forwarded to CpuCTC_metadata; bytes_used is the byte
	//                   offset of this sequence's slice inside workspace_.
	//
	// Returns (cost, over_threshold). The cost is the negated forward
	// log-likelihood. over_threshold is true when the forward and backward
	// log-likelihoods differ by more than ctc_helper::threshold. When
	// L + repeats > T the function returns (0, false) and leaves grad untouched.
	template<typename ProbT>
	std::tuple<ProbT, bool>
	CpuCTC<ProbT>::cost_and_grad_kernel(ProbT *grad, const ProbT* const log_probs,
	                                    const int* const labels,
	                                    int T, int L, int mb, size_t bytes_used) {

	    const int S = 2 * L + 1; // Number of labels with blanks

	    CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);

	    if (L + ctcm.repeats > T) {
	        return std::make_tuple(ProbT(0), false); // TODO, not right to return 0
	    }

	    const ProbT llForward = compute_alphas(log_probs, ctcm.repeats, S, T,
	                                           ctcm.e_inc, ctcm.s_inc,
	                                           ctcm.labels_w_blanks, ctcm.alphas);

	    const ProbT llBackward = compute_betas_and_grad(grad, log_probs, llForward,
	                                                    ctcm.repeats, S, T,
	                                                    ctcm.e_inc, ctcm.s_inc,
	                                                    ctcm.labels_w_blanks,
	                                                    ctcm.alphas, ctcm.betas,
	                                                    ctcm.output);

	    // Both passes compute the same quantity; a large gap is reported.
	    const bool over_threshold =
	            std::abs(llForward - llBackward) > ctc_helper::threshold;

	    return std::make_tuple(-llForward, over_threshold);
	}

	// Computes forward probabilities
	//
	// log_probs: log-probabilities for this sequence; the value for label k at
	//            time t is read at k + t * (alphabet_size_ * minibatch_).
	// repeats:   number of adjacent equal labels in the original sequence.
	// S, T:      length of `labels` and number of time steps.
	// e_inc / s_inc: increments applied to the end / start of the active window.
	// labels:    label sequence with blanks inserted (S entries).
	// alphas:    S * T table, column t stored at offset t * S; only the entries
	//            inside the active window of each column are written.
	//
	// Returns the log-sum of the last column over its active window [start, end).
	template<typename ProbT>
	ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
	const int* const e_inc,
	const int* const s_inc,
	const int* const labels,
	ProbT* alphas) {

	// Active window [start, end) of the first column.
	int start = (((S /2) + repeats - T) < 0) ? 0 : 1,
	end = S > 1 ? 2 : 1;

	// Column 0 takes the log-probabilities of the window's labels directly.
	for (int i = start; i < end; ++i) {
	alphas[i] = log_probs[labels[i]];
	}

	for(int t = 1; t < T; ++t) {
	// Move the window for this column using the increment tables.
	int remain = (S / 2) + repeats - (T - t);
	if(remain >= 0)
	start += s_inc[remain];
	if(t <= (S / 2) + repeats)
	end += e_inc[t - 1];
	int startloop = start;
	// idx1: current alpha column, idx2: previous alpha column,
	// idx3: offset of time step t inside log_probs.
	int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

	// Position 0 has a single predecessor (itself in the previous column).
	if (start == 0) {
	alphas[idx1] = alphas[idx2] + log_probs[blank_label_ + idx3];
	startloop += 1;
	}

	for(int i = startloop; i < end; ++i) {
	// Predecessors i and i - 1 of the previous column.
	ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i-1) + idx2]);

	// Skip two if not on blank and not on repeat.
	if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i-2])
	prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i-2) + idx2]);

	alphas[i + idx1] = prev_sum + log_probs[labels[i] + idx3];
	}
	}

	// Accumulate the final column over its active window.
	ProbT loglike = ctc_helper::neg_inf<ProbT>();
	for(int i = start; i < end; ++i) {
	loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
	}

	return loglike;
	}

	// Starting from T, we sweep backward over the alpha array computing one column
	// of betas as we go. At each position we can update product alpha * beta and then
	// sum into the gradient associated with each label.
	// NOTE computes gradient w.r.t UNNORMALIZED final layer activations.
	// Assumed passed in grads are already zeroed!
	//
	// grad / log_probs: entry for label k at time t is at
	//                   k + t * (alphabet_size_ * minibatch_).
	// log_partition:    log-likelihood used to normalise the gradient (the caller
	//                   passes the result of compute_alphas).
	// repeats, S, T, e_inc, s_inc, labels: same meaning as in compute_alphas.
	// alphas:           S * T table from the forward pass; overwritten in place
	//                   with alpha + beta (log space) inside the active window.
	// betas:            a single column of S entries, updated in place per step.
	// output:           alphabet_size_ scratch entries, reset for every time step.
	//
	// Returns the log-sum of the betas in the active window after the last step.
	template<typename ProbT>
	ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
	                                            ProbT log_partition, int repeats,
	                                            int S, int T, const int* const e_inc,
	                                            const int* const s_inc,
	                                            const int* const labels,
	                                            ProbT* alphas,
	                                            ProbT* betas,
	                                            ProbT* output) {
	    // Active window [start, end) of the last column.
	    int start = S > 1 ? (S - 2) : 0,
	            end = (T > (S / 2) + repeats) ? S : S-1;

	    std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

	    //set the starting values in the beta column at the very right edge
	    for (int i = start; i < end; ++i) {
	        betas[i] = log_probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)];

	        //compute alpha * beta in log space at this position in (S, T) space
	        alphas[i + (T - 1) * S] += betas[i];

	        //update the gradient associated with this label
	        //essentially performing a reduce-by-key in a sequential manner
	        output[labels[i]] =
	                ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
	    }

	    //update the gradient w.r.t. each unique label
	    for (int i = 0; i < alphabet_size_; ++i) {
	        int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

	        // Labels with no accumulated mass (or -inf log-probability) get the
	        // plain softmax probability as gradient.
	        if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
	            log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
	            grad[idx3] = std::exp(log_probs[idx3]);
	        } else {
	            grad[idx3] = std::exp(log_probs[idx3])
	                         - std::exp(output[i] - log_probs[idx3] - log_partition);
	        }
	    }

	    //loop from the second to last column all the way to the left
	    for(int t = T - 2; t >= 0; --t) {
	        // Move the window backwards using the increment tables.
	        int remain = (S / 2) + repeats - (T - t);
	        if(remain >= -1)
	            start -= s_inc[remain + 1];
	        if(t < (S / 2) + repeats)
	            end -= e_inc[t];

	        // Position S - 1 has no successor at i + 1; it is handled separately
	        // after the loop.
	        int endloop = end == S ? end - 1 : end;
	        // idx1: alpha column of time t, idx3: offset of time t in log_probs/grad.
	        int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

	        std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

	        // betas[i + 1] and betas[i + 2] still hold the values of time t + 1
	        // when betas[i] is overwritten, because i increases.
	        for(int i = start; i < endloop; ++i) {
	            ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i+1)]);
	            // Skip two if not on blank and not on repeat.
	            if (labels[i] != blank_label_ && i != (S-2) && labels[i] != labels[i+2]){
	                next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i+2)]);
	            }
	            betas[i] = next_sum + log_probs[labels[i] + idx3];

	            //compute alpha * beta in log space
	            alphas[i + idx1] += betas[i];

	            //update the gradient associated with this label
	            output[labels[i]] =
	                    ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
	        }

	        // Last position: its only successor is itself at time t + 1.
	        if (end == S) {
	            betas[(S-1)] = betas[(S-1)] + log_probs[blank_label_ + idx3];
	            alphas[(S-1) + idx1] += betas[(S-1)];

	            output[labels[S-1]] =
	                    ctc_helper::log_plus<ProbT>()(alphas[S-1 + idx1], output[labels[S-1]]);
	        }

	        //go over the unique labels and compute the final grad
	        // w.r.t. each one at this time step
	        for (int i = 0; i < alphabet_size_; ++i) {

	            if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
	                log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
	                grad[idx3] = std::exp(log_probs[idx3]);
	            } else {
	                grad[idx3] = std::exp(log_probs[idx3])
	                             - std::exp(output[i] - log_probs[idx3] - log_partition);
	            }
	            ++idx3;
	        }
	    }

	    // Accumulate the betas left in the window after processing t = 0.
	    ProbT loglike = ctc_helper::neg_inf<ProbT>();
	    for(int i = start; i < end; ++i) {
	        loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
	    }

	    return loglike;
	}

	// Computes the CTC cost and gradient for every sequence of the minibatch.
	//
	// activations:   unnormalised scores; the alphabet_size_ values of sequence
	//                mb at time t start at (mb + minibatch_ * t) * alphabet_size_.
	// grads:         gradient output, indexed like activations.
	// costs:         one cost per sequence (minibatch_ entries).
	// flat_labels:   labels of all sequences, concatenated in minibatch order.
	// label_lengths: number of labels per sequence.
	// input_lengths: number of time steps per sequence.
	//
	// Workspace layout: the log-probabilities come first
	// (sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT bytes), followed by one
	// region of per_minibatch_bytes for each sequence. The workspace handed to
	// the constructor must be large enough for this layout.
	//
	// Returns CTC_STATUS_INVALID_VALUE when a pointer argument or the workspace
	// is null or when minibatch_ is not positive; CTC_STATUS_SUCCESS otherwise.
	template<typename ProbT>
	ctcStatus_t
	CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
	                             ProbT *grads,
	                             ProbT *costs,
	                             const int* const flat_labels,
	                             const int* const label_lengths,
	                             const int* const input_lengths) {
	    if (activations == nullptr ||
	        grads == nullptr ||
	        costs == nullptr ||
	        flat_labels == nullptr ||
	        label_lengths == nullptr ||
	        input_lengths == nullptr ||
	        workspace_ == nullptr
	        )
	        return CTC_STATUS_INVALID_VALUE;

	    // std::max_element on an empty range returns the end iterator, which
	    // must not be dereferenced.
	    if (minibatch_ <= 0)
	        return CTC_STATUS_INVALID_VALUE;

	    ProbT* log_probs = static_cast<ProbT *>(workspace_);

	    const int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
	    const int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
	    const int maxS = 2 * maxL + 1;

	    const size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

	    //per minibatch memory
	    // The ProbT arrays are sized with sizeof(ProbT): using sizeof(float) here
	    // makes neighbouring regions overlap whenever ProbT is wider than float.
	    size_t per_minibatch_bytes = 0;

	    //output
	    per_minibatch_bytes += sizeof(ProbT) * alphabet_size_;

	    //alphas
	    per_minibatch_bytes += sizeof(ProbT) * maxS * maxT;

	    //betas
	    per_minibatch_bytes += sizeof(ProbT) * maxS;

	    //labels w/blanks, e_inc, s_inc
	    per_minibatch_bytes += 3 * sizeof(int) * maxS;

	    log_softmax(activations, log_probs, input_lengths);

	#pragma omp parallel for
	    for (int mb = 0; mb < minibatch_; ++mb) {
	        const int T = input_lengths[mb]; // Length of utterance (time)
	        const int L = label_lengths[mb]; // Number of labels in transcription

	        // The over-threshold flag from the kernel is not used here.
	        bool mb_status;

	        std::tie(costs[mb], mb_status) =
	                cost_and_grad_kernel(grads + mb * alphabet_size_,
	                                     log_probs + mb * alphabet_size_,
	                                     // labels of sequence mb start after all earlier ones
	                                     flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
	                                     T, L, mb,
	                                     bytes_used + mb * per_minibatch_bytes);
	    }

	    return CTC_STATUS_SUCCESS;
	}

	// Computes the CTC cost of every sequence of the minibatch (forward pass
	// only, no gradient).
	//
	// activations:   unnormalised scores; the alphabet_size_ values of sequence
	//                mb at time t start at (mb + minibatch_ * t) * alphabet_size_.
	// costs:         one cost per sequence (minibatch_ entries); 0 is stored for
	//                a sequence with L + repeats > T.
	// flat_labels:   labels of all sequences, concatenated in minibatch order.
	// label_lengths: number of labels per sequence.
	// input_lengths: number of time steps per sequence.
	//
	// Workspace layout: the log-probabilities come first
	// (sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT bytes), followed by one
	// region of per_minibatch_bytes for each sequence.
	//
	// Returns CTC_STATUS_INVALID_VALUE when a pointer argument or the workspace
	// is null or when minibatch_ is not positive; CTC_STATUS_SUCCESS otherwise.
	template<typename ProbT>
	ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
	                                         ProbT* costs,
	                                         const int* const flat_labels,
	                                         const int* const label_lengths,
	                                         const int* const input_lengths) {
	    if (activations == nullptr ||
	        costs == nullptr ||
	        flat_labels == nullptr ||
	        label_lengths == nullptr ||
	        input_lengths == nullptr ||
	        workspace_ == nullptr
	        )
	        return CTC_STATUS_INVALID_VALUE;

	    // std::max_element on an empty range returns the end iterator, which
	    // must not be dereferenced.
	    if (minibatch_ <= 0)
	        return CTC_STATUS_INVALID_VALUE;

	    ProbT* log_probs = static_cast<ProbT *>(workspace_);

	    const int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
	    const int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
	    const int maxS = 2 * maxL + 1;

	    const size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

	    //per minibatch memory
	    // The ProbT arrays are sized with sizeof(ProbT): using sizeof(float) here
	    // makes neighbouring regions overlap whenever ProbT is wider than float.
	    size_t per_minibatch_bytes = 0;

	    //output
	    per_minibatch_bytes += sizeof(ProbT) * alphabet_size_;

	    //alphas
	    per_minibatch_bytes += sizeof(ProbT) * maxS * maxT;

	    //betas
	    per_minibatch_bytes += sizeof(ProbT) * maxS;

	    //labels w/blanks, e_inc, s_inc
	    per_minibatch_bytes += 3 * sizeof(int) * maxS;

	    log_softmax(activations, log_probs, input_lengths);

	#pragma omp parallel for
	    for (int mb = 0; mb < minibatch_; ++mb) {
	        const int T = input_lengths[mb]; // Length of utterance (time)
	        const int L = label_lengths[mb]; // Number of labels in transcription
	        const int S = 2*L + 1; // Number of labels with blanks

	        // Labels of sequence mb start after those of all earlier sequences.
	        CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
	                             bytes_used + mb * per_minibatch_bytes, blank_label_,
	                             flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));

	        if (L + ctcm.repeats > T) {
	            costs[mb] = ProbT(0);
	        } else {
	            costs[mb] = -compute_alphas(log_probs + mb * alphabet_size_, ctcm.repeats, S, T,
	                                        ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks,
	                                        ctcm.alphas);
	        }
	    }

	    return CTC_STATUS_SUCCESS;
	}

	} // mxnet_warpctc