/* ----------------------------------------------------------------------- *//**
 *
 * @file viterbi.cpp
 *
 * @brief Viterbi algorithm implementation for CRF inference
 *
 * This function implements the Viterbi algorithm to generate the best label
 * sequence for a document, given the state emission and state transition
 * matrices.
 *
 *//* ----------------------------------------------------------------------- */
#include <dbconnector/dbconnector.hpp>

#include <algorithm>
#include <cmath>
#include <cstring>

#include "viterbi.hpp"

namespace madlib {
namespace modules {
namespace crf {
using namespace dbal::eigen_integration;
using madlib::dbconnector::postgres::madlib_construct_array;
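
/* Layout sketch for the two score arrays (an assumption inferred from the
 * indexing in run() below, not a separate API of this module; the helper names
 * are introduced here only for exposition and are not called by run()):
 * rArray packs doc_len x numLabels single-state (emission) scores row by row,
 * and mArray packs (numLabels + 2) x numLabels scores, where row 0 holds the
 * start-feature scores, rows 1..numLabels the transition scores out of each
 * previous label, and the last row the end-feature scores.
 */
namespace {

inline double start_score(const double* m, int label) {
    return m[label];                                   // row 0: start feature
}

inline double emission_score(const double* r, int pos, int label, int numLabels) {
    return r[pos * numLabels + label];                 // single-state score
}

inline double transition_score(const double* m, int prev_label, int label,
                               int numLabels) {
    return m[(prev_label + 1) * numLabels + label];    // prev_label -> label
}

inline double end_score(const double* m, int label, int numLabels) {
    return m[(numLabels + 1) * numLabels + label];     // last row: end feature
}

} // anonymous namespace
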
AnyType vcrf_top1_label::run(AnyType& args) {
    /* mArray holds the transition scores (including the start- and end-feature
     * rows, see the layout sketch above); rArray holds the per-token emission
     * scores. */
    ArrayHandle<double> mArray = args[0].getAs<ArrayHandle<double> >();
    ArrayHandle<double> rArray = args[1].getAs<ArrayHandle<double> >();
    const int32_t numLabels = args[2].getAs<int32_t>();

    if (numLabels == 0)
        throw std::invalid_argument("Number of labels cannot be zero");

    int doc_len = static_cast<int>(rArray.size() / numLabels);
    if (doc_len == 0)
        throw std::invalid_argument("Document length cannot be zero");

    /* Dynamic-programming buffers: best (top-1) scores and normalization scores
     * for the previous and the current token, plus the back-pointer table. */
    double* prev_top1_array = new double[numLabels];
    double* curr_top1_array = new double[numLabels];
    double* prev_norm_array = new double[numLabels];
    double* curr_norm_array = new double[numLabels];
    int* path = new int[doc_len*numLabels];

    memset(prev_top1_array, 0, numLabels*sizeof(double));
    memset(prev_norm_array, 0, numLabels*sizeof(double));
    memset(path, 0, doc_len*numLabels*sizeof(int));
    for(int start_pos = 0; start_pos < doc_len; start_pos++) {
        memset(curr_top1_array, 0, numLabels*sizeof(double));
        memset(curr_norm_array, 0, numLabels*sizeof(double));

        if (start_pos == 0) {
            /* first token in a sentence: emission score plus the start-feature
             * score (row 0 of mArray) */
            for (int label = 0; label < numLabels; label++) {
                curr_norm_array[label] = rArray[label] + mArray[label];
                curr_top1_array[label] = rArray[label] + mArray[label];
            }
        } else {
            for (int curr_label = 0; curr_label < numLabels; curr_label++) {
                for (int prev_label = 0; prev_label < numLabels; prev_label++) {
                    /* best-path score: best path ending in prev_label, extended
                     * with the emission score of curr_label and the transition
                     * score prev_label -> curr_label */
                    double top1_new_score = prev_top1_array[prev_label]
                            + rArray[start_pos*numLabels + curr_label]
                            + mArray[(prev_label+1)*numLabels + curr_label];

                    /* last token in a sentence: the end feature is fired */
                    if (start_pos == doc_len - 1)
                        top1_new_score += mArray[(numLabels+1)*numLabels + curr_label];

                    if (top1_new_score > curr_top1_array[curr_label]) {
                        curr_top1_array[curr_label] = top1_new_score;
                        path[start_pos*numLabels + curr_label] = prev_label;
                    }

                    /* accumulate the normalization score used later to compute
                     * the probability of the best label sequence */
                    double norm_new_score = prev_norm_array[prev_label]
                            + rArray[start_pos * numLabels + curr_label]
                            + mArray[(prev_label+1)*numLabels + curr_label];

                    /* last token in a sentence: the end feature is fired */
                    if (start_pos == doc_len - 1)
                        norm_new_score += mArray[(numLabels+1)*numLabels + curr_label];
                    /* The following computes z = log(exp(x) + exp(y)) via the
                     * equivalent, cheaper form
                     *     z = min(x,y) + log(exp(|x-y|) + 1).
                     * Scores are log values scaled by 1000, hence the /1000.0
                     * and *1000.0; the 0.5 rounds the result to the nearest
                     * integer on that scale.
                     */
                    if (curr_norm_array[curr_label] == 0)
                        curr_norm_array[curr_label] = norm_new_score;
                    else {
                        double x = curr_norm_array[curr_label];
                        double y = norm_new_score;
                        curr_norm_array[curr_label] = std::min(x,y) +
                            static_cast<double>(std::log(std::exp(std::abs(y-x)/1000.0) + 1)*1000.0 + 0.5);
                    }
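                    /* A short check of the identity above (a restatement of the
                     * comment, not additional logic): assuming x >= y,
                     *     min(x,y) + log(exp(|x-y|) + 1)
                     *       = y + log((exp(x) + exp(y)) / exp(y))
                     *       = log(exp(x) + exp(y)),
                     * so the else branch folds each new score into a running
                     * scaled log-sum-exp for curr_label.
                     */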
                }
            }
        }

        /* carry this token's scores over to the next position */
        for (int label = 0; label < numLabels; label++) {
            prev_top1_array[label] = curr_top1_array[label];
            prev_norm_array[label] = curr_norm_array[label];
        }
    }

    /* find the label of the last token in a sentence */
    double max_score = 0.0;
    int top1_label = 0;
    for(int label = 0; label < numLabels; label++) {
        if(curr_top1_array[label] > max_score) {
            max_score = curr_top1_array[label];
            top1_label = label;
        }
    }

    /* Define the result array with doc_len+1 elements, where the first doc_len
     * elements store the best label of each token and the last element stores
     * the (scaled) conditional probability of the sequence.
     */
    // FIXME: construct_array functions circumvent the abstraction layer. These
    // should be replaced with appropriate Allocator:: calls.
    MutableArrayHandle<int> result(
        madlib_construct_array(
            NULL, doc_len + 1, INT4OID, sizeof(int32_t), true, 'i'));
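    /* A reading sketch for the result array (hypothetical values, purely for
     * illustration): for a 4-token document, result = {2, 0, 0, 1, 372000}
     * means tokens 0..3 are labeled 2, 0, 0, 1 and the best label sequence has
     * conditional probability 372000 / 1000000 = 0.372.
     */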

    /* trace back through the path table to recover the labels of the remaining
     * tokens in the sentence */
    result[doc_len - 1] = top1_label;
    for (int pos = doc_len - 1; pos >= 1; pos--) {
        top1_label = path[pos * numLabels + top1_label];
        result[pos-1] = top1_label;
    }

    /* Fold the normalization scores of all labels at the last position into the
     * scaled log partition function:
     *     norm_factor = 1000 * log( sum_i exp(curr_norm_array[i] / 1000) ),
     * again using z = log(exp(x)+exp(y)) = min(x,y) + log(exp(|x-y|)+1).
     */
    double norm_factor = 0.0;
    for (int i = 0; i < numLabels; i++) {
        if (i==0)
            norm_factor = curr_norm_array[0];
        else {
            double x = curr_norm_array[i];
            double y = norm_factor;
            norm_factor = std::min(x,y) +
                static_cast<double>(std::log(std::exp(std::abs(y-x)/1000.0) + 1)*1000.0 + 0.5);
        }
    }

    /* Calculate the conditional probability of the best label sequence. To
     * return it through the integer result array, it is first multiplied by
     * 1000000; the caller divides the value by 1000000 to recover the real
     * conditional probability.
     */
    result[doc_len] = static_cast<int>(std::exp((max_score - norm_factor)/1000.0)*1000000);
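    /* Worked example with made-up numbers (illustrative only): if the best path
     * has max_score = 2000 and norm_factor = 2693, then
     * exp((2000 - 2693) / 1000) = exp(-0.693) ~= 0.5, so result[doc_len] is
     * roughly 500000, i.e. a conditional probability of about 0.5.
     */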

    delete[] prev_top1_array;
    delete[] curr_top1_array;
    delete[] prev_norm_array;
    delete[] curr_norm_array;
    delete[] path;

    return result;
}

} // namespace crf
} // namespace modules
} // namespace madlib