src/modules/crf/linear_crf.cpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file linear_crf.cpp
  *
  * @brief Linear-chain Conditional Random Field functions
  *
  * We implement limited-memory BFGS method.
  *
  *//* ----------------------------------------------------------------------- */
 #include <iostream>
 #include <dbconnector/dbconnector.hpp>
 #include <modules/shared/HandleTraits.hpp>
 #include "linear_crf.hpp"
 namespace madlib {

 // Use Eigen
 using namespace dbal::eigen_integration;

 namespace modules {

 // Import names from other MADlib modules
 using dbal::NoSolutionFoundException;

 namespace crf {

 // Internal functions
 AnyType stateToResult(const Allocator &inAllocator,
                       const HandleMap<const ColumnVector,
                       TransparentHandle<double> > &incoef,
                       double loglikelihood, const uint32_t &iterations);

 /**
  * @brief Inter- and intra-iteration state for lbfgs method for
  * linear-chain conditional random field
  *
  * TransitionState encapsualtes the transition state during the
  * linear-chain crf aggregate function. To the database, the state is
  * exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper
  * object containing scalars and vectors.
  *
  * Note: We assume that the DOUBLE PRECISION array is initialized by the
  * database with length at least 58, and all elemenets are 0.
  */
 template <class Handle>
 class LinCrfLBFGSTransitionState {
     template <class OtherHandle>
     friend class LinCrfLBFGSTransitionState;

 public:
     LinCrfLBFGSTransitionState(const AnyType &inArray)
         : mStorage(inArray.getAs<Handle>()) {

         rebind(static_cast<uint32_t>(mStorage[1]));
     }

     /**
      * @brief Convert to backend representation
      *
      * We define this function so that we can use State in the
      * argument list and as a return type.
      */
     inline operator AnyType() const {
         return mStorage;
     }

     /**
      * @brief Initialize the lbfgs state.
      *
      * This function is only called for the first row.
      */
     inline void initialize(const Allocator &inAllocator, uint32_t inWidthOfX,
                             uint32_t tagSize) {
         mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
         dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(inWidthOfX));
         rebind(inWidthOfX);
         num_features = inWidthOfX;
         num_labels =  tagSize;
         if(iteration == 0)
             diag.fill(1);
     }

     /**
      * @brief We need to support assigning the previous state
      */
     template <class OtherHandle>
     LinCrfLBFGSTransitionState &operator=(
         const LinCrfLBFGSTransitionState<OtherHandle> &inOtherState) {

         for (size_t i = 0; i < mStorage.size(); i++)
             mStorage[i] = inOtherState.mStorage[i];
         return *this;
     }

     /**
      * @brief Merge with another State object by copying the intra-iteration
      * fields
      */
     template <class OtherHandle>
     LinCrfLBFGSTransitionState &operator+=(
         const LinCrfLBFGSTransitionState<OtherHandle> &inOtherState) {
         if (mStorage.size() != inOtherState.mStorage.size())
             throw std::logic_error("Internal error: Incompatible transition "
                                    "states");
         numRows += inOtherState.numRows;
         grad += inOtherState.grad;
         loglikelihood += inOtherState.loglikelihood;
         return *this;
     }

     /**
      * @brief Reset the inter-iteration fields.
      */
     inline void reset() {
         numRows = 0;
         grad.fill(0);
         loglikelihood = 0;
     }

     static const int m=7;// The number of corrections used in the LBFGS update.

 private:
     static inline uint32_t arraySize(const uint32_t num_features) {
         return 52 + 3 * num_features + num_features*(2*m+1)+2*m;
     }

     void rebind(uint32_t inWidthOfFeature) {
         iteration.rebind(&mStorage[0]);
         num_features.rebind(&mStorage[1]);
         num_labels.rebind(&mStorage[2]);
         coef.rebind(&mStorage[3], inWidthOfFeature);
         diag.rebind(&mStorage[3 + inWidthOfFeature], inWidthOfFeature);
         grad.rebind(&mStorage[3 + 2 * inWidthOfFeature], inWidthOfFeature);
         ws.rebind(&mStorage[3 + 3 * inWidthOfFeature],
                                     inWidthOfFeature*(2*m+1)+2*m);
         numRows.rebind(&mStorage[3 + 3 * inWidthOfFeature +
                                     inWidthOfFeature*(2*m+1)+2*m]);
         loglikelihood.rebind(&mStorage[4 + 3 * inWidthOfFeature +
                                     inWidthOfFeature*(2*m+1)+2*m]);
         lbfgs_state.rebind(&mStorage[5 + 3 * inWidthOfFeature +
                                     inWidthOfFeature*(2*m+1)+2*m], 21);
         mcsrch_state.rebind(&mStorage[26 + 3 * inWidthOfFeature +
                                     inWidthOfFeature*(2*m+1)+2*m], 25);
     }
     Handle mStorage;

 public:
     typename HandleTraits<Handle>::ReferenceToUInt32 iteration;
     typename HandleTraits<Handle>::ReferenceToUInt32 num_features;
     typename HandleTraits<Handle>::ReferenceToUInt32 num_labels;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap coef;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap diag;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap grad;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap ws;

     typename HandleTraits<Handle>::ReferenceToUInt64 numRows;
     typename HandleTraits<Handle>::ReferenceToDouble loglikelihood;

     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap lbfgs_state;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap mcsrch_state;
 };


 /** This class contains code for the limited-memory Broyden-Fletcher-Goldfarb-Shanno
  * (LBFGS) algorithm for large-scale multidimensional unconstrained minimization problems.
  * This following class is a translation of Fortran code written by Jorge Nocedal.
  */
 class LBFGS {
 public:
     // shared variable in lbfgs
     double stp1, ftol, stp, sq, yr, beta;
     // iflag A return with <code>iflag &lt; 0</code> indicates an error,
     // and <code>iflag = 0</code> indicates that the routine has
     // terminated without detecting errors. On a return with
     // <code>iflag = 1</code>, the user must evaluate the function
     // <code>f</code> and gradient <code>g</code>. On a return with
     // iflag is negative , lbfgs failed

     int  iflag, iter, nfun, point, ispt, iypt, maxfev, info, bound, npt, cp, nfev, inmc, iycn, iscn;
     // shared varibles in mcscrch
     int infoc;
     double dg, dgm, dginit, dgtest, dgx, dgxm, dgy, dgym, finit, ftest1, fm, fx, fxm, fy, fym, p5, p66, stx, sty, stmin, stmax, width, width1, xtrapf;
     bool brackt, stage1, finish;

     ColumnVector w;//
     ColumnVector x;// solution vector
     ColumnVector diag;

     LBFGS(LinCrfLBFGSTransitionState<MutableArrayHandle<double> >&);
     void save_state(LinCrfLBFGSTransitionState<MutableArrayHandle<double> > &state);
     void mcstep (double&, double& , double&, double&, double& , double&, double&, double, double, bool&, double, double, int&);
     void mcsrch (int, Eigen::VectorXd&, double, Eigen::VectorXd&, const Eigen::VectorXd&, double&, double, double, int, int&, int&, Eigen::VectorXd&);
     void lbfgs(int, int, double, Eigen::VectorXd, double, double);
 };

 /**
  *@brief initialize state of current lbfgs iteration with the state of last iteration
  */
 LBFGS::LBFGS(LinCrfLBFGSTransitionState<MutableArrayHandle<double> >& state) {
     w = state.ws;
     diag = state.diag;
     x = state.coef;

     stp1 = state.lbfgs_state(0);
     ftol = state.lbfgs_state(1);
     stp = state.lbfgs_state(2);
     sq = state.lbfgs_state(3);
     yr = state.lbfgs_state(4);
     beta = state.lbfgs_state(5);
     iflag = static_cast<int>(state.lbfgs_state(6));
     iter = static_cast<int>(state.lbfgs_state(7));
     nfun = static_cast<int>(state.lbfgs_state(8));
     point = static_cast<int>(state.lbfgs_state(9));
     ispt = static_cast<int>(state.lbfgs_state(10));
     iypt = static_cast<int>(state.lbfgs_state(11));
     maxfev = static_cast<int>(state.lbfgs_state(12));
     info = static_cast<int>(state.lbfgs_state(13));
     bound = static_cast<int>(state.lbfgs_state(14));
     npt = static_cast<int>(state.lbfgs_state(15));
     cp = static_cast<int>(state.lbfgs_state(16));
     nfev = static_cast<int>(state.lbfgs_state(17));
     inmc = static_cast<int>(state.lbfgs_state(18));
     iycn = static_cast<int>(state.lbfgs_state(19));
     iscn = static_cast<int>(state.lbfgs_state(20));

     infoc = static_cast<int>(state.mcsrch_state(0));
     dg = state.mcsrch_state(1);
     dgm = state.mcsrch_state(2);
     dginit = state.mcsrch_state(3);
     dgtest = state.mcsrch_state(4);
     dgx = state.mcsrch_state(5);
     dgxm = state.mcsrch_state(6);
     dgy = state.mcsrch_state(7);
     dgym = state.mcsrch_state(8);
     finit = state.mcsrch_state(9);
     ftest1 = state.mcsrch_state(10);
     fm = state.mcsrch_state(11);
     fx = state.mcsrch_state(12);
     fxm = state.mcsrch_state(13);
     fy = state.mcsrch_state(14);
     fym = state.mcsrch_state(15);
     stx = state.mcsrch_state(16);
     sty = state.mcsrch_state(17);
     stmin = state.mcsrch_state(18);
     stmax = state.mcsrch_state(19);
     width = state.mcsrch_state(20);
     width1 = state.mcsrch_state(21);
     brackt = (state.mcsrch_state(22) == 1.0 ? true: false);
     stage1 = (state.mcsrch_state(23) == 1.0 ? true: false);
     finish = (state.mcsrch_state(24) == 1.0 ? true: false);
 }

 /**
  *@brief save current lbfgs state for the next lbfgs iteration
  */
 void LBFGS::save_state(LinCrfLBFGSTransitionState<MutableArrayHandle<double> > &state) {
     state.ws = w ;
     state.diag = diag ;
     state.coef = x ;

     state.lbfgs_state(0) = stp1;
     state.lbfgs_state(1) = ftol;
     state.lbfgs_state(2) = stp;
     state.lbfgs_state(3) = sq;
     state.lbfgs_state(4) = yr;
     state.lbfgs_state(5) = beta;
     state.lbfgs_state(6) = iflag;
     state.lbfgs_state(7) = iter;
     state.lbfgs_state(8) = nfun;
     state.lbfgs_state(9) = point;
     state.lbfgs_state(10) = ispt;
     state.lbfgs_state(11) = iypt;
     state.lbfgs_state(12) = maxfev;
     state.lbfgs_state(13) = info;
     state.lbfgs_state(14) = bound;
     state.lbfgs_state(15) = npt;
     state.lbfgs_state(16) = cp;
     state.lbfgs_state(17) = nfev;
     state.lbfgs_state(18) = inmc;
     state.lbfgs_state(19) = iycn;
     state.lbfgs_state(20) = iscn;

     state.mcsrch_state(0) = infoc;
     state.mcsrch_state(1) =  dg;
     state.mcsrch_state(2) = dgm;
     state.mcsrch_state(3) = dginit;
     state.mcsrch_state(4) = dgtest;
     state.mcsrch_state(5) = dgx;
     state.mcsrch_state(6) = dgxm;
     state.mcsrch_state(7) = dgy;
     state.mcsrch_state(8) = dgym;
     state.mcsrch_state(9) = finit;
     state.mcsrch_state(10) = ftest1;
     state.mcsrch_state(11) = fm;
     state.mcsrch_state(12) = fx;
     state.mcsrch_state(13) = fxm;
     state.mcsrch_state(14) = fy;
     state.mcsrch_state(15) = fym;
     state.mcsrch_state(16) = stx;
     state.mcsrch_state(17) = sty;
     state.mcsrch_state(18) = stmin;
     state.mcsrch_state(19) = stmax;
     state.mcsrch_state(20) = width;
     state.mcsrch_state(21) = width1;
     state.mcsrch_state(22) = (brackt == true ? 1.0 : 0.0);
     state.mcsrch_state(23) = (stage1 == true ? 1.0 : 0.0);
     state.mcsrch_state(24) = (finish == true ? 1.0 : 0.0);
 }
 void LBFGS::mcstep(double& stx, double& fx, double& dx,
                    double& sty, double& fy, double& dy,
                    double& stp, double fp, double dp, bool& brackt,
                    double stmin, double stmax, int& info)
 {
     bool bound;
     double gamma, p, q, r, sgnd, stpc, stpf, stpq, theta, s;

     info = 0;

     if ((brackt && ((stp <= std::min(stx, sty)) || (stp >= std::max(stx, sty)))) ||
             (dx * (stp - stx) >= 0) || (stmax < stmin)) {
         return;
     }

     sgnd = dp*(dx/fabs(dx));
     if (fp > fx) {
         info = 1;
         bound = true;
         theta = 3.0 * (fx - fp) / (stp - stx) + dx + dp;
         s = std::max(fabs(theta), std::max(fabs(dx), fabs(dp)));
         gamma = s * sqrt((theta / s) * (theta / s) - (dx / s) * (dp / s));
         if (stp < stx) {
             gamma = -gamma;
         }
         p = gamma - dx + theta;
         q = gamma - dx + gamma + dp;
         r = p / q;
         stpc = stx + r * (stp - stx);
         stpq = stx + dx/((fx - fp)/(stp - stx) + dx)/2 * (stp - stx);
         if (fabs(stpc - stx) < fabs(stpq - stx)) {
             stpf = stpc;
         } else {
             stpf = stpc + (stpq - stpc)/2;
         }
         brackt = true;

     } else if (sgnd < 0.0) {
         info = 2;
         bound = false;
         theta = 3.0 * (fx - fp) / (stp - stx) + dx + dp;
         s = std::max(fabs(theta), std::max(fabs(dx), fabs(dp)));
         gamma = s * sqrt((theta / s) * (theta / s) - (dx / s) * (dp / s));
         if (stp > stx) {
             gamma = -gamma;
         }
         p = gamma - dp + theta;
         q = gamma - dp + gamma + dx;
         r = p / q;
         stpc = stp + r * (stx - stp);
         stpq = stp + dp / (dp - dx) * (stx - stp);
         stpf = (fabs(stpc - stp) > fabs(stpq - stp)) ? stpc : stpq;
         brackt = true;

     } else if (fabs(dp) < fabs(dx)) {
         info = 3;
         bound = true;
         theta = 3.0 * (fx - fp) / (stp - stx) + dx + dp;
         s = std::max(fabs(theta), std::max(fabs(dx), fabs(dp)));
         gamma = s * sqrt(std::max(0.0, (theta/s)*(theta/s) - (dx/s)*(dp/s)));
         if (stp > stx) {
             gamma = -gamma;
         }
         p = gamma - dp + theta;
         q = gamma + (dx - dp) + gamma;
         r = p / q;
         if ((r < 0.0) && (gamma != 0.0)) {
             stpc = stp + r * (stx - stp);
         } else {
             stpc = (stp > stx) ? stmax : stmin;
         }

         stpq = stp + dp / (dp - dx) * (stx - stp);
         if (brackt) {
             stpf = (fabs(stp - stpc) < fabs(stp - stpq)) ? stpc : stpq;
         } else {
             stpf = (fabs(stp - stpc) > fabs(stp - stpq)) ? stpc : stpq;
         }

     } else {
         info = 4;
         bound = false;
         if (brackt) {
             theta = 3.0 * (fp - fy) / (sty - stp) + dy + dp;
             s = std::max(fabs(theta), std::max(fabs(dy), fabs(dp)));
             gamma = s * sqrt((theta/s)*(theta/s) - (dy/s)*(dp/s));
             if (stp > sty) {
                 gamma = -gamma;
             }
             p = gamma - dp + theta;
             q = gamma - dp + gamma + dy;
             r = p / q;
             stpc = stp + r * (sty - stp);
             stpf = stpc;
         } else {
             stpf = (stp > stx) ? stmax : stmin;
         }
     }

     if (fp > fx) {
         sty = stp;
         fy = fp;
         dy = dp;
     } else {
         if (sgnd < 0.0) {
             sty = stx;
             fy = fx;
             dy = dx;
         }
         stx = stp;
         fx = fp;
         dx = dp;
     }

     stp = std::max(stmin, std::min(stmax, stpf));
     if (brackt && bound) {
         if (sty > stx) {
             stp = std::min(stx + 0.66*(sty - stx), stp);
         } else {
             stp = std::max(stx + 0.66*(sty - stx), stp);
         }
     }

     return;
 }

 void LBFGS::mcsrch(int n, Eigen::VectorXd& x, double f, Eigen::VectorXd& g,
                     const Eigen::VectorXd& s, double& stp, double ftol,
                     double xtol, int maxfev, int& info, int& nfev,
                     Eigen::VectorXd& wa)
 {
     double stpmin = 1e-20;
     double stpmax = 1e20;
     double p5 = 0.5;
     double p66 = 0.66;
     double xtrapf = 4.0;
     double gtol = 0.9;

     if(info != -1) {
         infoc = 1;
         if (n <= 0 || stp <= 0 || ftol < 0 || gtol < 0 || xtol < 0 ||
                     stpmin < 0 || stpmax < stpmin || maxfev <= 0 )
             return;

         dginit = g.dot(s);
         if (dginit >= 0.0) {
             std::cout << "The search direction is not a descent direction."
                       << std::endl;
             return;
         }

         brackt = false;
         stage1 = true;
         nfev = 0;
         finit = f;
         dgtest = ftol * dginit;
         width = stpmax - stpmin;
         width1 = width/p5;

         wa = x;

         stx = 0.0;
         fx = finit;
         dgx = dginit;
         sty = 0.0;
         fy = finit;
         dgy = dginit;
     }

     while(true)
     {
         if(info != -1)
         {
             if (brackt) {
                 if (stx < sty) {
                     stmin = stx;
                     stmax = sty;
                 } else {
                     stmin = sty;
                     stmax = stx;
                 }
             } else {
                 stmin = stx;
                 stmax = stp + xtrapf * (stp - stx);
             }

             stp = std::max(stpmin, std::min(stpmax, stp));

             if ((brackt && ((stp <= stmin) || (stp >= stmax))) ||
                     (nfev == maxfev - 1) ||
                     (!infoc) || (brackt && ((stmax - stmin) <= xtol * stmax))) {
                 stp = stx;
             }

             x = wa + stp * s;

             info = -1;
             return;
         }
         info = 0;
         nfev= nfev + 1;
         dg = g.dot(s);
         ftest1 = finit + stp * dgtest;

         if ((brackt && ((stp <= stmin) || (stp >= stmax))) || (!infoc)) {
             info = 6;
         }
         if ((stp == stpmax) && (f <= ftest1) && (dg <= dgtest)) {
             info = 5;
         }
         if ((stp == stpmin) && ((f >= ftest1) || (dg >= dgtest))) {
             info = 4;
         }
         if (nfev >= maxfev) {
             info = 3;
         }
         if (brackt && (stmax - stmin <= xtol * stmax)) {
             info = 2;
         }
         if ((f <= ftest1) && (fabs(dg) <= -gtol * dginit)) {
             info = 1;
         }
         if (info !=0 )
             return ;


         if ( stage1 && f <= ftest1 && dg >= std::min(ftol , gtol) * dginit )
             stage1 = false;

         if (stage1 && f <= fx && f > ftest1) {
             fm = f - stp * dgtest;
             fxm = fx - stx * dgtest;
             fym = fy - sty * dgtest;
             dgm = dg - dgtest;
             dgxm = dgx - dgtest;
             dgym = dgy - dgtest;
             mcstep(stx, fxm, dgxm, sty, fym, dgym, stp, fm, dgm, brackt,
                     stmin, stmax, infoc);
             fx = fxm + stx * dgtest;
             fy = fym + sty * dgtest;
             dgx = dgxm + dgtest;
             dgy = dgym + dgtest;
         } else {
             mcstep(stx, fx, dgx, sty, fy, dgy, stp, f, dg, brackt, stmin,
                     stmax, infoc);
         }

         if (brackt) {
             if (fabs(sty - stx) >= p66 * width1) {
                 stp = stx + p5 * (sty - stx);
             }
             width1 = width;
             width = fabs(sty - stx);
         }
     }
 }


 void LBFGS::lbfgs(int n, int m, double f, Eigen::VectorXd g, double eps,
                     double xtol)
 {
     bool execute_entire_while_loop = false;
     if(iflag == 0) {
         iter=0;
         if ( n <= 0 || m <= 0 )
         {
             iflag= -3;
         }

         nfun= 1;
         point = 0;
         finish = false;
         ispt = n + 2*m;
         iypt = ispt + n*m;
         npt = 0;
         w.segment(ispt, n) = (-g).cwiseProduct(diag);

         stp1 = 1.0 / g.norm();
         ftol= 0.0001;
         maxfev= 20;
         execute_entire_while_loop = true;
     }
     while(true) {
         if(execute_entire_while_loop) {
             iter++;
             info = 0;
             bound=iter-1;
             if (iter!=1) {
                 if (iter > m) bound = m;
                 double ys = w.segment(iypt + npt, n).dot(w.segment(ispt + npt, n));
                 double yy = w.segment(iypt + npt, n).squaredNorm();
                 diag.setConstant(ys / yy);
                 cp = point;
                 if (point ==0 ) cp =m;
                 w[n + cp-1] = 1.0 / ys;
                 w.head(n) = -g;
                 cp = point;
                 for (int i = 0; i < bound; i++) {
                     cp -= 1;
                     if (cp == -1) {
                         cp = m - 1;
                     }
                     sq = w.segment(ispt + cp *n,n).dot(w.head(n));
                     inmc = n + m + cp;
                     iycn = iypt + cp * n;
                     w[inmc] = sq * w[n + cp];
                     w.head(n) -= w[inmc] * w.segment(iycn, n);
                 }
                 w.head(n)=w.head(n).cwiseProduct(diag);

                 for (int i = 0; i < bound; i++) {
                     yr = w.segment(iypt + cp * n, n).dot(w.head(n));
                     inmc = n + m + cp;
                     beta = w[inmc] - w[n + cp] * yr;
                     iscn = ispt + cp * n;
                     w.head(n) += beta * w.segment(iscn, n);
                     cp += 1;
                     if (cp == m) {
                         cp = 0;
                     }
                 }
                 w.segment(ispt + point * n, n) = w.head(n);
             }
             nfev = 0;
             stp = (iter == 1) ? stp1 : 1.0;
             w.head(n) = g;
         }
         mcsrch(n, x, f, g, w.segment(ispt + point * n, n), stp, ftol,
                 xtol, maxfev, info, nfev, diag);
         if(info == -1) {
             iflag = 1;
             return;
         } else {
             iflag = -1;
         }
         nfun = nfun + nfev;
         npt = point * n;
         w.segment(ispt + npt,n) *=stp;
         w.segment(iypt + npt,n) =g - w.head(n);
         point = point + 1;
         if (point == m) {
             point = 0;
         }
         if(g.norm()/std::max(1.0,x.norm())<=eps) {
             finish = true;
         }
         if (finish) {
             iflag = 0;
             return;
         }
         execute_entire_while_loop = true;
     }
 }

 /**
  *@brief compute exponential of Mi and Vi
  */
 void compute_exp_Mi(int num_labels, Eigen::MatrixXd &Mi, Eigen::VectorXd &Vi) {
     // take exponential operator
     for (int m = 0; m < num_labels; m++) {
         Vi(m) = std::exp(Vi(m));
         for (int n = 0; n < num_labels; n++) {
             Mi(m, n) = std::exp(Mi(m, n));
         }
     }
 }

 Eigen::VectorXd mult(Eigen::MatrixXd Mi, Eigen::VectorXd Vi, bool trans,
                      int num_label)
 {
     int i=0,j=0,r=0,c=0;
     Eigen::VectorXd z(num_label);
     z.fill(0);
     for (i = 0; i < num_label; i++ )
     {
         for (j=0; j< num_label; j++) {
             r= i;
             c=j;
             if(trans) {
                 r= j;
                 c=i;
             }
             z(r)+=Mi(i,j)*Vi(c);
         }
     }
     return z;
 }


 /**
  *@brief compute loglikelihood and gradient using forward-backward algorithm
  */
 void validate_label(int label_id, int num_labels)
 {
     if ((label_id < 0) || (label_id >= num_labels))
         throw std::runtime_error("Out of bound label ids found in feature table.");
 }

 void compute_logli_gradient(LinCrfLBFGSTransitionState<MutableArrayHandle<double> >& state,
                             MappedColumnVector& sparse_r,
                             MappedColumnVector& dense_m,
                             MappedColumnVector& sparse_m) {
     int r_size = static_cast<int>(sparse_r.size());
     int sparse_m_size = static_cast<int>(sparse_m.size());
     int seq_len = static_cast<int>(sparse_r(r_size-2)) + 1;

     Eigen::MatrixXd betas(static_cast<uint32_t>(state.num_labels), seq_len);
     Eigen::VectorXd scale(seq_len);
     Eigen::MatrixXd Mi(static_cast<uint32_t>(state.num_labels),
                                     static_cast<uint32_t>(state.num_labels));
     Eigen::VectorXd Vi(state.num_labels);
     Eigen::VectorXd alpha(state.num_labels);
     Eigen::VectorXd next_alpha(state.num_labels);
     Eigen::VectorXd temp(state.num_labels);
     Eigen::VectorXd ExpF(state.num_features);
     betas.fill(0);
     scale.fill(0);
     alpha.fill(1);
     next_alpha.fill(0);
     temp.fill(0);
     ExpF.fill(0);

     // compute beta values in a backward fashion
     // also scale beta-values to 1 to avoid numerical problems
     scale(seq_len - 1) = state.num_labels;
     betas.col(seq_len - 1).fill(1.0 / scale(seq_len - 1));

     int index = r_size-1;
     for (int i = seq_len - 1; i > 0; i--) {
         Mi.fill(0);
         Vi.fill(0);
         // examine all features at position "pos"
         //(prev_labe, curr_label, f_index, start_pos, exist)
         while (index-4>=0 && sparse_r(index-1) == i) {
             int curr_label =  static_cast<int>(sparse_r(index-3));
             validate_label(curr_label, state.num_labels);
             int f_index =  static_cast<int>(sparse_r(index-2));
             Vi(curr_label) += state.coef(f_index);
             index-=5;
         }
         //(f_index, prev_label, curr_label)
         for(int n=0; n+2<sparse_m_size ; n+=3) {
             int prev_label = static_cast<int>(sparse_m(n+1));
             int curr_label = static_cast<int>(sparse_m(n+2));
             validate_label(prev_label, state.num_labels);
             validate_label(curr_label, state.num_labels);
             Mi(prev_label, curr_label) += state.coef(static_cast<int>(sparse_m(n)));
         }

         compute_exp_Mi(state.num_labels, Mi, Vi);

         temp=betas.col(i);
         temp=temp.cwiseProduct(Vi);
         betas.col(i -1) = mult(Mi,temp,false,state.num_labels);
         // scale for the next (backward) beta values
         scale(i - 1)=betas.col(i-1).sum();
         betas.col(i - 1)*=(1.0 / scale(i - 1));
     } // end of beta values computation

     index = 0;
     // start to compute the log-likelihood of the current sequence
     for (int j = 0; j < seq_len; j++) {
         Mi.fill(0);
         Vi.fill(0);
         // examine all features at position "pos"
         int ori_index = index;

         while (((index+4) <= (r_size-1)) && sparse_r(index+3) == j) {
             int curr_label =  static_cast<int>(sparse_r(index+1));
             int f_index =  static_cast<int>(sparse_r(index+2));
             Vi(curr_label) += state.coef(f_index);
             index+=5;
         }
         if(j>=1) {
             for(int n=0; n+2<sparse_m_size ; n+=3) {
                 int prev_label = static_cast<int>(sparse_m(n+1));
                 int curr_label = static_cast<int>(sparse_m(n+2));
                 Mi(prev_label, curr_label) += state.coef(static_cast<int>(sparse_m(n)));
             }
         }

         compute_exp_Mi(state.num_labels, Mi, Vi);

         if(j>0) {
             temp = alpha;
             next_alpha=mult(Mi,temp,true,state.num_labels);
             next_alpha=next_alpha.cwiseProduct(Vi);
         } else {
             next_alpha=Vi;
         }


         index = ori_index;
         while (((index+4) <= (r_size-1)) && sparse_r(index+3) == j) {
             int curr_label =  (int)sparse_r(index+1);
             int f_index =  (int)sparse_r(index+2);
             int exist = (int)sparse_r(index+4);
             if (exist == 1) {
                 state.grad(f_index) += 1;
                 state.loglikelihood += state.coef(f_index);
             }
             ExpF(f_index) += next_alpha(curr_label) * betas(curr_label,j);

             index+=5;
         }
         // Edge feature
         if(j>=1) {
             int f_index = (int)dense_m((j-1)*5+2);
             state.grad(f_index) += 1;
             state.loglikelihood += state.coef(f_index);
             //(f_index, prev_label, curr_label)
             for(int n=0; n+2<sparse_m_size ; n+=3) {
                 int f_index = (int)sparse_m(n);
                 int prev_label = static_cast<int>(sparse_m(n+1));
                 int curr_label = static_cast<int>(sparse_m(n+2));
                 ExpF(f_index) += alpha[prev_label] * Vi(curr_label) * Mi(prev_label,curr_label) * betas(curr_label, j);
             }
         }

         alpha = next_alpha;
         alpha*=(1.0 / scale(j));
     }

     // Zx = sum(alpha_i_n) where i = 1..num_labels, n = seq_len
     double Zx = alpha.sum();
     state.loglikelihood -= std::log(Zx);

     // re-correct the value of seq_logli because Zx was computed from
     // scaled alpha values
     for (int k = 0; k < seq_len; k++) {
         state.loglikelihood -= std::log(scale(k));
     }
     // update the gradient vector
     for (size_t k = 0; k < state.num_features; k++) {
         state.grad(k) -= ExpF(k) / Zx;
     }
 }

 /**
  * @brief Compute the log likelihood and gradient vector for each tuple
  */
 AnyType
 lincrf_lbfgs_step_transition::run(AnyType &args) {
     LinCrfLBFGSTransitionState<MutableArrayHandle<double> > state = args[0];
     MappedColumnVector sparse_r = args[1].getAs<MappedColumnVector>();
     MappedColumnVector dense_m = args[2].getAs<MappedColumnVector>();
     MappedColumnVector sparse_m = args[3].getAs<MappedColumnVector>();
     if (state.numRows == 0) {
         state.initialize(*this, static_cast<uint32_t>(args[4].getAs<double>()), static_cast<uint32_t>(args[5].getAs<double>()));
         if (!args[6].isNull()) {
             LinCrfLBFGSTransitionState<ArrayHandle<double> > previousState = args[6];
             state = previousState;
             state.reset();
         }
     }
     state.numRows++;
     compute_logli_gradient(state, sparse_r, dense_m, sparse_m);
     return state;
 }

 /**
  * @brief Perform the perliminary aggregation function: Merge transition states
  */
 AnyType
 lincrf_lbfgs_step_merge_states::run(AnyType &args) {
     LinCrfLBFGSTransitionState<MutableArrayHandle<double> > stateLeft = args[0];
     LinCrfLBFGSTransitionState<ArrayHandle<double> > stateRight = args[1];

     // We first handle the trivial case where this function is called with one
     // of the states being the initial state
     if (stateLeft.numRows == 0)
         return stateRight;
     else if (stateRight.numRows == 0)
         return stateLeft;

     // Merge states together and return
     stateLeft += stateRight;
     return stateLeft;
 }

 /**
  * @brief Perform the licrf_lbfgs final step
  */
 AnyType
 lincrf_lbfgs_step_final::run(AnyType &args) {
 // We request a mutable object. Depending on the backend, this might perform
     // a deep copy.
     LinCrfLBFGSTransitionState<MutableArrayHandle<double> > state = args[0];

     // Aggregates that haven't seen any data just return Null.
     if (state.numRows == 0)
         return Null();

     // To avoid overfitting, penalize the likelihood with a spherical Gaussian
     // weight prior
     double sigma_square = 100;
     state.loglikelihood -= (state.coef.dot(state.coef)/(2 * sigma_square));
     state.grad -= state.coef/sigma_square;

     // the lbfgs minimize function, we want to maximize loglikelihood
     state.loglikelihood = state.loglikelihood * -1;
     state.grad = -state.grad;

     double eps = 0.001; //accuracy of the solution to be found
     double xtol = 1.0e-16; //an estimate of the machine precision

     assert((state.m > 0) && (state.m <= state.num_features) && (eps >= 0.0));
     LBFGS instance(state);// initialize the lbfgs with state of last iteration
     instance.lbfgs(state.num_features, state.m, state.loglikelihood, state.grad, eps, xtol);// lbfgs optimization
     instance.save_state(state);//save current state for the next iteration of lbfgs
     switch(instance.iflag)
     {
     case -1:
         throw std::logic_error("The line search rountine mcsch failed");
         break;
     case -2:
         throw std::logic_error("The i-th diagonal element of the diagonal inverse Hessian"
                                "approximation, given in DIAG, is not positive");
         break;
     case -3:
         throw std::logic_error("Improper input parameters for LBFGS n or m are not positive");
         break;
     }

     if(!state.coef.is_finite())
         throw NoSolutionFoundException("Over- or underflow in "
                                        "L-BFGS step, while updating coefficients. Input data "
                                        "is likely of poor numerical condition.");

     state.iteration++;
     return state;
 }

 /**
  * @brief Return iflag which indicates whether L-BFGS converge or not
  */
 AnyType
 internal_lincrf_lbfgs_converge::run(AnyType &args) {
     LinCrfLBFGSTransitionState<ArrayHandle<double> > state = args[0];
     return state.lbfgs_state(6);
 }

 /**
  * @brief Return the coefficients and diagnostic statistics of the state
  */
 AnyType
 internal_lincrf_lbfgs_result::run(AnyType &args) {
     LinCrfLBFGSTransitionState<ArrayHandle<double> > state = args[0];
     return stateToResult(*this, state.coef, state.loglikelihood, state.iteration);
 }

 /**
  * @brief Return the results for the L-BFGS method.
  */
 AnyType stateToResult(
     const Allocator &inAllocator,
     const HandleMap<const ColumnVector, TransparentHandle<double> > &incoef,
     double loglikelihood, const uint32_t &iterations) {
     // FIXME: We currently need to copy the coefficient to a native array
     // This should be transparent to user code
     MutableNativeColumnVector coef(
         inAllocator.allocateArray<double>(incoef.size()));
     coef = incoef;

     // Return all coefficients, loglikelihood in a tuple
     AnyType tuple;
     tuple << coef << loglikelihood << iterations;
     return tuple;
 }

 } // namespace crf

 } // namespace modules

 } // namespace madlib