% When using TeXShop on the Mac, let it know the root document.
% The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter[Linear-chain Conditional Random Field]{Linear-chain Conditional Random Field}
% Motivation. Why do we want to have this abstract layer?
A conditional random field (CRF) \cite{DBLP:conf/icml/LaffertyMP01} is a type of discriminative undirected probabilistic graphical model.
Linear-chain CRFs are a special case in which each state depends only on the immediately preceding state.
Linear-chain CRFs achieve state-of-the-art accuracy in several real-world natural language processing tasks such
as information extraction \cite{chen2012optimizing}, part-of-speech tagging (POS), and named entity resolution (NER).
\section{Linear-chain CRF Learning}
\subsection{Mathematical Notations}
\begin{itemize}
\item $p(\boldsymbol Y | \boldsymbol X)$: conditional probability distributions of label sequence $\boldsymbol Y$ given input sequence $\boldsymbol X$.
\item $M$: total number of unique features.
\item $I$: the position of the last token in a sentence.
\item $N$: number of sentences in the training data set.
\item $\lambda$: the coefficients (feature weights).
\item $\ell_{\lambda}$: log-likelihood summed over all training sentences.
\item $\nabla \ell_{\lambda}$: gradient vector summed over all training sentences.
\item $\ell_{\lambda}^\prime$: log-likelihood penalized with a spherical Gaussian weight prior to avoid overfitting.
\item $\nabla \ell_{\lambda}^\prime$: gradient vector penalized with a spherical Gaussian weight prior to avoid overfitting.
\end{itemize}
\subsection{Formulation}
A linear-chain CRF \cite{DBLP:conf/naacl/ShaP03} is a distribution
\[p(\boldsymbol Y | \boldsymbol X) = \frac{\exp\left(\sum_{m=1}^M \sum_{i=0}^{I} \lambda_m f_m(y_i,y_{i-1},x_i)\right)}{Z(\boldsymbol X)},\]
where $Z(\boldsymbol X)$ is an instance-specific normalizer
\[Z(\boldsymbol X) = \sum_{\boldsymbol y} \exp\left(\sum_{m=1}^M \sum_{i=0}^{I} \lambda_m f_m(y_i,y_{i-1},x_i)\right).\]
We train a CRF by maximizing the log-likelihood of a given training set $T=\{(\vec{x^{(k)}},\vec{y^{(k)}})\}_{k=1}^N$
and seek the zero of the gradient:\\
\[\ell_{\lambda}=\sum_k \log p_\lambda(\vec{y^{(k)}}|\vec{x^{(k)}}) =\sum_k[\lambda F(\vec{y^{(k)}},\vec{x^{(k)}})-\log Z_\lambda(\vec{x^{(k)}})]\]
\[\nabla \ell_{\lambda}=\sum_k[F(\vec{y^{(k)}},\vec{x^{(k)}})-E_{p_\lambda(Y|\vec{x^{(k)}})}F(Y,\vec{x^{(k)}})]\]
To avoid overfitting, we penalize the log-likelihood with a spherical Gaussian weight prior:\\
\[\ell_{\lambda}^\prime=\sum_k[\lambda F(\vec{y^{(k)}},\vec{x^{(k)}})-\log Z_\lambda(\vec{x^{(k)}})]-\frac{\lVert \lambda \rVert^2}{2\sigma ^2}\]
\[\nabla \ell_{\lambda}^\prime=\sum_k[F(\vec{y^{(k)}},\vec{x^{(k)}})-E_{p_\lambda(Y|\vec{x^{(k)}})}F(Y,\vec{x^{(k)}})]-\frac{\lambda}{\sigma ^2}\]
Note: we hard-code $\sigma$ to $100$ in the implementation, following other CRF packages in the literature.
\subsection{Forward-backward Algorithm}
$E_{p_\lambda(Y|x)}F(Y,x)$ is computed using a variant of the forward-backward algorithm:
\[E_{p_\lambda(Y|x)}F(Y,x) = \sum_y p_\lambda(y|x)F(y,x) = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)}\]
\[Z_\lambda(x) = \alpha_I \cdot \boldsymbol 1^T\]
where $\alpha_i$ and $\beta_i$ are the forward and backward state cost vectors defined by\\
\[\alpha_i =
\begin{cases}
\alpha_{i-1}M_i, & 0 < i \le I\\
\boldsymbol 1, & i = 0
\end{cases}
,\qquad
\beta_i^T =
\begin{cases}
M_{i+1}\beta_{i+1}^T, & 0 \le i < I\\
\boldsymbol 1, & i = I
\end{cases}
\]
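For concreteness, the following is a minimal Python sketch of these recursions for a single sentence; it is an illustration only, not the in-database C++ implementation, and it assumes that the per-position transition matrices $M_i$ (and, for a feature $m$, the matrices of $f_m$ values) have already been assembled from the features and the current weights.
\begin{lstlisting}[language=Python]
import numpy as np

def forward_backward(M):
    """Forward-backward pass for one sentence.

    M: list of I matrices; M[i-1][y_prev, y] = exp(sum_m lambda_m f_m(y, y_prev, x_i)).
    Returns alpha (I+1 rows), beta (I+1 rows), and the normalizer Z_lambda(x).
    """
    I, n_labels = len(M), M[0].shape[0]
    alpha = np.ones((I + 1, n_labels))   # alpha_0 = 1
    beta = np.ones((I + 1, n_labels))    # beta_I = 1
    for i in range(1, I + 1):
        alpha[i] = alpha[i - 1] @ M[i - 1]       # alpha_i = alpha_{i-1} M_i
    for i in range(I - 1, -1, -1):
        beta[i] = M[i] @ beta[i + 1]             # beta_i^T = M_{i+1} beta_{i+1}^T
    Z = alpha[I].sum()                           # Z_lambda(x) = alpha_I . 1^T
    return alpha, beta, Z

def expected_count(M, F_m, alpha, beta, Z):
    """E_{p_lambda(Y|x)} F_m(Y,x); F_m[i-1][y_prev, y] holds f_m(y, y_prev, x_i)."""
    return sum(alpha[i] @ (F_m[i] * M[i]) @ beta[i + 1] for i in range(len(M))) / Z
\end{lstlisting}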
\subsection{L-BFGS Convex Solver}
The limited-memory BFGS (L-BFGS) algorithm \cite{DBLP:journals/siamjo/MoralesN00} is the
limited-memory variant of the Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm and a
state-of-the-art method for large-scale unconstrained convex optimization.
We translate the in-memory Java implementation into a C++ in-database
implementation using Eigen support: Eigen vectors and Eigen matrices are used
instead of plain one-dimensional and two-dimensional arrays. The Java in-memory
implementation defines many static variables that are shared
between iterations. In the MADlib implementation, we instead keep these
variables in the state object. Before each iteration of L-BFGS optimization, we
initialize the L-BFGS instance with the current state object. At the end of each
iteration, we dump the updated variables back to the database state for the next
iteration.
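This save/restore cycle can be pictured with the hypothetical sketch below; the class and the state fields are illustrative assumptions and only show how variables that the Java version keeps in static fields are instead loaded from, and written back to, the database state object around each iteration.
\begin{lstlisting}[language=Python]
class LBFGS:
    """Hypothetical solver wrapper mirroring the load/compute/save cycle."""

    def __init__(self, state):
        # Initialize the solver from the previous iteration's state object.
        self.x = state["coef"]         # current coefficients
        self.g = state["grad"]         # current gradient
        self.s_hist = state["s_hist"]  # history of coefficient updates
        self.y_hist = state["y_hist"]  # history of gradient updates

    def lbfgs(self):
        # One L-BFGS update of self.x using the stored history (omitted here).
        ...

    def save_state(self, state):
        # Dump the updated variables back into the state for the next iteration.
        state["coef"], state["grad"] = self.x, self.g
        state["s_hist"], state["y_hist"] = self.s_hist, self.y_hist

# Inside one iteration: instance = LBFGS(state); instance.lbfgs(); instance.save_state(state)
\end{lstlisting}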
\subsection{Parallel CRF Training}
\begin{algorithm}[CRF training$(z_{1:M})$] \label{alg:CRF training}
\alginput{Observation set $z_{1:M}$,\\
convergence criterion $\mathit{Convergence}()$,\\
start strategy $\mathit{Start}()$,\\
initialization strategy $\mathit{Initialization}()$,\\
transition strategy $\mathit{Transition}()$,\\
finalization strategy $\mathit{Finalization}()$}
\algoutput{Coefficients $w \in \mathbb{R}^N$}
\algprecond{$iteration = 0, diag = \boldsymbol 1$}
\begin{algorithmic}[1]
\State $w_\text{new} \set \mathit{Start}(z_{1:M})$
\Repeat
\State $w_\text{old} \set w_\text{new}$
\State $\mathit{state} \set \mathit{Initialization}(w_\text{new})$
\For{$m \in 1..M$} \Comment{Single entry in the observation set}
\State $\mathit{state} \set \mathit{Transition}(\mathit{state}, z_m)$
\Comment{Computing gradient and log-likelihood.}
\EndFor
\State $w_\text{new} \set Finalization(\mathit{state})$ \Comment{Mainly invoke L-BFGS convex solver}
\Until{$Convergence(w_\text{new}, g_\text{new}, \mathit{iteration})$}
\State \Return $w_\text{new}$
\end{algorithmic}
\end{algorithm}
\paragraph{Programming Model.}
The algorithm above expresses the parallel CRF training strategy in the programming model supported by MADlib (mainly user-defined aggregates).
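Conceptually, the aggregate evaluates the strategies as in the Python sketch below; the driver function and the explicit partitioning into segments are illustrative assumptions, not MADlib's actual API.
\begin{lstlisting}[language=Python]
def crf_train_iteration(w, segments, initialization, transition, merge, finalization):
    """One outer-loop iteration of the CRF training algorithm, phrased as an aggregate.

    segments: the observation set z_1..z_M partitioned across database segments.
    """
    partial_states = []
    for segment in segments:            # evaluated in parallel, one per segment
        state = initialization(w)
        for z_m in segment:             # Transition: per-tuple gradient / log-likelihood
            state = transition(state, z_m)
        partial_states.append(state)

    state = partial_states[0]
    for other in partial_states[1:]:    # Merge: combine per-segment partial states
        state = merge(state, other)

    return finalization(state)          # Finalization: invoke the L-BFGS convex solver
\end{lstlisting}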
\paragraph{Parallelism.}
The outer loop is inherently sequential over multiple iterations:
iteration $n+1$ takes the output of iteration $n$ as input, and so forth until the convergence criterion is satisfied.
The inner loop, which calculates the gradient and log-likelihood for each document, is data-parallel.
Simple model averaging is used to merge two states.
A merge function is not explicitly added to the pseudocode for simplicity.
The finalization function invokes the L-BFGS convex solver to get a new solution. L-BFGS is sequential, but very fast.
Experiments show that the speed-up ratio approaches the number of segments configured in the Greenplum database.
\paragraph{Convergence criterion.}
Usually, the following conditions are combined by AND, OR, or NOT.
\begin{enumerate}
\item The norm of gradient divided by the norm of coefficient drops below a given threshold.
\item The maximum number of iterations is reached.
\item Other conditions can be added as needed; a sketch of a combined criterion follows this list.
\end{enumerate}
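As an illustration, a combined criterion using conditions 1 and 2 joined by OR could look like the following sketch; the threshold and iteration limit are assumed defaults, not MADlib's.
\begin{lstlisting}[language=Python]
import numpy as np

def convergence(w_new, g_new, iteration, threshold=1e-4, max_iterations=100):
    """Stop when the relative gradient norm drops below the threshold
    OR the maximum number of iterations is reached."""
    relative_gradient = np.linalg.norm(g_new) / max(np.linalg.norm(w_new), 1e-12)
    return relative_gradient < threshold or iteration >= max_iterations
\end{lstlisting}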
\paragraph{Start strategy.}
In most cases, zeros are used unless otherwise specified.
\paragraph{Transition strategies.}
This function contains the logic for computing the gradient and log-likelihood of each tuple using the forward-backward
algorithm described above.
\begin{algorithm}[transition-lbfgs$(\mathit{state}, z_m)$] \label{alg:transition-lbfgs}
\alginput{Transition state $\mathit{state}$,\\
observation entry $z_m$,\\
gradient function $\mathit{Gradient}()$}
\algoutput{Transition state $\mathit{state}$}
\begin{algorithmic}[1]
\State $\{state.g,state.loglikelihood\} \set \mathit{Gradient}(\mathit{state}, z_m)$
\Comment{using forward-backward algorithm to calculate gradient and loglikelihood}
\State $\mathit{state}.num\_rows \set \mathit{state}.num\_rows + 1$
\State \Return $\mathit{state}$
\end{algorithmic}
\end{algorithm}
\paragraph{Merge strategies.}
The merge function simply sums the gradient and log-likelihood over all training documents.
\begin{algorithm}[merge-lbfgs$(\mathit{state_1}, \mathit{state_2})$] \label{alg:merge-lbfgs}
\alginput{Transition state $\mathit{state_1}$,\\
Transition state $\mathit{state_2}$}
\algoutput{Transition state $\mathit{state_{new}}$}
\begin{algorithmic}[1]
\State $\mathit{state_{new}}.g \set \mathit{state_1}.g + \mathit{state_2}.g$
\State $\mathit{state_{new}}.loglikelihood \set \mathit{state_1}.loglikelihood + \mathit{state_2}.loglikelihood$
\State \Return $\mathit{state_{new}}$
\end{algorithmic}
\end{algorithm}
\paragraph{Finalization strategy.}
The finalization function invokes the L-BFGS convex solver to get a new coefficient vector.\\
\begin{algorithm}[finalization-lbfgs$(state)$] \label{alg:CRF training}
\alginput{Transition state $state$,\\
LBFGS $\mathit{lbfgs}()$}
\algoutput{Transition state $state$}
\begin{algorithmic}[1]
\State $\{state.g,state.loglikelihood\} \set penalty(state.g,state.loglikelihood)$ \Comment{To avoid overfitting, add penalization}
\State $\{state.g,state.loglikelihood\}\set-\{state.g,state.loglikelihood\}$ \Comment{negation for maximization}
\State LBFGS instance($state)$ \Comment{initialize the L-BFGS instance with previous state}
\State instance.$lbfgs()$ \Comment{invoke the L-BFGS convex solver}
\State instance.save\_state$(state)$ \Comment{save updated variables to the state for next iteration}
\State \Return $state$
\end{algorithmic}
\end{algorithm}
Fed with the current solution, gradient, log-likelihood, etc., L-BFGS outputs a new solution.
To avoid overfitting, a penalization function is needed; we choose to penalize the log-likelihood with a spherical Gaussian weight prior.
Also, the optimization direction of the L-BFGS solver is the opposite of ours, so we negate the gradient vector and the log-likelihood before invoking it.
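The sketch below spells out these two adjustments in Python, following the penalized formulas of the learning section with the hard-coded $\sigma = 100$; it is an illustration, not the actual C++ finalization code.
\begin{lstlisting}[language=Python]
import numpy as np

SIGMA = 100.0   # hard-coded standard deviation of the Gaussian prior

def penalty(grad, loglikelihood, coef, sigma=SIGMA):
    """Penalize with the spherical Gaussian weight prior to avoid overfitting."""
    loglikelihood -= np.dot(coef, coef) / (2.0 * sigma ** 2)
    grad = grad - coef / sigma ** 2
    return grad, loglikelihood

def negate(grad, loglikelihood):
    """Flip the sign of the objective to match the solver's optimization direction."""
    return -grad, -loglikelihood
\end{lstlisting}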
\section{Linear-chain CRF Applications}
Linear-chain CRFs can be used in various applications such as part-of-speech tagging and named entity resolution.
All the following sections assume that the application is part-of-speech tagging; it can be adapted to named entity resolution with minimal effort.
\subsection{Part of Speech Tagging}
Part-of-speech tagging, also called grammatical tagging or word-category disambiguation \cite{DBLP:journals/coling/DeRose88}, is the process of assigning
a part of speech to each word in a sentence. POS tagging has been widely used in information retrieval and text-to-speech. There are two distinct methods for
the POS task: rule-based and stochastic.
In the rule-based method, a large collection of rules is defined to identify the tag. The stochastic method is based on probabilistic
graphical models such as hidden Markov models and conditional random fields. In practice, conditional random fields have been shown
to achieve state-of-the-art accuracy.
\subsection{Tag Set}
There are various tag sets used in the literature. The Penn Treebank tag set \cite{DBLP:journals/coling/MarcusSM94} is the most commonly used one and contains
45 tags. The following table shows part of the tags in the tag set.
\begin {table}[h]
\caption {Penn Treebank III Tag Set} \label{tab:tagset}
\[\begin{tabular}{lll||lll}
Tag & Description & Example & Tag & Description & Example\\
\hline
CC & Coordinating conjunction & and, but, or & SYM & Symbol & +, \%, \&\\
CD & Cardinal number & one, two, three & TO & `to' & to\\
DT & Determiner & a, the & UH & Interjection & ah, oops\\
EX & Existential & there & VB & Verb, base form & eat\\
... & ... & ... & ... & ... & ...\\
RBR & Adverb, comparative & faster & . & Sentence-final punc & (. ! ?)\\
RBS & Adverb, superlative & fastest & : & Mid-sentence punc & (: ; ... -)\\
RP & Particle & up, off & & &\\
\hline
\end{tabular}\]
\end{table}
\subsection{Regular Expression Table}
A regex feature captures the relationship between the morphology of a token and its corresponding tag.
For example, a token ending with `s' is most likely to be a plural noun, whereas a token ending with `ly' is more likely to be an
adverb. One can define custom regular expressions to capture the intrinsic characteristics of the given training data.
\begin {table}[h]
\caption {Regular expression table} \label{tab:regex}
\begin{center}
\begin{tabular}{ll||ll}
pattern & name & pattern & name\\
\hline
$\wedge[A-Z][a-z]+\$$ & InitCapital & $\wedge[A-Z]+\$$ & isAllCapital \\
$\wedge.*[0-9]+.*\$$ & containsDigit & $\wedge.+[.]\$$ & endsWithDot\\
$\wedge.+[,]\$$ & endsWithComma & $\wedge.+er\$$ & endsWithEr\\
$\wedge.+est\$$ & endsWithEst & $\wedge.+ed\$$ & endsWithEd\\
$\wedge.+s\$$ & endsWithS & $\wedge.+ing\$$ & endsWithIng\\
$\wedge.+ly\$$ & endsWithly & $\wedge.+-.+\$$ & isDashSeparatedWords\\
$\wedge.*@.*\$$ & isEmailId & & \\
\hline
\end{tabular}
\end{center}
\end{table}
\section{Feature Extraction}
The Feature Extraction module provides functionality for basic text-analysis
tasks such as part-of-speech (POS) tagging and named-entity resolution.
At present, six feature types are implemented.
\begin{itemize}
\item Edge Feature: transition feature that encodes the transition weight from the current label to the next label.
\item Start Feature: fired when the current token is the first token in a sentence.
\item End Feature: fired when the current token is the last token in a sentence.
\item Word Feature: fired when the current token is observed in the trained dictionary.
\item Unknown Feature: fired when the current token has not been observed in the trained dictionary at least a certain number of times.
\item Regex Feature: fired when the current token matches one of the regular expressions in the regular expression table.
\end{itemize}
Advantages of extracting features using SQL statements:
\begin{itemize}
\item [$\star$] Decoupling the feature extraction from the rest of the code.
\item [$\star$] Compared with a procedural language, SQL is much easier to understand.
\item [$\star$] Storing all the features in tables avoids recomputing features over iterations, which also boosts performance.
\item [$\star$] SQL is naturally parallelized.
\end{itemize}
\subsection{Column Names Convention and Table Schema}
\subsubsection{Column Names Convention}
The following column names are commonly used in the tables of the following sections.
\begin{itemize}
\item doc\_id: Unique integer identifier of a document.
\item start\_pos: Position of the token in the document, starting from 0.
\item seg\_text: The text token itself.
\item prev\_label: Label of the previous token.
\item label: Label of the current token.
\item max\_pos: End position of the document.
\item weight: Feature weight associated with a certain feature.
\item f\_index: Unique integer identifier of a feature.
\item f\_name: Feature name.
\end{itemize}
\subsubsection{Training and Testing Data Schema}
The text data has to be tokenized before it can be stored in the database. One of the commonly used tokenization programs for part-of-speech
tagging is the Treebank tokenization script.
The following table depicts how the training/testing data is stored in the database table.
\begin {table}[h]
\caption {Training or Testing data} \label{tab:trainingdata}
\begin{center}
\footnotesize
\begin{tabular}{llll||llll}
start\_pos & doc\_id & seg\_text & label & start\_pos & doc\_id & seg\_text & label\\
\hline
0&1&'confidence'&11& 1&1&'in'&5\\
2&1&'the'&2& 3&1&'pound'&11\\
4&1&'is'&31& 5&1&'widely'&19\\
6&1&'expected'&29& 7&1&'to'&24\\
8&1&'take'&26& 9&1&'another'&2\\
10&1&'sharp'&6& 11&1&'dive'&11\\
12&1&'if'&5& 13&1&'trade'&11\\
14&1&'figures'&12& 15&1&'for'&5\\
16&1&'september'&13& 17&1&','&42\\
18&1&'due'&6& 19&1&'for'&5\\\hline
\end{tabular}
\end{center}
\end{table}
\subsection{Design Challenges and Workarounds}
As far as we know, the MADlib C++ abstraction layer does not support arrays of self-defined composite data types or multi-dimensional arrays,
but we need such complex data types in the implementation of this module. For example, the
$viterbi\_mtbl$ table is really a two-dimensional array. Due to the limitations of the current C++ abstraction layer, we have to convert the matrix
into an array and later index the data with $M[i*n+j]$ instead of the usual $M[i][j]$. Another example is the data type used to represent the
features. A single feature cannot be represented by a single DOUBLE variable but rather by a unit of struct: $[prev\_label, label, f\_index, start\_pos, exist]$.
Since there are no arrays of struct type, we had to represent it with a one-dimensional array.
We also have to store the features for a document using an array of doubles instead of an array of structs.
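The $M[i*n+j]$ index arithmetic above is plain row-major flattening; the small helper below is a hypothetical illustration of the convention, not code from the abstraction layer.
\begin{lstlisting}[language=Python]
def flat_index(i, j, n):
    """Map the 2-D index M[i][j] of a matrix with n columns to the flat index i*n + j."""
    return i * n + j

# Example: a 2 x 3 matrix stored as a flat array of doubles.
M = [1.0, 2.0, 3.0,
     4.0, 5.0, 6.0]
assert M[flat_index(1, 2, n=3)] == 6.0   # the element that M[1][2] would address
\end{lstlisting}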
\subsection{Training Data Feature Extraction}
Given the training data, SQL statements are written to extract all the features. It turns out that each type of feature mentioned above can be extracted with a single SQL statement, which keeps the code succinct. We illustrate the training data feature extraction with example SQL statements.
\paragraph{Sample Feature Extraction SQLs for edge features and regex features}
\begin{itemize}
\item $SQL_1$:\\
\begin{lstlisting}[language=SQL,gobble=4]
SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label]
FROM segmenttbl doc1, segmenttbl doc2
WHERE doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos
\end{lstlisting}
\item $SQL_2$:\\
\begin{lstlisting}[language=SQL,gobble=4]
SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label]
FROM regextbl, segmenttbl
WHERE seg_text ~ pattern
\end{lstlisting}
\end{itemize}
\paragraph{Build the feature dictionary and assign each feature with a unique feature id}
\begin{itemize}
\item $SQL_3$\\
\begin{lstlisting}[language=SQL,gobble=4]
INSERT INTO tmp_featureset(f_name, feature)
SELECT DISTINCT f_name, feature
FROM tmp1_feature;
INSERT INTO featureset(f_index, f_name, feature)
SELECT nextval('seq')-1, f_name, feature
FROM tmp_featureset;
\end{lstlisting}
\end{itemize}
\paragraph{Generate sparse\_r table}
\begin{itemize}
\item $SQL_4$\\
\begin{lstlisting}[language=SQL,gobble=4]
INSERT INTO rtbl(start_pos,doc_id,feature)
SELECT start_pos, doc_id, array_cat(fset.feature,
ARRAY[f_index,start_pos,
CASE WHEN tmp1_feature.feature = fset.feature THEN 1
ELSE 0 END] )
FROM tmp1_feature, featureset fset
WHERE tmp1_feature.f_name = fset.f_name AND fset.f_name <> 'E.';
\end{lstlisting}
\end{itemize}
The final input table schema, which contains all the feature data for the CRF learning algorithm, is as follows:
\begin{center}
\begin{tabular}{ | l | l | l | l |}
\hline
doc\_id & sparse\_r & dense\_m & sparse\_m \\
\hline
\end{tabular}
\end{center}
\begin{itemize}
\item
sparse r feature (single-state feature): (prev\_label, label, f\_index, start\_pos, exist)
\begin{center}
\begin{tabular}{ | l | l |}
\hline
Column & Description \\ \hline
prev\_label & the label of the previous token; it is always 0 in the r table\\
label & the label of the single-state feature\\
f\_index & the index of the feature in the feature table\\
start\_pos & the position of the token (starting from 0)\\
exist & indicates whether the token exists in the actual training data set\\
\hline
\end{tabular}
\end{center}
\item
dense m feature:
(prev\_label, label, f\_index, start\_pos, exist)
\begin{center}
\begin{tabular}{ | l | l |}
\hline
Column & Description \\ \hline
prev\_label & the label of the previous token\\
label & the label of the current token\\
f\_index & the index of the feature in the feature table\\
start\_pos & the position of the token in a sentence (starting from 0)\\
exist & indicates whether the token exists in the actual training data set\\
\hline
\end{tabular}
\end{center}
\item
sparse m feature:(f\_index, prev\_label, label)
\begin{center}
\begin{tabular}{ | l | l |}
\hline
Column & Description \\ \hline
f\_index & the index of the feature in the feature table\\
prev\_label & the label of the previous token \\
label & the label of the current token\\
\hline
\end{tabular}
\end{center}
\end{itemize}
For performance considerations, we split the m feature into a $dense\_m$ feature and a $sparse\_m$ feature.\\
The actual $sparse\_r$ table is the array union of the individual r features, ordered by the start position of the tokens,
so the function that computes the gradient vector and log-likelihood can scan the feature arrays from beginning to end.
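Since every r feature occupies five consecutive slots ($prev\_label$, $label$, $f\_index$, $start\_pos$, $exist$), scanning the flat array amounts to walking it in strides of five, as in the illustrative sketch below (not the actual gradient code).
\begin{lstlisting}[language=Python]
def scan_r_features(sparse_r):
    """Iterate over the flat sparse_r array, five doubles per single-state feature."""
    for offset in range(0, len(sparse_r), 5):
        prev_label, label, f_index, start_pos, exist = sparse_r[offset:offset + 5]
        yield int(label), int(f_index), int(start_pos), int(exist)

# Example: two single-state features packed into one flat array.
sparse_r = [0, 11, 3, 0, 1,
            0, 5, 7, 1, 1]
for label, f_index, start_pos, exist in scan_r_features(sparse_r):
    pass   # accumulate this feature into the gradient and log-likelihood here
\end{lstlisting}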
\subsection{Learned Model}
The CRF learning algorithm generates two tables: the feature table and the dictionary table.
The feature table stores all the features and their corresponding feature weights.
The dictionary contains all the tokens and the number of times they appear in the training data.\\
\begin {table}[h]
\caption {Feature table} \label{tab:featuretbl}
\begin{center}
\scriptsize
\begin{tabular}{ | l | l | l | l | l || l | l | l | l | l | }
\hline
f\_index & f\_name & prev\_label & label & weight & f\_index & f\_name & prev\_label & label & weight\\
\hline
0&'U'&-1&6&2.037& 1&'E.'&2&11&2.746 \\
2&'W\_exchequer'&-1&13&1.821& 3&'W\_is'&-1&31&1.802 \\
4&'E.'&11&31&2.469& 5&'W\_in'&-1&5&3.252 \\
6&'E.'&11&12&1.305& 7&'U'&-1&2&-0.385 \\
8&'E.'&31&29&1.958& 9&'U'&-1&29&1.422 \\
10&'R\_endsWithIng'&-1&11&1.061&11&'W\_of'&-1&5&3.652 \\
12&'S.'&-1&13&1.829& 13&'E.'&24&26&3.282 \\
14&'W\_helped'&-1&29&1.214& 15&'E.'&11&24&1.556 \\
\hline
\end{tabular}
\end{center}
\end {table}
\begin {table}[h]
\caption {Dictionary table} \label{tab:title}
\begin{center}
\scriptsize
\begin{tabular}{ | l | l || l | l || l | l || l | l | }
\hline
token & total & token & total & token & total & token & total\\
\hline
'freefall'& 1 & 'policy'& 2 & 'measures'&1 & 'commitment'&1\\
'new'&1& 'speech'&1& '''s'&2& 'reckon'&1\\
'underlying'&1&'week'&1& 'prevent'&1& 'has'&2\\
'failure'&1& 'restated'&1&'announce'&1& 'thursday'&1\\
'but'&1& 'lawson'&1& 'last'&1& 'firm'&1\\
'exchequer'&1& 'helped'&1& 'sterling'&2& $\ldots$ & $\ldots$\\
\hline
\end{tabular}
\end{center}
\end {table}
\subsection{Testing Data Feature Extraction}
This component extracts features from the testing data based on the learned model.
It produces two factor tables,
$viterbi\_mtbl$ and $viterbi\_rtbl$, which are used to calculate the best label
sequence for each sentence.
\paragraph{Sample Feature Extraction SQLs}
\begin{itemize}
\item $SQL_1$: Extracting unique tokens:\\
\begin{lstlisting}[language=SQL,gobble=4]
INSERT INTO segment_hashtbl
SELECT DISTINCT seg_text
FROM segmenttbl
\end{lstlisting}
\item $SQL_2$: Summarize over all single-state features with respect to specific tokens and labels:\\
\begin{lstlisting}[language=SQL,gobble=4]
INSERT INTO viterbi_rtbl
SELECT seg_text, label, SUM(value)
FROM rtbl
GROUP BY seg_text,label
\end{lstlisting}
\end{itemize}
\begin{center}
\begin{tabular}{ | l | l | l |}
\hline
doc\_id & $viterbi\_mtbl$ & $viterbi\_rtbl$ \\
\hline
\end{tabular}
\end{center}
\begin{itemize}
\item
$viterbi\_mtbl$ table
encodes the edge features, which depend solely on the current label and the
previous label. The m table has three columns: prev\_label, label,
and value.
If the number of labels is $n$, then the m factor table has $n^2$
rows. Each row encodes the transition feature weight from the previous label
to the current label.
The startFeature is treated as a special edge feature from the
beginning of the sentence to the first token. Likewise, the endFeature can be treated
as a special edge feature from the last token to the very end.
So the m table encodes the edgeFeature, startFeature, and endFeature.
If the total number of labels in the label space is 45, numbered from 0 to 44,
then the m factor array is as follows:
\item
$viterbi\_rtbl$ table
is related to specific tokens. It encodes the single-state features,
e.g., wordFeature and regexFeature, for all tokens. The r table is represented as
shown in the table.\\
\begin {table}[h]
\caption {viterbi\_mtbl table} \label{tab:viterbimtbl}
\begin{center}
%\tiny
\begin{tabular}{l*{6}{c}r}
prev\_label & 0 & 1 & 2 & 3 & ... & 43 & 44 \\
\hline
-1 & 2.1 & 1.1 & 1.0 & 1.1 & 1.1 & 2.1 & 1.1 \\
0 & 1.1 & 3.9 & 1.2 & 2.1 & 2.8 & 1.8 & 0.8 \\
1 & 0.7 & 1.7 & 2.9 & 3.8 & 0.6 & 3.2 & 0.2 \\
2 & 0.2 & 3.2 & 3.8 & 2.9 & 0.2 & 0.1 & 0.2 \\
3 & 1.2 & 6.9 & 7.8 & 8.0 & 0.1 & 1.9 & 1.7 \\
... & ... & ... & ... & ... & ... & ... & ... \\
44 & 8.2 & 1.8 & 3.7 & 2.1 & 7.2 & 1.3 & 7.2 \\
45 & 1.8 & 7.8 & 5.6 & 9.8 & 2.3 & 9.4 & 1.1
\end{tabular}
\end{center}
\end{table}
\begin {table}[h]
\caption {viterbi\_rtbl table} \label{tab:viterbirtbl}
\begin{center}
%\tiny
\begin{tabular}{l*{6}{c}r}
token & 0 & 1 & 2 & 3 & ... & 43 & 44 \\
\hline
madlib & 0.2 & 4.1 & 0.0 & 2.1 & 0.1 & 2.5 & 1.2 \\
is & 1.3 & 3.0 & 0.2 & 3.1 & 0.8 & 1.9 & 0.9 \\
an & 0.9 & 1.1 & 1.9 & 3.8 & 0.7 & 3.8 & 0.7 \\
open-source & 0.8 & 0.2 & 1.8 & 2.7 & 0.5 & 0.8 & 0.1 \\
library & 1.8 & 1.9 & 1.8 & 8.7 & 0.2 & 1.8 & 1.1 \\
... & ... & ... & ... & ... & ... & ... & ... \\
\end{tabular}
\end{center}
\end{table}
\end{itemize}
\section{Linear-chain CRF Inference}
The Viterbi algorithm \cite{DBLP:journals/scholarpedia/Viterbi09} is a popular algorithm for finding the top-k most likely labelings of a document under a CRF model.
For tasks in the natural language processing domain, it is sufficient to generate only the best label sequence.
We chose to use a SQL clause to drive the inference over all documents.
In Greenplum, Viterbi can be run in parallel over different subsets of the documents on a multi-core machine.
\subsection{Parallel CRF Inference}
The $vcrf\_top1\_label$ function is implemented sequentially, and each function call labels one document.
The inference is parallel at the document level. We use a SQL statement to drive the inference over all documents,
so the CRF inference is naturally parallel.
\begin{lstlisting}[language=SQL,gobble=4]
SELECT doc_id, vcrf_top1_label(mfactors.score, rfactors.score)
FROM mfactors,rfactors
\end{lstlisting}
\subsection{Viterbi Inference Algorithm}
\[
V(i,y) =
\begin{cases}
\max_{y^\prime}\bigl(V(i-1,y^\prime) + \textstyle \sum_{k=1}^K \lambda_kf_k(y,y^\prime,x_i)\bigr), & \text{if } i\ge0 \\
0, & \text{if } i=-1.
\end{cases}
\]
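A compact Python sketch of this recursion with backtracking is given below. It assumes the per-position scores $\sum_{k} \lambda_k f_k(y, y^\prime, x_i)$ have already been assembled (e.g., from the $viterbi\_mtbl$ and $viterbi\_rtbl$ factors) into a single array, and it is an illustration rather than the in-database $vcrf\_top1\_label$ implementation.
\begin{lstlisting}[language=Python]
import numpy as np

def viterbi(score):
    """score[i, y_prev, y] = sum_k lambda_k f_k(y, y_prev, x_i) for positions i = 0..I.

    Returns the most likely label sequence y_0 .. y_I.
    """
    n_pos, n_labels, _ = score.shape
    V = np.zeros((n_pos, n_labels))                   # V(i, y); V(-1, .) = 0
    backpointer = np.zeros((n_pos, n_labels), dtype=int)
    for i in range(n_pos):
        prev = V[i - 1] if i > 0 else np.zeros(n_labels)
        totals = prev[:, None] + score[i]             # V(i-1, y') + sum_k lambda_k f_k(y, y', x_i)
        V[i] = totals.max(axis=0)                     # maximize over the previous label y'
        backpointer[i] = totals.argmax(axis=0)
    labels = [int(V[-1].argmax())]                    # best final label
    for i in range(n_pos - 1, 0, -1):                 # follow the backpointers
        labels.append(int(backpointer[i, labels[-1]]))
    return labels[::-1]
\end{lstlisting}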
\subsection{Viterbi Inference output}
The final inference output produces the best label sequence for each document and also the conditional probability given the
observed input sequence.
\begin {table}[h]
\caption {Viterbi inference output}
\begin{center}
\scriptsize
\begin{tabular}{ | l | l | l | l | l | }
\hline
doc\_id & start\_pos & token & label & probability \\\hline
1 & 0 & madlib & proper noun, singular &0.6\\
1 & 1 & is & verb, base form &0.6 \\
1 & 2 & an & determiner &0.6 \\
1 & 3 & open-source & adjective &0.6 \\
1 & 4 & library & noun &0.6 \\
1 & 5 & for & preposition &0.6 \\
1 & 6 & scalable & adjective &0.6 \\
1 & 7 & in-database & adverb &0.6 \\
1 & 8 & analytics & noun, singular &0.6 \\
1 & 9 & . & sentence-final punc &0.6 \\
2 & 0 & it & personal pronoun &0.4 \\
2 & 1 & provides & verb, base form &0.4 \\
2 & 2 & data-parallel & noun &0.4 \\
2 & 3 & $\ldots$ & $\ldots$ &0.4 \\
\hline
\end{tabular}
\end{center}
\end{table}