docs/v0.6/crf_8sql__in_source.html - madlib-site - Git at Google

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
 <title>MADlib: crf.sql_in Source File</title>

 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <link href="doxygen.css" rel="stylesheet" type="text/css" />
 <link href="navtree.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="resize.js"></script>
 <script type="text/javascript" src="navtree.js"></script>
 <script type="text/javascript">
   $(document).ready(initResizable);
 </script>
 <link href="search/search.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="search/search.js"></script>
 <script type="text/javascript">
   $(document).ready(function() { searchBox.OnSelectItem(0); });
 </script>
 <script src="../mathjax/MathJax.js">
   MathJax.Hub.Config({
     extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
     jax: ["input/TeX","output/HTML-CSS"],
 });
 </script>
 </head>
 <body>
 <div id="top"><!-- do not remove this div! -->


 <div id="titlearea">
 <table cellspacing="0" cellpadding="0">
  <tbody>
  <tr style="height: 56px;">


   <td style="padding-left: 0.5em;">
    <div id="projectname">MADlib
    &#160;<span id="projectnumber">0.6</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./crf_8sql__in_source.html"> A newer version is available</a></span>
    </div>
    <div id="projectbrief">User Documentation</div>
   </td>


  </tr>
  </tbody>
 </table>
 </div>

 <!-- Generated by Doxygen 1.7.5.1 -->
 <script type="text/javascript">
 var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </script>
 <script type="text/javascript" src="dynsections.js"></script>
   <div id="navrow1" class="tabs">
     <ul class="tablist">
       <li><a href="index.html"><span>Main&#160;Page</span></a></li>
       <li><a href="modules.html"><span>Modules</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
       <li>
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
           <img id="MSearchSelect" src="search/mag_sel.png"
                onmouseover="return searchBox.OnSearchSelectShow()"
                onmouseout="return searchBox.OnSearchSelectHide()"
                alt=""/>
           <input type="text" id="MSearchField" value="Search" accesskey="S"
                onfocus="searchBox.OnSearchFieldFocus(true)"
                onblur="searchBox.OnSearchFieldFocus(false)"
                onkeyup="searchBox.OnSearchFieldChange(event)"/>
           </span><span class="right">
             <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
           </span>
         </div>
       </li>
     </ul>
   </div>
   <div id="navrow2" class="tabs2">
     <ul class="tablist">
       <li><a href="files.html"><span>File&#160;List</span></a></li>
       <li><a href="globals.html"><span>File&#160;Members</span></a></li>
     </ul>
   </div>
 </div>
 <div id="side-nav" class="ui-resizable side-nav-resizable">
   <div id="nav-tree">
     <div id="nav-tree-contents">
     </div>
   </div>
   <div id="splitbar" style="-moz-user-select:none;"
        class="ui-resizable-handle">
   </div>
 </div>
 <script type="text/javascript">
   initNavTree('crf_8sql__in.html','');
 </script>
 <div id="doc-content">
 <div class="header">
   <div class="headertitle">
 <div class="title">crf.sql_in</div>  </div>
 </div>
 <div class="contents">
 <a href="crf_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/** </span>
 <a name="l00002"></a>00002 <span class="comment"> *</span>
 <a name="l00003"></a>00003 <span class="comment"> * @file crf.sql_in</span>
 <a name="l00004"></a>00004 <span class="comment"> *</span>
 <a name="l00005"></a>00005 <span class="comment"> * @brief SQL functions for conditional random field</span>
 <a name="l00006"></a>00006 <span class="comment"> * @date July 2012</span>
 <a name="l00007"></a>00007 <span class="comment"> *</span>
 <a name="l00008"></a>00008 <span class="comment"> * @sa For a brief introduction to conditional random field, see the</span>
 <a name="l00009"></a>00009 <span class="comment"> *     module description \ref grp_crf.</span>
 <a name="l00010"></a>00010 <span class="comment"> *</span>
 <a name="l00011"></a>00011 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
 <a name="l00012"></a>00012
 <a name="l00013"></a>00013 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
 <a name="l00014"></a>00014 <span class="stringliteral"></span><span class="comment"></span>
 <a name="l00015"></a>00015 <span class="comment">/**</span>
 <a name="l00016"></a>00016 <span class="comment">@addtogroup grp_crf</span>
 <a name="l00017"></a>00017 <span class="comment"></span>
 <a name="l00018"></a>00018 <span class="comment">@about</span>
 <a name="l00019"></a>00019 <span class="comment">A conditional random field (CRF) is a type of discriminative, undirected probabilistic graphical model.  A linear-chain CRF is a special </span>
 <a name="l00020"></a>00020 <span class="comment">type of CRF that assumes the current state depends only on the previous state.  </span>
 <a name="l00021"></a>00021 <span class="comment"></span>
 <a name="l00022"></a>00022 <span class="comment">Specifically, a linear-chain CRF is a distribution defined by</span>
 <a name="l00023"></a>00023 <span class="comment">\f[</span>
 <a name="l00024"></a>00024 <span class="comment">    p_\lambda(\boldsymbol y | \boldsymbol x) =</span>
 <a name="l00025"></a>00025 <span class="comment">        \frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)}</span>
 <a name="l00026"></a>00026 <span class="comment">    \,.</span>
 <a name="l00027"></a>00027 <span class="comment">\f]</span>
 <a name="l00028"></a>00028 <span class="comment"></span>
 <a name="l00029"></a>00029 <span class="comment">where </span>
 <a name="l00030"></a>00030 <span class="comment">- \f$ F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \f$ is a global feature function that is a sum along a sequence </span>
 <a name="l00031"></a>00031 <span class="comment">  \f$ \boldsymbol x \f$ of length \f$ n \f$</span>
 <a name="l00032"></a>00032 <span class="comment">- \f$ f_m(y_i,y_{i-1},x_i) \f$ is a local feature function dependent on the current token label \f$ y_i \f$, the previous token label \f$ y_{i-1} \f$, </span>
 <a name="l00033"></a>00033 <span class="comment">  and the observation \f$ x_i \f$</span>
 <a name="l00034"></a>00034 <span class="comment">- \f$ \lambda_m \f$ is the corresponding feature weight </span>
 <a name="l00035"></a>00035 <span class="comment">- \f$ Z_\lambda(\boldsymbol x) \f$ is an instance-specific normalizer</span>
 <a name="l00036"></a>00036 <span class="comment">\f[</span>
 <a name="l00037"></a>00037 <span class="comment">Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y&#39;} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y&#39;)}</span>
 <a name="l00038"></a>00038 <span class="comment">\f]</span>
 <a name="l00039"></a>00039 <span class="comment"></span>
 <a name="l00040"></a>00040 <span class="comment">A linear-chain CRF estimates the weights \f$ \lambda_m \f$ by maximizing the log-likelihood </span>
 <a name="l00041"></a>00041 <span class="comment">of a given training set \f$ T=\{(x_k,y_k)\}_{k=1}^N \f$.  </span>
 <a name="l00042"></a>00042 <span class="comment"></span>
 <a name="l00043"></a>00043 <span class="comment">The log-likelihood is defined as</span>
 <a name="l00044"></a>00044 <span class="comment">\f[</span>
 <a name="l00045"></a>00045 <span class="comment">    \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)]</span>
 <a name="l00046"></a>00046 <span class="comment">\f]</span>
 <a name="l00047"></a>00047 <span class="comment"></span>
 <a name="l00048"></a>00048 <span class="comment">and the zero of its gradient</span>
 <a name="l00049"></a>00049 <span class="comment">\f[</span>
 <a name="l00050"></a>00050 <span class="comment">    \nabla \ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]]</span>
 <a name="l00051"></a>00051 <span class="comment">\f]</span>
 <a name="l00052"></a>00052 <span class="comment"></span>
 <a name="l00053"></a>00053 <span class="comment">is found since the maximum likelihood is reached when the empirical average of the global feature vector equals its model expectation.  The MADlib implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method for unconstrained optimization. </span>
 <a name="l00054"></a>00054 <span class="comment"></span>
 <a name="l00055"></a>00055 <span class="comment">\f$E_{p_\lambda(Y|x)}[F(x,Y)]\f$ is found by using a variant of the forward-backward algorithm:</span>
 <a name="l00056"></a>00056 <span class="comment">\f[</span>
 <a name="l00057"></a>00057 <span class="comment">    E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y)</span>
 <a name="l00058"></a>00058 <span class="comment">                            = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)}</span>
 <a name="l00059"></a>00059 <span class="comment">\f]</span>
 <a name="l00060"></a>00060 <span class="comment">\f[</span>
 <a name="l00061"></a>00061 <span class="comment">    Z_\lambda(x) = \alpha_n \cdot \boldsymbol{1}^T</span>
 <a name="l00062"></a>00062 <span class="comment">\f]</span>
 <a name="l00063"></a>00063 <span class="comment">    where \f$\alpha_i\f$  and \f$ \beta_i\f$ are the forward and backward state cost vectors defined by</span>
 <a name="l00064"></a>00064 <span class="comment">\f[</span>
 <a name="l00065"></a>00065 <span class="comment">    \alpha_i = </span>
 <a name="l00066"></a>00066 <span class="comment">    \begin{cases}</span>
 <a name="l00067"></a>00067 <span class="comment">    \alpha_{i-1}M_i, &amp; 0&lt;i\leq n\\</span>
 <a name="l00068"></a>00068 <span class="comment">    1, &amp; i=0</span>
 <a name="l00069"></a>00069 <span class="comment">    \end{cases}</span>
 <a name="l00070"></a>00070 <span class="comment">\f]</span>
 <a name="l00071"></a>00071 <span class="comment">\f[</span>
 <a name="l00072"></a>00072 <span class="comment">    \beta_i^T = </span>
 <a name="l00073"></a>00073 <span class="comment">    \begin{cases}</span>
 <a name="l00074"></a>00074 <span class="comment">    M_{i+1}\beta_{i+1}^T, &amp; 1\leq i&lt;n\\</span>
 <a name="l00075"></a>00075 <span class="comment">    1, &amp; i=n</span>
 <a name="l00076"></a>00076 <span class="comment">    \end{cases}</span>
 <a name="l00077"></a>00077 <span class="comment">\f]</span>
 <a name="l00078"></a>00078 <span class="comment"></span>
 <a name="l00079"></a>00079 <span class="comment">To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior:</span>
 <a name="l00080"></a>00080 <span class="comment">\f[</span>
 <a name="l00081"></a>00081 <span class="comment">    \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2}</span>
 <a name="l00082"></a>00082 <span class="comment">\f]</span>
 <a name="l00083"></a>00083 <span class="comment"></span>
 <a name="l00084"></a>00084 <span class="comment">\f[</span>
 <a name="l00085"></a>00085 <span class="comment">    \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2}</span>
 <a name="l00086"></a>00086 <span class="comment">\f]</span>
 <a name="l00087"></a>00087 <span class="comment"></span>
 <a name="l00088"></a>00088 <span class="comment">    </span>
 <a name="l00089"></a>00089 <span class="comment"></span>
 <a name="l00090"></a>00090 <span class="comment">Feature extraction modules are provided for text-analysis</span>
 <a name="l00091"></a>00091 <span class="comment">tasks such as part-of-speech (POS) tagging and named-entity recognition (NER).  Currently, six feature types are implemented:</span>
 <a name="l00092"></a>00092 <span class="comment">- Edge Feature: transition feature that encodes the transition feature</span>
 <a name="l00093"></a>00093 <span class="comment">weight from current label to next label.</span>
 <a name="l00094"></a>00094 <span class="comment">- Start Feature: fired when the current token is the first token in a sequence.</span>
 <a name="l00095"></a>00095 <span class="comment">- End Feature: fired when the current token is the last token in a sequence.</span>
 <a name="l00096"></a>00096 <span class="comment">- Word Feature: fired when the current token is observed in the trained</span>
 <a name="l00097"></a>00097 <span class="comment">dictionary.</span>
 <a name="l00098"></a>00098 <span class="comment">- Unknown Feature: fired when the current token has been observed in the trained</span>
 <a name="l00099"></a>00099 <span class="comment">dictionary fewer than a certain number of times (default 1).</span>
 <a name="l00100"></a>00100 <span class="comment">- Regex Feature: fired when the current token can be matched by a regular</span>
 <a name="l00101"></a>00101 <span class="comment">expression.</span>
 <a name="l00102"></a>00102 <span class="comment"></span>
 <a name="l00103"></a>00103 <span class="comment">A Viterbi implementation is also provided </span>
 <a name="l00104"></a>00104 <span class="comment">to get the best label sequence and the conditional probability</span>
 <a name="l00105"></a>00105 <span class="comment">\f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.</span>
 <a name="l00106"></a>00106 <span class="comment"></span>
 <a name="l00107"></a>00107 <span class="comment">For a full example of how to use the MADlib CRF modules for a text analytics application, see the &quot;Example&quot; section below.</span>
 <a name="l00108"></a>00108 <span class="comment"></span>
 <a name="l00109"></a>00109 <span class="comment">@input</span>
 <a name="l00110"></a>00110 <span class="comment">- User-provided input:\n</span>
 <a name="l00111"></a>00111 <span class="comment">The user is expected to at least provide the label table, the regular expression table, and the segment table:</span>
 <a name="l00112"></a>00112 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;labelTableName&lt;/em&gt; (</span>
 <a name="l00113"></a>00113 <span class="comment">    ...</span>
 <a name="l00114"></a>00114 <span class="comment">    &lt;em&gt;id&lt;/em&gt; INTEGER,</span>
 <a name="l00115"></a>00115 <span class="comment">    &lt;em&gt;label&lt;/em&gt; TEXT,</span>
 <a name="l00116"></a>00116 <span class="comment">    ...</span>
 <a name="l00117"></a>00117 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00118"></a>00118 <span class="comment">where &lt;em&gt;id&lt;/em&gt; is a unique ID for the label and &lt;em&gt;label&lt;/em&gt; is the label name.</span>
 <a name="l00119"></a>00119 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;regexTableName&lt;/em&gt; (</span>
 <a name="l00120"></a>00120 <span class="comment">    ...</span>
 <a name="l00121"></a>00121 <span class="comment">    &lt;em&gt;pattern&lt;/em&gt; TEXT,</span>
 <a name="l00122"></a>00122 <span class="comment">    &lt;em&gt;name&lt;/em&gt; TEXT,</span>
 <a name="l00123"></a>00123 <span class="comment">    ...</span>
 <a name="l00124"></a>00124 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00125"></a>00125 <span class="comment">where &lt;em&gt;pattern&lt;/em&gt; is a regular expression pattern (e.g. &#39;^.+ing$&#39;) and &lt;em&gt;name&lt;/em&gt; is a name for the regular expression pattern (e.g. &#39;endsWithIng&#39;).</span>
 <a name="l00126"></a>00126 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;segmentTableName&lt;/em&gt; (</span>
 <a name="l00127"></a>00127 <span class="comment">    ...</span>
 <a name="l00128"></a>00128 <span class="comment">    &lt;em&gt;start_pos&lt;/em&gt; INTEGER,</span>
 <a name="l00129"></a>00129 <span class="comment">    &lt;em&gt;doc_id&lt;/em&gt; INTEGER,</span>
 <a name="l00130"></a>00130 <span class="comment">    &lt;em&gt;seg_text&lt;/em&gt; TEXT,</span>
 <a name="l00131"></a>00131 <span class="comment">    &lt;em&gt;label&lt;/em&gt; INTEGER,</span>
 <a name="l00132"></a>00132 <span class="comment">    &lt;em&gt;max_pos&lt;/em&gt; INTEGER,</span>
 <a name="l00133"></a>00133 <span class="comment">    ...</span>
 <a name="l00134"></a>00134 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00135"></a>00135 <span class="comment">where &lt;em&gt;start_pos&lt;/em&gt; is the position of the word in the sequence, &lt;em&gt;doc_id&lt;/em&gt; is a unique ID for the sequence, &lt;em&gt;seg_text&lt;/em&gt; is the word, &lt;em&gt;label&lt;/em&gt; is the label for the word, and &lt;em&gt;max_pos&lt;/em&gt; is the length of the sequence.</span>
 <a name="l00136"></a>00136 <span class="comment"></span>
 <a name="l00137"></a>00137 <span class="comment">- Training (\ref lincrf) input:\n</span>
 <a name="l00138"></a>00138 <span class="comment">The feature table used for training is expected to be of the following form (this table can also be generated by \ref crf_train_fgen):\n</span>
 <a name="l00139"></a>00139 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;featureTableName&lt;/em&gt; (</span>
 <a name="l00140"></a>00140 <span class="comment">    ...</span>
 <a name="l00141"></a>00141 <span class="comment">    &lt;em&gt;doc_id&lt;/em&gt; INTEGER,</span>
 <a name="l00142"></a>00142 <span class="comment">    &lt;em&gt;f_size&lt;/em&gt; INTEGER,</span>
 <a name="l00143"></a>00143 <span class="comment">    &lt;em&gt;sparse_r&lt;/em&gt; FLOAT8[],</span>
 <a name="l00144"></a>00144 <span class="comment">    &lt;em&gt;dense_m&lt;/em&gt; FLOAT8[],</span>
 <a name="l00145"></a>00145 <span class="comment">    &lt;em&gt;sparse_m&lt;/em&gt; FLOAT8[],</span>
 <a name="l00146"></a>00146 <span class="comment">    ...</span>
 <a name="l00147"></a>00147 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00148"></a>00148 <span class="comment">where </span>
 <a name="l00149"></a>00149 <span class="comment">  - &lt;em&gt;doc_id&lt;/em&gt; is a unique ID for the sequence</span>
 <a name="l00150"></a>00150 <span class="comment">  - &lt;em&gt;f_size&lt;/em&gt; is the number of features</span>
 <a name="l00151"></a>00151 <span class="comment">  - &lt;em&gt;sparse_r&lt;/em&gt; is the array union of (previous label, label, feature index, start position, training existence indicator) of individual single-state features (e.g. word features, regex features) ordered by their start position</span>
 <a name="l00152"></a>00152 <span class="comment">  - &lt;em&gt;dense_m&lt;/em&gt; is the array union of (previous label, label, feature index, start position, training existence indicator) of edge features ordered by start position</span>
 <a name="l00153"></a>00153 <span class="comment">  - &lt;em&gt;sparse_m&lt;/em&gt; is the array union of (feature index, previous label, label) of edge features ordered by feature index.  </span>
 <a name="l00154"></a>00154 <span class="comment">Edge features were split into dense_m and sparse_m for performance reasons.</span>
 <a name="l00155"></a>00155 <span class="comment"></span>
 <a name="l00156"></a>00156 <span class="comment">The set of features used for training is expected to be of the following form (also can be generated by \ref crf_train_fgen):\n</span>
 <a name="l00157"></a>00157 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;featureSetName&lt;/em&gt; (</span>
 <a name="l00158"></a>00158 <span class="comment">    ...</span>
 <a name="l00159"></a>00159 <span class="comment">    &lt;em&gt;f_index&lt;/em&gt; INTEGER,</span>
 <a name="l00160"></a>00160 <span class="comment">    &lt;em&gt;f_name&lt;/em&gt; TEXT,</span>
 <a name="l00161"></a>00161 <span class="comment">    &lt;em&gt;feature_labels&lt;/em&gt; INTEGER[],</span>
 <a name="l00162"></a>00162 <span class="comment">    ...</span>
 <a name="l00163"></a>00163 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00164"></a>00164 <span class="comment">where </span>
 <a name="l00165"></a>00165 <span class="comment">  - &lt;em&gt;f_index&lt;/em&gt; is a unique ID for the feature</span>
 <a name="l00166"></a>00166 <span class="comment">  - &lt;em&gt;f_name&lt;/em&gt; is the feature name</span>
 <a name="l00167"></a>00167 <span class="comment">  - &lt;em&gt;feature_labels&lt;/em&gt; is an array representing {previous label, label}.</span>
 <a name="l00168"></a>00168 <span class="comment"></span>
 <a name="l00169"></a>00169 <span class="comment">The empty feature weight table (which will be populated after training) is expected to be of the following form:</span>
 <a name="l00170"></a>00170 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;featureWeightsName&lt;/em&gt; (</span>
 <a name="l00171"></a>00171 <span class="comment">    ...</span>
 <a name="l00172"></a>00172 <span class="comment">    &lt;em&gt;f_index&lt;/em&gt; INTEGER,</span>
 <a name="l00173"></a>00173 <span class="comment">    &lt;em&gt;f_name&lt;/em&gt; TEXT,</span>
 <a name="l00174"></a>00174 <span class="comment">    &lt;em&gt;previous_label&lt;/em&gt; INTEGER,</span>
 <a name="l00175"></a>00175 <span class="comment">    &lt;em&gt;label&lt;/em&gt; INTEGER,</span>
 <a name="l00176"></a>00176 <span class="comment">    &lt;em&gt;weight&lt;/em&gt; FLOAT8,</span>
 <a name="l00177"></a>00177 <span class="comment">    ...</span>
 <a name="l00178"></a>00178 <span class="comment">)&lt;/pre&gt;</span>
 <a name="l00179"></a>00179 <span class="comment"></span>
 <a name="l00180"></a>00180 <span class="comment">@usage</span>
 <a name="l00181"></a>00181 <span class="comment">- Get number of iterations and weights for features:\n</span>
 <a name="l00182"></a>00182 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref lincrf(</span>
 <a name="l00183"></a>00183 <span class="comment">    &#39;&lt;em&gt;featureTableName&lt;/em&gt;&#39;, &#39;&lt;em&gt;sparse_r&lt;/em&gt;&#39;, &#39;&lt;em&gt;dense_m&lt;/em&gt;&#39;,&#39;&lt;em&gt;sparse_m&lt;/em&gt;&#39;, &#39;&lt;em&gt;f_size&lt;/em&gt;&#39;, &lt;em&gt;tag_size&lt;/em&gt;, &#39;&lt;em&gt;feature_set&lt;/em&gt;&#39;, &#39;&lt;em&gt;featureWeightsName&lt;/em&gt;&#39;</span>
 <a name="l00184"></a>00184 <span class="comment">    [, &lt;em&gt;maxNumberOfIterations&lt;/em&gt; ]</span>
 <a name="l00185"></a>00185 <span class="comment">);&lt;/pre&gt;</span>
 <a name="l00186"></a>00186 <span class="comment">  where tag_size is the total number of labels.</span>
 <a name="l00187"></a>00187 <span class="comment"></span>
 <a name="l00188"></a>00188 <span class="comment">  Output:</span>
 <a name="l00189"></a>00189 <span class="comment">&lt;pre&gt; lincrf</span>
 <a name="l00190"></a>00190 <span class="comment">-----------------</span>
 <a name="l00191"></a>00191 <span class="comment"> [number of iterations]&lt;/pre&gt;</span>
 <a name="l00192"></a>00192 <span class="comment"></span>
 <a name="l00193"></a>00193 <span class="comment">  &lt;em&gt;featureWeightsName&lt;/em&gt;:</span>
 <a name="l00194"></a>00194 <span class="comment">&lt;pre&gt; id |      name      | prev_label_id | label_id |      weight       </span>
 <a name="l00195"></a>00195 <span class="comment">----+----------------+---------------+----------+-------------------</span>
 <a name="l00196"></a>00196 <span class="comment">&lt;/pre&gt;</span>
 <a name="l00197"></a>00197 <span class="comment"></span>
 <a name="l00198"></a>00198 <span class="comment">- Generate text features, calculate their weights, and output the best label sequence for test data:\n</span>
 <a name="l00199"></a>00199 <span class="comment"> -# Create tables to store the input data, intermediate data, and output data.</span>
 <a name="l00200"></a>00200 <span class="comment">    Also import the training data to the database.</span>
 <a name="l00201"></a>00201 <span class="comment">    &lt;pre&gt;SELECT madlib.crf_train_data(</span>
 <a name="l00202"></a>00202 <span class="comment">         &#39;&lt;em&gt;/path/to/data&lt;/em&gt;&#39;);&lt;/pre&gt; </span>
 <a name="l00203"></a>00203 <span class="comment"> -# Generate text analytics features for the training data.</span>
 <a name="l00204"></a>00204 <span class="comment">    &lt;pre&gt;SELECT madlib.crf_train_fgen(</span>
 <a name="l00205"></a>00205 <span class="comment">         &#39;&lt;em&gt;segmenttbl&lt;/em&gt;&#39;,</span>
 <a name="l00206"></a>00206 <span class="comment">         &#39;&lt;em&gt;regextbl&lt;/em&gt;&#39;,</span>
 <a name="l00207"></a>00207 <span class="comment">         &#39;&lt;em&gt;dictionary&lt;/em&gt;&#39;,</span>
 <a name="l00208"></a>00208 <span class="comment">         &#39;&lt;em&gt;featuretbl&lt;/em&gt;&#39;,</span>
 <a name="l00209"></a>00209 <span class="comment">         &#39;&lt;em&gt;featureset&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
 <a name="l00210"></a>00210 <span class="comment"> -# Use linear-chain CRF for training.</span>
 <a name="l00211"></a>00211 <span class="comment">    &lt;pre&gt;SELECT madlib.lincrf(</span>
 <a name="l00212"></a>00212 <span class="comment">         &#39;&lt;em&gt;source&lt;/em&gt;&#39;,</span>
 <a name="l00213"></a>00213 <span class="comment">         &#39;&lt;em&gt;sparse_r&lt;/em&gt;&#39;,</span>
 <a name="l00214"></a>00214 <span class="comment">         &#39;&lt;em&gt;dense_m&lt;/em&gt;&#39;,</span>
 <a name="l00215"></a>00215 <span class="comment">         &#39;&lt;em&gt;sparse_m&lt;/em&gt;&#39;,</span>
 <a name="l00216"></a>00216 <span class="comment">         &#39;&lt;em&gt;f_size&lt;/em&gt;&#39;,</span>
 <a name="l00217"></a>00217 <span class="comment">         &lt;em&gt;tag_size&lt;/em&gt;,</span>
 <a name="l00218"></a>00218 <span class="comment">         &#39;&lt;em&gt;feature_set&lt;/em&gt;&#39;,</span>
 <a name="l00219"></a>00219 <span class="comment">         &#39;&lt;em&gt;featureWeights&lt;/em&gt;&#39;,</span>
 <a name="l00220"></a>00220 <span class="comment">         &lt;em&gt;maxNumIterations&lt;/em&gt;);&lt;/pre&gt;</span>
 <a name="l00221"></a>00221 <span class="comment"> -# Import CRF model to the database.</span>
 <a name="l00222"></a>00222 <span class="comment">    Also load the CRF testing data to the database.</span>
 <a name="l00223"></a>00223 <span class="comment">    &lt;pre&gt;SELECT madlib.crf_test_data(</span>
 <a name="l00224"></a>00224 <span class="comment">         &#39;&lt;em&gt;/path/to/data&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
 <a name="l00225"></a>00225 <span class="comment"> -# Generate text analytics features for the testing data.</span>
 <a name="l00226"></a>00226 <span class="comment">    &lt;pre&gt;SELECT madlib.crf_test_fgen(</span>
 <a name="l00227"></a>00227 <span class="comment">         &#39;&lt;em&gt;segmenttbl&lt;/em&gt;&#39;,</span>
 <a name="l00228"></a>00228 <span class="comment">         &#39;&lt;em&gt;dictionary&lt;/em&gt;&#39;,</span>
 <a name="l00229"></a>00229 <span class="comment">         &#39;&lt;em&gt;labeltbl&lt;/em&gt;&#39;,</span>
 <a name="l00230"></a>00230 <span class="comment">         &#39;&lt;em&gt;regextbl&lt;/em&gt;&#39;,</span>
 <a name="l00231"></a>00231 <span class="comment">         &#39;&lt;em&gt;featuretbl&lt;/em&gt;&#39;,</span>
 <a name="l00232"></a>00232 <span class="comment">         &#39;&lt;em&gt;viterbi_mtbl&lt;/em&gt;&#39;,</span>
 <a name="l00233"></a>00233 <span class="comment">         &#39;&lt;em&gt;viterbi_rtbl&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
 <a name="l00234"></a>00234 <span class="comment">    &#39;viterbi_mtbl&#39; and &#39;viterbi_rtbl&#39; are simply text representing names for tables created in the feature generation module (i.e. they are NOT empty tables).</span>
 <a name="l00235"></a>00235 <span class="comment"> -# Run the Viterbi function to get the best label sequence and the conditional</span>
 <a name="l00236"></a>00236 <span class="comment">    probability \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.</span>
 <a name="l00237"></a>00237 <span class="comment">    &lt;pre&gt;SELECT madlib.vcrf_label(</span>
 <a name="l00238"></a>00238 <span class="comment">         &#39;&lt;em&gt;segmenttbl&lt;/em&gt;&#39;,</span>
 <a name="l00239"></a>00239 <span class="comment">         &#39;&lt;em&gt;viterbi_mtbl&lt;/em&gt;&#39;,</span>
 <a name="l00240"></a>00240 <span class="comment">         &#39;&lt;em&gt;viterbi_rtbl&lt;/em&gt;&#39;,</span>
 <a name="l00241"></a>00241 <span class="comment">         &#39;&lt;em&gt;labeltbl&lt;/em&gt;&#39;,</span>
 <a name="l00242"></a>00242 <span class="comment">         &#39;&lt;em&gt;resulttbl&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
 <a name="l00243"></a>00243 <span class="comment"></span>
 <a name="l00244"></a>00244 <span class="comment">@examp</span>
 <a name="l00245"></a>00245 <span class="comment">-# Load the label table, the regular expressions table, and the training segment table:</span>
 <a name="l00246"></a>00246 <span class="comment">@verbatim </span>
 <a name="l00247"></a>00247 <span class="comment">sql&gt; SELECT * FROM crf_label;</span>
 <a name="l00248"></a>00248 <span class="comment"> id | label </span>
 <a name="l00249"></a>00249 <span class="comment">----+-------</span>
 <a name="l00250"></a>00250 <span class="comment">  1 | CD</span>
 <a name="l00251"></a>00251 <span class="comment"> 13 | NNP</span>
 <a name="l00252"></a>00252 <span class="comment"> 15 | PDT</span>
 <a name="l00253"></a>00253 <span class="comment"> 17 | PRP</span>
 <a name="l00254"></a>00254 <span class="comment"> 29 | VBN</span>
 <a name="l00255"></a>00255 <span class="comment"> 31 | VBZ</span>
 <a name="l00256"></a>00256 <span class="comment"> 33 | WP</span>
 <a name="l00257"></a>00257 <span class="comment"> 35 | WRB</span>
 <a name="l00258"></a>00258 <span class="comment">...</span>
 <a name="l00259"></a>00259 <span class="comment"></span>
 <a name="l00260"></a>00260 <span class="comment">sql&gt; SELECT * from crf_regex;</span>
 <a name="l00261"></a>00261 <span class="comment">    pattern    |         name         </span>
 <a name="l00262"></a>00262 <span class="comment">---------------+----------------------</span>
 <a name="l00263"></a>00263 <span class="comment"> ^.+ing$       | endsWithIng</span>
 <a name="l00264"></a>00264 <span class="comment"> ^[A-Z][a-z]+$ | InitCapital</span>
 <a name="l00265"></a>00265 <span class="comment"> ^[A-Z]+$      | isAllCapital</span>
 <a name="l00266"></a>00266 <span class="comment"> ^.*[0-9]+.*$  | containsDigit</span>
 <a name="l00267"></a>00267 <span class="comment">...</span>
 <a name="l00268"></a>00268 <span class="comment"></span>
 <a name="l00269"></a>00269 <span class="comment">sql&gt; SELECT * from train_segmenttbl;</span>
 <a name="l00270"></a>00270 <span class="comment"> start_pos | doc_id |  seg_text  | label | max_pos</span>
 <a name="l00271"></a>00271 <span class="comment">-----------+--------+------------+-------+---------</span>
 <a name="l00272"></a>00272 <span class="comment">         8 |      1 | alliance   |    11 |      26</span>
 <a name="l00273"></a>00273 <span class="comment">        10 |      1 | Ford       |    13 |      26</span>
 <a name="l00274"></a>00274 <span class="comment">        12 |      1 | that       |     5 |      26</span>
 <a name="l00275"></a>00275 <span class="comment">        24 |      1 | likely     |     6 |      26</span>
 <a name="l00276"></a>00276 <span class="comment">        26 |      1 | .          |    43 |      26</span>
 <a name="l00277"></a>00277 <span class="comment">         8 |      2 | interest   |    11 |      10</span>
 <a name="l00278"></a>00278 <span class="comment">        10 |      2 | .          |    43 |      10</span>
 <a name="l00279"></a>00279 <span class="comment">         9 |      1 | after      |     5 |      26</span>
 <a name="l00280"></a>00280 <span class="comment">        11 |      1 | concluded  |    27 |      26</span>
 <a name="l00281"></a>00281 <span class="comment">        23 |      1 | the        |     2 |      26</span>
 <a name="l00282"></a>00282 <span class="comment">        25 |      1 | return     |    11 |      26</span>
 <a name="l00283"></a>00283 <span class="comment">         9 |      2 | later      |    19 |      10</span>
 <a name="l00284"></a>00284 <span class="comment">...</span>
 <a name="l00285"></a>00285 <span class="comment">@endverbatim</span>
 <a name="l00286"></a>00286 <span class="comment">-# Create the (empty) dictionary table, feature table, and feature set:</span>
 <a name="l00287"></a>00287 <span class="comment">@verbatim</span>
 <a name="l00288"></a>00288 <span class="comment">sql&gt; CREATE TABLE crf_dictionary(token text,total integer);</span>
 <a name="l00289"></a>00289 <span class="comment">sql&gt; CREATE TABLE train_featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);</span>
 <a name="l00290"></a>00290 <span class="comment">sql&gt; CREATE TABLE train_featureset(f_index integer, f_name text, feature integer[]);</span>
 <a name="l00291"></a>00291 <span class="comment">@endverbatim</span>
 <a name="l00292"></a>00292 <span class="comment">-# Generate the training features:</span>
 <a name="l00293"></a>00293 <span class="comment">@verbatim</span>
 <a name="l00294"></a>00294 <span class="comment">sql&gt; SELECT crf_train_fgen(&#39;train_segmenttbl&#39;, &#39;crf_regex&#39;, &#39;crf_dictionary&#39;, &#39;train_featuretbl&#39;,&#39;train_featureset&#39;);</span>
 <a name="l00295"></a>00295 <span class="comment"></span>
 <a name="l00296"></a>00296 <span class="comment">sql&gt; SELECT * from crf_dictionary;</span>
 <a name="l00297"></a>00297 <span class="comment">   token    | total </span>
 <a name="l00298"></a>00298 <span class="comment">------------+-------</span>
 <a name="l00299"></a>00299 <span class="comment"> talks      |     1</span>
 <a name="l00300"></a>00300 <span class="comment"> that       |     1</span>
 <a name="l00301"></a>00301 <span class="comment"> would      |     1</span>
 <a name="l00302"></a>00302 <span class="comment"> alliance   |     1</span>
 <a name="l00303"></a>00303 <span class="comment"> Saab       |     2</span>
 <a name="l00304"></a>00304 <span class="comment"> cost       |     1</span>
 <a name="l00305"></a>00305 <span class="comment"> after      |     1</span>
 <a name="l00306"></a>00306 <span class="comment"> operations |     1</span>
 <a name="l00307"></a>00307 <span class="comment">...</span>
 <a name="l00308"></a>00308 <span class="comment"></span>
 <a name="l00309"></a>00309 <span class="comment">sql&gt; SELECT * from train_featuretbl;</span>
 <a name="l00310"></a>00310 <span class="comment"> doc_id | f_size |            sparse_r           |             dense_m             |       sparse_m</span>
 <a name="l00311"></a>00311 <span class="comment">--------+--------+-------------------------------+---------------------------------+-----------------------</span>
 <a name="l00312"></a>00312 <span class="comment">      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}</span>
 <a name="l00313"></a>00313 <span class="comment">      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}</span>
 <a name="l00314"></a>00314 <span class="comment"></span>
 <a name="l00315"></a>00315 <span class="comment">sql&gt; SELECT * from train_featureset;</span>
 <a name="l00316"></a>00316 <span class="comment"> f_index |    f_name     | feature </span>
 <a name="l00317"></a>00317 <span class="comment">---------+---------------+---------</span>
 <a name="l00318"></a>00318 <span class="comment">       1 | R_endsWithED  | {-1,29}</span>
 <a name="l00319"></a>00319 <span class="comment">      13 | W_outweigh    | {-1,26}</span>
 <a name="l00320"></a>00320 <span class="comment">      29 | U             | {-1,5}</span>
 <a name="l00321"></a>00321 <span class="comment">      31 | U             | {-1,29}</span>
 <a name="l00322"></a>00322 <span class="comment">      33 | U             | {-1,12}</span>
 <a name="l00323"></a>00323 <span class="comment">      35 | W_a           | {-1,2}</span>
 <a name="l00324"></a>00324 <span class="comment">      37 | W_possible    | {-1,6}</span>
 <a name="l00325"></a>00325 <span class="comment">      15 | W_signaled    | {-1,29}</span>
 <a name="l00326"></a>00326 <span class="comment">      17 | End.          | {-1,43}</span>
 <a name="l00327"></a>00327 <span class="comment">      49 | W_&#39;s          | {-1,16}</span>
 <a name="l00328"></a>00328 <span class="comment">      63 | W_acquire     | {-1,26}</span>
 <a name="l00329"></a>00329 <span class="comment">      51 | E.            | {26,2}</span>
 <a name="l00330"></a>00330 <span class="comment">      69 | E.            | {29,17}</span>
 <a name="l00331"></a>00331 <span class="comment">      71 | E.            | {2,11}</span>
 <a name="l00332"></a>00332 <span class="comment">      83 | W_the         | {-1,2}</span>
 <a name="l00333"></a>00333 <span class="comment">      85 | E.            | {16,11}</span>
 <a name="l00334"></a>00334 <span class="comment">       4 | W_return      | {-1,11}</span>
 <a name="l00335"></a>00335 <span class="comment">...</span>
 <a name="l00336"></a>00336 <span class="comment"></span>
 <a name="l00337"></a>00337 <span class="comment">@endverbatim</span>
 <a name="l00338"></a>00338 <span class="comment">-# Create the (empty) feature weight table:</span>
 <a name="l00339"></a>00339 <span class="comment">@verbatim</span>
 <a name="l00340"></a>00340 <span class="comment">sql&gt; CREATE TABLE train_crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);</span>
 <a name="l00341"></a>00341 <span class="comment">@endverbatim</span>
 <a name="l00342"></a>00342 <span class="comment">-# Train using linear CRF:</span>
 <a name="l00343"></a>00343 <span class="comment">@verbatim</span>
 <a name="l00344"></a>00344 <span class="comment">sql&gt; SELECT lincrf(&#39;train_featuretbl&#39;,&#39;sparse_r&#39;,&#39;dense_m&#39;,&#39;sparse_m&#39;,&#39;f_size&#39;,45, &#39;train_featureset&#39;,&#39;train_crf_feature&#39;, 20);</span>
 <a name="l00345"></a>00345 <span class="comment"> lincrf </span>
 <a name="l00346"></a>00346 <span class="comment">--------</span>
 <a name="l00347"></a>00347 <span class="comment">     20</span>
 <a name="l00348"></a>00348 <span class="comment"></span>
 <a name="l00349"></a>00349 <span class="comment">sql&gt; SELECT * from train_crf_feature;</span>
 <a name="l00350"></a>00350 <span class="comment"> id |     name      | prev_label_id | label_id |      weight       </span>
 <a name="l00351"></a>00351 <span class="comment">----+---------------+---------------+----------+-------------------</span>
 <a name="l00352"></a>00352 <span class="comment">  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937</span>
 <a name="l00353"></a>00353 <span class="comment"> 13 | W_outweigh    |            -1 |       26 |  1.70691232223653</span>
 <a name="l00354"></a>00354 <span class="comment"> 29 | U             |            -1 |        5 |  1.40708515869008</span>
 <a name="l00355"></a>00355 <span class="comment"> 31 | U             |            -1 |       29 | 0.830356200936407</span>
 <a name="l00356"></a>00356 <span class="comment"> 33 | U             |            -1 |       12 | 0.769587378281239</span>
 <a name="l00357"></a>00357 <span class="comment"> 35 | W_a           |            -1 |        2 |  2.68470625883726</span>
 <a name="l00358"></a>00358 <span class="comment"> 37 | W_possible    |            -1 |        6 |  3.41773107604468</span>
 <a name="l00359"></a>00359 <span class="comment"> 15 | W_signaled    |            -1 |       29 |  1.68187039165771</span>
 <a name="l00360"></a>00360 <span class="comment"> 17 | End.          |            -1 |       43 |  3.07687845517082</span>
 <a name="l00361"></a>00361 <span class="comment"> 49 | W_&#39;s          |            -1 |       16 |  2.61430312229883</span>
 <a name="l00362"></a>00362 <span class="comment"> 63 | W_acquire     |            -1 |       26 |  1.67247047385797</span>
 <a name="l00363"></a>00363 <span class="comment"> 51 | E.            |            26 |        2 |   3.0114240119435</span>
 <a name="l00364"></a>00364 <span class="comment"> 69 | E.            |            29 |       17 |  2.82385531733866</span>
 <a name="l00365"></a>00365 <span class="comment"> 71 | E.            |             2 |       11 |  3.00970493772732</span>
 <a name="l00366"></a>00366 <span class="comment"> 83 | W_the         |            -1 |        2 |  2.58742315259326</span>
 <a name="l00367"></a>00367 <span class="comment">...</span>
 <a name="l00368"></a>00368 <span class="comment"></span>
 <a name="l00369"></a>00369 <span class="comment">@endverbatim</span>
 <a name="l00370"></a>00370 <span class="comment">-# To find the best labels for a test set using the trained linear CRF model, repeat steps #1-2 and generate the test features, except instead of creating a new dictionary, use the dictionary generated from the training set.</span>
 <a name="l00371"></a>00371 <span class="comment">@verbatim</span>
 <a name="l00372"></a>00372 <span class="comment">sql&gt; SELECT * from test_segmenttbl;</span>
 <a name="l00373"></a>00373 <span class="comment"> start_pos | doc_id |  seg_text   | max_pos </span>
 <a name="l00374"></a>00374 <span class="comment">-----------+--------+-------------+---------</span>
 <a name="l00375"></a>00375 <span class="comment">         1 |      1 | collapse    |      22</span>
 <a name="l00376"></a>00376 <span class="comment">        13 |      1 | ,           |      22</span>
 <a name="l00377"></a>00377 <span class="comment">        15 |      1 | is          |      22</span>
 <a name="l00378"></a>00378 <span class="comment">        17 |      1 | a           |      22</span>
 <a name="l00379"></a>00379 <span class="comment">         4 |      1 | speculation |      22</span>
 <a name="l00380"></a>00380 <span class="comment">         6 |      1 | Ford        |      22</span>
 <a name="l00381"></a>00381 <span class="comment">        18 |      1 | defensive   |      22</span>
 <a name="l00382"></a>00382 <span class="comment">        20 |      1 | with        |      22</span>
 <a name="l00383"></a>00383 <span class="comment">...</span>
 <a name="l00384"></a>00384 <span class="comment"></span>
 <a name="l00385"></a>00385 <span class="comment">sql&gt; SELECT crf_test_fgen(&#39;test_segmenttbl&#39;,&#39;crf_dictionary&#39;,&#39;crf_label&#39;,&#39;crf_regex&#39;,&#39;train_crf_feature&#39;,&#39;viterbi_mtbl&#39;,&#39;viterbi_rtbl&#39;);</span>
 <a name="l00386"></a>00386 <span class="comment">@endverbatim</span>
 <a name="l00387"></a>00387 <span class="comment">-# Calculate the best label sequence:</span>
 <a name="l00388"></a>00388 <span class="comment">@verbatim</span>
 <a name="l00389"></a>00389 <span class="comment">sql&gt; SELECT vcrf_label(&#39;test_segmenttbl&#39;,&#39;viterbi_mtbl&#39;,&#39;viterbi_rtbl&#39;,&#39;crf_label&#39;,&#39;extracted_best_labels&#39;);</span>
 <a name="l00390"></a>00390 <span class="comment"></span>
 <a name="l00391"></a>00391 <span class="comment">sql&gt; SELECT * FROM extracted_best_labels;</span>
 <a name="l00392"></a>00392 <span class="comment"> doc_id | start_pos |  seg_text   | label | id | prob  </span>
 <a name="l00393"></a>00393 <span class="comment">--------+-----------+-------------+-------+----+-------</span>
 <a name="l00394"></a>00394 <span class="comment">      1 |         2 | Friday      | NNP   | 14 | 9e-06</span>
 <a name="l00395"></a>00395 <span class="comment">      1 |         6 | Ford        | NNP   | 14 | 9e-06</span>
 <a name="l00396"></a>00396 <span class="comment">      1 |        12 | Jaguar      | NNP   | 14 | 9e-06</span>
 <a name="l00397"></a>00397 <span class="comment">      1 |         3 | prompted    | VBD   | 28 | 9e-06</span>
 <a name="l00398"></a>00398 <span class="comment">      1 |         8 | intensify   | NN    | 12 | 9e-06</span>
 <a name="l00399"></a>00399 <span class="comment">      1 |        14 | which       | NN    | 12 | 9e-06</span>
 <a name="l00400"></a>00400 <span class="comment">      1 |        18 | defensive   | NN    | 12 | 9e-06</span>
 <a name="l00401"></a>00401 <span class="comment">      1 |        21 | GM          | NN    | 12 | 9e-06</span>
 <a name="l00402"></a>00402 <span class="comment">      1 |        22 | .           | .     | 44 | 9e-06</span>
 <a name="l00403"></a>00403 <span class="comment">      1 |         1 | collapse    | CC    |  1 | 9e-06</span>
 <a name="l00404"></a>00404 <span class="comment">      1 |         7 | would       | POS   | 17 | 9e-06</span>
 <a name="l00405"></a>00405 <span class="comment">...</span>
 <a name="l00406"></a>00406 <span class="comment">@endverbatim</span>
 <a name="l00407"></a>00407 <span class="comment">(Note that this example was done on a trivial training and test data set.)</span>
 <a name="l00408"></a>00408 <span class="comment"></span>
 <a name="l00409"></a>00409 <span class="comment">@literature</span>
 <a name="l00410"></a>00410 <span class="comment">[1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf</span>
 <a name="l00411"></a>00411 <span class="comment"></span>
 <a name="l00412"></a>00412 <span class="comment">[2] Wikipedia, Conditional Random Field, http://en.wikipedia.org/wiki/Conditional_random_field</span>
 <a name="l00413"></a>00413 <span class="comment"></span>
 <a name="l00414"></a>00414 <span class="comment">[3] A. Jaiswal, S. Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, http://crf.sourceforge.net/</span>
 <a name="l00415"></a>00415 <span class="comment"></span>
 <a name="l00416"></a>00416 <span class="comment">[4] D. Wang, ViterbiCRF, http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html</span>
 <a name="l00417"></a>00417 <span class="comment"></span>
 <a name="l00418"></a>00418 <span class="comment">[5] Wikipedia, Viterbi Algorithm, http://en.wikipedia.org/wiki/Viterbi_algorithm</span>
 <a name="l00419"></a>00419 <span class="comment"></span>
 <a name="l00420"></a>00420 <span class="comment">[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782</span>
 <a name="l00421"></a>00421 <span class="comment"></span>
 <a name="l00422"></a>00422 <span class="comment">[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, http://users.eecs.northwestern.edu/~nocedal/lbfgs.html</span>
 <a name="l00423"></a>00423 <span class="comment"></span>
 <a name="l00424"></a>00424 <span class="comment">@sa File crf.sql_in crf_feature_gen.sql_in viterbi.sql_in (documenting the SQL functions)</span>
 <a name="l00425"></a>00425 <span class="comment"></span>
 <a name="l00426"></a>00426 <span class="comment">*/</span>
 <a name="l00427"></a>00427
 <a name="l00428"></a>00428 DROP TYPE IF EXISTS MADLIB_SCHEMA.lincrf_result;
 <a name="l00429"></a>00429 CREATE TYPE MADLIB_SCHEMA.lincrf_result AS (
 <a name="l00430"></a>00430     coef DOUBLE PRECISION[],
 <a name="l00431"></a>00431     log_likelihood DOUBLE PRECISION,
 <a name="l00432"></a>00432     num_iterations INTEGER
 <a name="l00433"></a>00433 );
 <a name="l00434"></a>00434
 <a name="l00435"></a>00435 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_transition(
 <a name="l00436"></a>00436     DOUBLE PRECISION[],
 <a name="l00437"></a>00437     DOUBLE PRECISION[],
 <a name="l00438"></a>00438     DOUBLE PRECISION[],
 <a name="l00439"></a>00439     DOUBLE PRECISION[],
 <a name="l00440"></a>00440     DOUBLE PRECISION,
 <a name="l00441"></a>00441     DOUBLE PRECISION,
 <a name="l00442"></a>00442     DOUBLE PRECISION[])
 <a name="l00443"></a>00443 RETURNS DOUBLE PRECISION[]
 <a name="l00444"></a>00444 AS &#39;MODULE_PATHNAME<span class="stringliteral">&#39;</span>
 <a name="l00445"></a>00445 <span class="stringliteral">LANGUAGE C IMMUTABLE;</span>
 <a name="l00446"></a>00446 <span class="stringliteral"></span>
 <a name="l00447"></a>00447 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states(</span>
 <a name="l00448"></a>00448 <span class="stringliteral">    state1 DOUBLE PRECISION[],</span>
 <a name="l00449"></a>00449 <span class="stringliteral">    state2 DOUBLE PRECISION[])</span>
 <a name="l00450"></a>00450 <span class="stringliteral">RETURNS DOUBLE PRECISION[]</span>
 <a name="l00451"></a>00451 <span class="stringliteral">AS &#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;</span>
 <a name="l00452"></a>00452 <span class="stringliteral">LANGUAGE C IMMUTABLE STRICT;</span>
 <a name="l00453"></a>00453 <span class="stringliteral"></span>
 <a name="l00454"></a>00454 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_final(</span>
 <a name="l00455"></a>00455 <span class="stringliteral">    state DOUBLE PRECISION[])</span>
 <a name="l00456"></a>00456 <span class="stringliteral">RETURNS DOUBLE PRECISION[]</span>
 <a name="l00457"></a>00457 <span class="stringliteral">AS &#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;</span>
 <a name="l00458"></a>00458 <span class="stringliteral">LANGUAGE C IMMUTABLE STRICT;</span>
 <a name="l00459"></a>00459 <span class="stringliteral"></span>
 <a name="l00460"></a>00460 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_converge(</span>
 <a name="l00461"></a>00461 <span class="stringliteral">    /*+ state */ DOUBLE PRECISION[])</span>
 <a name="l00462"></a>00462 <span class="stringliteral">RETURNS DOUBLE PRECISION AS</span>
 <a name="l00463"></a>00463 <span class="stringliteral">&#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;</span>
 <a name="l00464"></a>00464 <span class="stringliteral">LANGUAGE c IMMUTABLE STRICT;</span>
 <a name="l00465"></a>00465 <span class="stringliteral"></span>
 <a name="l00466"></a>00466 <span class="stringliteral"></span>
 <a name="l00467"></a>00467 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_result(</span>
 <a name="l00468"></a>00468 <span class="stringliteral">    /*+ state */ DOUBLE PRECISION[])</span>
 <a name="l00469"></a>00469 <span class="stringliteral">RETURNS MADLIB_SCHEMA.lincrf_result AS</span>
 <a name="l00470"></a>00470 <span class="stringliteral">&#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;</span>
 <a name="l00471"></a>00471 <span class="stringliteral">LANGUAGE c IMMUTABLE STRICT;</span>
 <a name="l00472"></a>00472 <span class="stringliteral"></span><span class="comment"></span>
 <a name="l00473"></a>00473 <span class="comment">/**</span>
 <a name="l00474"></a>00474 <span class="comment"> * @internal</span>
 <a name="l00475"></a>00475 <span class="comment"> * @brief Perform one iteration of the L-BFGS method for fitting a</span>
 <a name="l00476"></a>00476 <span class="comment"> * linear-chain conditional random field</span>
 <a name="l00477"></a>00477 <span class="comment"> */</span>
 <a name="l00478"></a>00478 CREATE AGGREGATE MADLIB_SCHEMA.lincrf_lbfgs_step(
 <a name="l00479"></a>00479     /* sparse_r columns */ DOUBLE PRECISION[],
 <a name="l00480"></a>00480     /* dense_m columns */ DOUBLE PRECISION[],
 <a name="l00481"></a>00481     /* sparse_m columns */ DOUBLE PRECISION[],
 <a name="l00482"></a>00482     /* feature size */ DOUBLE PRECISION,
 <a name="l00483"></a>00483     /* tag size */ DOUBLE PRECISION,
 <a name="l00484"></a>00484     /* previous_state */ DOUBLE PRECISION[]) (
 <a name="l00485"></a>00485
 <a name="l00486"></a>00486     STYPE=DOUBLE PRECISION[],
 <a name="l00487"></a>00487     SFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_transition,
 <a name="l00488"></a>00488     m4_ifdef(`__GREENPLUM__&#39;,`prefunc=MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states,<span class="stringliteral">&#39;)</span>
 <a name="l00489"></a>00489 <span class="stringliteral">    FINALFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_final,</span>
 <a name="l00490"></a>00490 <span class="stringliteral">    INITCOND=&#39;</span>{0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}<span class="stringliteral">&#39;</span>
 <a name="l00491"></a>00491 <span class="stringliteral">);</span>
 <a name="l00492"></a>00492 <span class="stringliteral"></span>
 <a name="l00493"></a>00493 <span class="stringliteral">m4_changequote(&lt;!,!&gt;)</span>
 <a name="l00494"></a>00494 <span class="stringliteral">m4_ifdef(&lt;!__HAS_ORDERED_AGGREGATES__!&gt;,&lt;!</span>
 <a name="l00495"></a>00495 <span class="stringliteral">CREATE</span>
 <a name="l00496"></a>00496 <span class="stringliteral">m4_ifdef(&lt;!__GREENPLUM__!&gt;,&lt;!ORDERED!&gt;)</span>
 <a name="l00497"></a>00497 <span class="stringliteral">AGGREGATE MADLIB_SCHEMA.array_union(anyarray) (</span>
 <a name="l00498"></a>00498 <span class="stringliteral">    SFUNC = array_cat, </span>
 <a name="l00499"></a>00499 <span class="stringliteral">    STYPE = anyarray</span>
 <a name="l00500"></a>00500 <span class="stringliteral">); </span>
 <a name="l00501"></a>00501 <span class="stringliteral">!&gt;)</span>
 <a name="l00502"></a>00502 <span class="stringliteral">m4_changequote(`,&#39;</span>)
 <a name="l00503"></a>00503
 <a name="l00504"></a>00504 -- We only need to document the last one (unfortunately, in Greenplum we have to
 <a name="l00505"></a>00505 -- use <span class="keyword">function</span> overloading instead of <span class="keywordflow">default</span> arguments).
 <a name="l00506"></a>00506 CREATE FUNCTION MADLIB_SCHEMA.compute_lincrf(
 <a name="l00507"></a>00507     <span class="stringliteral">&quot;source&quot;</span> VARCHAR,
 <a name="l00508"></a>00508     <span class="stringliteral">&quot;sparse_R&quot;</span> VARCHAR,
 <a name="l00509"></a>00509     <span class="stringliteral">&quot;dense_M&quot;</span> VARCHAR,
 <a name="l00510"></a>00510     <span class="stringliteral">&quot;sparse_M&quot;</span> VARCHAR,
 <a name="l00511"></a>00511     <span class="stringliteral">&quot;featureSize&quot;</span> VARCHAR,
 <a name="l00512"></a>00512     <span class="stringliteral">&quot;tagSize&quot;</span> INTEGER,
 <a name="l00513"></a>00513     <span class="stringliteral">&quot;maxNumIterations&quot;</span> INTEGER)
 <a name="l00514"></a>00514 RETURNS INTEGER
 <a name="l00515"></a>00515 AS $$PythonFunction(crf, crf, compute_lincrf)$$
 <a name="l00516"></a>00516 LANGUAGE plpythonu VOLATILE;
 <a name="l00517"></a>00517 <span class="comment"></span>
 <a name="l00518"></a>00518 <span class="comment">/**</span>
 <a name="l00519"></a>00519 <span class="comment"> * @brief Compute linear-chain crf coefficients and diagnostic statistics</span>
 <a name="l00520"></a>00520 <span class="comment"> *</span>
 <a name="l00521"></a>00521 <span class="comment"> * @param source Name of the source relation containing the training data</span>
 <a name="l00522"></a>00522 <span class="comment"> * @param sparse_R Name of the sparse single state feature column (of type DOUBLE PRECISION[])</span>
 <a name="l00523"></a>00523 <span class="comment"> * @param dense_M Name of the dense two state feature column (of type DOUBLE PRECISION[])</span>
 <a name="l00524"></a>00524 <span class="comment"> * @param sparse_M Name of the sparse two state feature column (of type DOUBLE PRECISION[])</span>
 <a name="l00525"></a>00525 <span class="comment"> * @param featureSize Name of feature size column (of type DOUBLE PRECISION)</span>
 <a name="l00526"></a>00526 <span class="comment"> * @param tagSize The number of tags in the tag set</span>
 <a name="l00527"></a>00527 <span class="comment"> * @param featureset Name of the table containing the unique feature set</span>
 <a name="l00528"></a>00528 <span class="comment"> * @param crf_feature Name of the output feature table</span>
 <a name="l00529"></a>00529 <span class="comment"> * @param maxNumIterations The maximum number of iterations</span>
 <a name="l00530"></a>00530 <span class="comment"> *</span>
 <a name="l00531"></a>00531 <span class="comment"> * @return The number of iterations performed (INTEGER). The internal result (MADLIB_SCHEMA.lincrf_result) is a composite value:</span>
 <a name="l00532"></a>00532 <span class="comment"> * - &lt;tt&gt;coef FLOAT8[]&lt;/tt&gt; - Array of coefficients, \f$ \boldsymbol c \f$    </span>
 <a name="l00533"></a>00533 <span class="comment"> * - &lt;tt&gt;log_likelihood FLOAT8&lt;/tt&gt; - Log-likelihood \f$ l(\boldsymbol c) \f$</span>
 <a name="l00534"></a>00534 <span class="comment"> * - &lt;tt&gt;num_iterations INTEGER&lt;/tt&gt; - The number of iterations before the</span>
 <a name="l00535"></a>00535 <span class="comment"> *   algorithm terminated \n\n</span>
 <a name="l00536"></a>00536 <span class="comment"> * A &#39;crf_feature&#39; table is used to store all the features and corresponding weights</span>
 <a name="l00537"></a>00537 <span class="comment"> *</span>
 <a name="l00538"></a>00538 <span class="comment"> * @note This function starts an iterative algorithm. It is not an aggregate</span>
 <a name="l00539"></a>00539 <span class="comment"> * function. Source and column names have to be passed as strings (due to</span>
 <a name="l00540"></a>00540 <span class="comment"> * limitations of the SQL syntax).</span>
 <a name="l00541"></a>00541 <span class="comment"> *</span>
 <a name="l00542"></a>00542 <span class="comment"> * @internal</span>
 <a name="l00543"></a>00543 <span class="comment"> * @sa This function is a wrapper for crf::compute_lincrf(), which</span>
 <a name="l00544"></a>00544 <span class="comment"> * sets the default values.</span>
 <a name="l00545"></a>00545 <span class="comment"> */</span>
 <a name="l00546"></a>00546
 <a name="l00547"></a>00547 CREATE FUNCTION MADLIB_SCHEMA.lincrf(
 <a name="l00548"></a>00548     <span class="stringliteral">&quot;source&quot;</span> VARCHAR,
 <a name="l00549"></a>00549     <span class="stringliteral">&quot;sparse_R&quot;</span> VARCHAR,
 <a name="l00550"></a>00550     <span class="stringliteral">&quot;dense_M&quot;</span> VARCHAR,
 <a name="l00551"></a>00551     <span class="stringliteral">&quot;sparse_M&quot;</span> VARCHAR,
 <a name="l00552"></a>00552     <span class="stringliteral">&quot;featureSize&quot;</span> VARCHAR,
 <a name="l00553"></a>00553     <span class="stringliteral">&quot;tagSize&quot;</span> INTEGER,
 <a name="l00554"></a>00554     <span class="stringliteral">&quot;featureset&quot;</span> VARCHAR,
 <a name="l00555"></a>00555     <span class="stringliteral">&quot;crf_feature&quot;</span> VARCHAR,
 <a name="l00556"></a>00556     <span class="stringliteral">&quot;maxNumIterations&quot;</span> INTEGER <span class="comment">/*+ DEFAULT 20 */</span>)
 <a name="l00557"></a>00557 RETURNS INTEGER AS $$
 <a name="l00558"></a>00558 DECLARE
 <a name="l00559"></a>00559     theIteration INTEGER;
 <a name="l00560"></a>00560 BEGIN
 <a name="l00561"></a>00561     theIteration := (
 <a name="l00562"></a>00562         SELECT MADLIB_SCHEMA.compute_lincrf($1, $2, $3, $4, $5, $6, $9)
 <a name="l00563"></a>00563     );
 <a name="l00564"></a>00564     -- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (<span class="keyword">using</span>
 <a name="l00565"></a>00565     -- EXECUTE) in the following
 <a name="l00566"></a>00566     -- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
 <a name="l00567"></a>00567     -- function in a subquery
 <a name="l00568"></a>00568     EXECUTE
 <a name="l00569"></a><a class="code" href="crf_8sql__in.html#afb77a0c0a2cfacdfff33fb826ff1c0cd">00569</a>         $sql$
 <a name="l00570"></a>00570         INSERT INTO $sql$ || $8 || $sql$
 <a name="l00571"></a>00571         SELECT f_index, f_name, feature[1], feature[2], (result).coef[f_index+1]
 <a name="l00572"></a>00572         FROM (
 <a name="l00573"></a>00573               SELECT MADLIB_SCHEMA.internal_lincrf_lbfgs_result(_madlib_state) AS result
 <a name="l00574"></a>00574               FROM   _madlib_iterative_alg
 <a name="l00575"></a>00575               WHERE  _madlib_iteration = $sql$ || theIteration || $sql$
 <a name="l00576"></a>00576              ) subq, $sql$ || $7 || $sql$
 <a name="l00577"></a>00577         $sql$;
 <a name="l00578"></a>00578     RETURN theIteration;
 <a name="l00579"></a>00579 END;
 <a name="l00580"></a>00580 $$ LANGUAGE plpgsql VOLATILE;
 <a name="l00581"></a>00581
 <a name="l00582"></a>00582 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="crf_8sql__in.html#afb77a0c0a2cfacdfff33fb826ff1c0cd" title="Compute linear-chain crf coefficients and diagnostic statistics.">lincrf</a>(
 <a name="l00583"></a>00583     &quot;source&quot; VARCHAR,
 <a name="l00584"></a>00584     &quot;sparse_R&quot; VARCHAR,
 <a name="l00585"></a>00585     &quot;dense_M&quot; VARCHAR,
 <a name="l00586"></a>00586     &quot;sparse_M&quot; VARCHAR,
 <a name="l00587"></a>00587     &quot;featureSize&quot; VARCHAR,
 <a name="l00588"></a>00588     &quot;tagSize&quot; INTEGER,
 <a name="l00589"></a>00589     &quot;featureset&quot; VARCHAR,
 <a name="l00590"></a>00590     &quot;crf_feature&quot; VARCHAR)
 <a name="l00591"></a>00591 RETURNS INTEGER AS
 <a name="l00592"></a>00592 $$SELECT MADLIB_SCHEMA.<a class="code" href="crf_8sql__in.html#afb77a0c0a2cfacdfff33fb826ff1c0cd" title="Compute linear-chain crf coefficients and diagnostic statistics.">lincrf</a>($1, $2, $3, $4, $5, $6, $7, $8, 20);$$
 <a name="l00593"></a>00593 LANGUAGE sql VOLATILE;
 </pre></div></div>
 </div>
   <!-- Footer bar: breadcrumb for this page and the Doxygen generation stamp.
        The list holds only list items; the search popups follow the footer
        as siblings because a div is not a valid child of a ul. -->
   <div id="nav-path" class="navpath">
     <ul>
       <li class="navelem"><a class="el" href="crf_8sql__in.html">crf.sql_in</a>      </li>
     <li class="footer">Generated on Tue Apr 2 2013 14:57:03 for MADlib by
     <a href="http://www.doxygen.org/index.html">
     <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
    </ul>
  </div>

 <!-- window showing the filter options -->
 <!-- The anchors and inline handlers below are left exactly as generated:
      they call methods on the searchBox object created earlier in the page. -->
 <div id="MSearchSelectWindow"
      onmouseover="return searchBox.OnSearchSelectShow()"
      onmouseout="return searchBox.OnSearchSelectHide()"
      onkeydown="return searchBox.OnSearchSelectKey(event)">
 <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>

 <!-- iframe showing the search results (closed by default) -->
 <!-- The title gives the frame an accessible name for assistive technology. -->
 <div id="MSearchResultsWindow">
 <iframe src="javascript:void(0)" frameborder="0"
         name="MSearchResults" id="MSearchResults" title="Search results">
 </iframe>
 </div>


 </body>
 </html>