| /* ----------------------------------------------------------------------- *//** | 
 |  * | 
 |  * @file crf.sql_in | 
 |  * | 
 |  * @brief SQL functions for conditional random field | 
 |  * @date July 2012 | 
 |  * | 
 |  * @sa For a brief introduction to conditional random field, see the | 
 |  *     module description \ref grp_crf. | 
 |  * | 
 |  *//* ----------------------------------------------------------------------- */ | 
 |  | 
 | m4_include(`SQLCommon.m4') | 
 |  | 
 | /** | 
 | @addtogroup grp_crf | 
 |  | 
 | <div class="toc"><b>Contents</b> | 
 | <ul> | 
 | <li><a href="#train_feature">Training Feature Generation</a></li> | 
 | <li><a href="#train">CRF Training Function</a></li> | 
 | <li><a href="#test_feature">Testing Feature Generation</a></li> | 
 | <li><a href="#inference">Inference using Viterbi</a></li> | 
 | <li><a href="#usage">Using CRF</a></li> | 
 | <li><a href="#examples">Examples</a></li> | 
 | <li><a href="#background">Technical Background</a></li> | 
 | <li><a href="#literature">Literature</a></li> | 
 | <li><a href="#related">Related Topics</a></li> | 
 | </ul> | 
 | </div> | 
 |  | 
 | @brief Constructs a Conditional Random Fields (CRF) model for labeling sequential data. | 
 |  | 
 | A conditional random field (CRF) is a type of discriminative, undirected | 
 | probabilistic graphical model. A linear-chain CRF is a special type of CRF | 
 | that assumes the current state depends only on the previous state. | 
 |  | 
 | Feature extraction modules are provided for text-analysis tasks such as part-of-speech | 
 | (POS) tagging and named-entity resolution (NER).  Currently, six | 
 | feature types are implemented: | 
 |  | 
 | - Edge Feature: transition feature that encodes the transition feature | 
 | weight from current label to next label. | 
 | - Start Feature: fired when the current token is the first token in a sequence. | 
 | - End Feature: fired when the current token is the last token in a sequence. | 
 | - Word Feature: fired when the current token is observed in the trained | 
 | dictionary. | 
 | - Unknown Feature: fired when the current token is not observed in the trained | 
 | dictionary for at least a certain number of times (default 1). | 
 | - Regex Feature: fired when the current token can be matched by a regular | 
 | expression. | 
 |  | 
 | A Viterbi implementation is also provided | 
 | to get the best label sequence and the conditional probability | 
 | \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$. | 
 |  | 
 | Following steps are required for CRF Learning and Inference: | 
 | -# <li><a href="#train_feature">Training Feature Generation</a></li> | 
 | -# <li><a href="#train">CRF Training</a></li> | 
 | -# <li><a href="#test_feature">Testing Feature Generation</a></li> | 
 | -# <li><a href="#inference">Inference using Viterbi</a></li> | 
 |  | 
 | @anchor train_feature | 
 | @par Training Feature Generation | 
 | The function takes \c train_segment_tbl and \c regex_tbl as input and does feature generation generating three tables | 
 | \c dictionary_tbl, \c train_feature_tbl and \c train_featureset_tbl, that are required as an input for CRF training. | 
 | <pre class="syntax"> | 
 | crf_train_fgen(train_segment_tbl, | 
 |                regex_tbl, | 
 |                label_tbl, | 
 |                dictionary_tbl, | 
 |                train_feature_tbl, | 
 |                train_featureset_tbl) | 
 | </pre> | 
 | \b Arguments | 
 | <dl class="arglist"> | 
 |     <dt>train_segment_tbl</dt> | 
 |     <dd>TEXT. Name of the training segment table. The table is expected to have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>doc_id</th> | 
 |                 <td>INTEGER. Document id column</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>start_pos</th> | 
 |                 <td>INTEGER. Index of a particular term in the respective document</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>seg_text</th> | 
 |                 <td>TEXT. Term at the respective \c start_pos in the document</td> | 
 |               </tr> | 
 |             <tr> | 
 |                 <th>label</th> | 
 |                 <td>INTEGER. Label id for the term corresponding to the actual label from \c label_tbl</td> | 
 |             </tr> | 
 |         </table> | 
 |     </dd> | 
 |     <dt>regex_tbl</dt> | 
 |     <dd>TEXT. Name of the regular expression table. The table is expected to have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>pattern</th> | 
 |                 <td>TEXT. Regular Expression</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>name</th> | 
 |                 <td>TEXT. Regular Expression name</td> | 
 |             </tr> | 
 |          </table> | 
 |      </dd> | 
 |     <dt>label_tbl</dt> | 
 |     <dd>TEXT. Name of the table containing unique labels and their id's. The table is expected to have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>id</th> | 
 |                 <td>INTEGER. Unique label id. NOTE: Must range from 0 to total number of labels in the table - 1. | 
 |                 </td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>label</th> | 
 |                 <td>TEXT. Label name</td> | 
 |             </tr> | 
 |         </table> | 
 |     </dd> | 
 |     <dt>dictionary_tbl</dt> | 
 |     <dd>TEXT. Name of the dictionary table to be created containing unique terms along with their counts. The table will have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>token</th> | 
 |                 <td>TEXT. Contains all the unique terms found in \c train_segment_tbl</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>total</th> | 
 |                 <td>INTEGER. Respective counts for the terms</td> | 
 |             </tr> | 
 |         </table> | 
 |     <dd> | 
 |     <dt>train_feature_tbl<dt> | 
 |     <dd>TEXT. Name of the training feature table to be created. The table will have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>doc_id</th> | 
 |                 <td>INTEGER. Document id</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>f_size</th> | 
 |                 <td>INTEGER. Feature set size. This value will be same for all the tuples in the table</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>sparse_r</th> | 
 |                 <td>DOUBLE PRECISION[]. Array union of individual single state features (previous label, label, feature index, start position, training existance indicator), ordered by their start position.</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>dense_m</th> | 
 |                 <td>DOUBLE PRECISION[]. Array union of (previous label, label, feature index, start position, training existance indicator) of edge features ordered by start position.</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>sparse_m</th> | 
 |                 <td>DOUBLE PRECISION[]. Array union of (feature index, previous label, label) of edge features ordered by feature index.</td> | 
 |              </tr> | 
 |           </table> | 
 |     </dd> | 
 |  | 
 |     <dt>train_featureset_tbl</dt> | 
 |     <dd>TEXT. Name of the table to be created containing distinct featuresets generated from training feature extraction. The table will have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>f_index</th> | 
 |                 <td>INTEGER. Column containing distinct featureset ids</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>f_name</th> | 
 |                 <td>TEXT. Feature name </td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>feature</th> | 
 |                 <td>ARRAY. Feature value. The value is of the form [L1, L2] | 
 |                            \n - If L1 = -1: represents single state feature with L2 being the current label id. | 
 |                            \n - If L1 != -1: represents transition feature with L1 be the previous label and L2 be the current label. | 
 |                 </td> | 
 |             </tr> | 
 |     </dd> | 
 | </dl> | 
 |  | 
 | @anchor train | 
 | @par Linear Chain CRF Training Function | 
 | The function takes \c train_feature_tbl and \c train_featureset_tbl tables generated in the training feature generation steps as input | 
 | along with other required parameters and produces two output tables \c crf_stats_tbl and \c crf_weights_tbl. | 
 |  | 
 | <pre class="syntax"> | 
 | lincrf_train(train_feature_tbl, | 
 |              train_featureset_tbl, | 
 |              label_tbl, | 
 |              crf_stats_tbl, | 
 |              crf_weights_tbl | 
 |              max_iterations | 
 |             ) | 
 | </pre> | 
 | \b Arguments | 
 | <dl class="arglist"> | 
 |     <dt>train_feature_tbl</dt> | 
 |     <dd>TEXT. Name of the feature table generated during training feature generation</dd> | 
 |  | 
 |     <dt>train_featureset_tbl</dt> | 
 |     <dd>TEXT. Name of the featureset table generated during training feature generation</dd> | 
 |  | 
 |     <dt>label_tbl</dt> | 
 |     <dd>TEXT. Name of the label table used</dd> | 
 |  | 
 |     <dt>crf_stats_table</dt> | 
 |     <dd>TEXT. Name of the table to be created containing statistics for CRF training. The table has the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>coef</th> | 
 |                 <td>DOUBLE PRECISION[]. Array of coefficients</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>log_likelihood</th> | 
 |                 <td>DOUBLE. Log-likelihood</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>num_iterations</th> | 
 |                 <td>INTEGER. The number of iterations at which the algorithm terminated</td> | 
 |             </tr> | 
 |         </table> | 
 |     </dd> | 
 |     <dt>crf_weights_table</dt> | 
 |     <dd>TEXT. Name of the table to be created creating learned feature weights. The table has the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>id</th> | 
 |                 <td>INTEGER. Feature set id</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>name</th> | 
 |                 <td>TEXT. Feature name</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>prev_label_id</th> | 
 |                 <td>INTEGER. Label for the previous token encountered</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>label_id</th> | 
 |                 <td>INTEGER. Label of the token with the respective feature</td> | 
 |             </tr> | 
 |             <tr> | 
 |                 <th>weight</th> | 
 |                 <td>DOUBLE PRECISION. Weight for the respective feature set</td> | 
 |             </tr> | 
 |         </table> | 
 |     </dd> | 
 |  | 
 |     <dt>max_iterations</dt> | 
 |     <dd>INTEGER. The maximum number of iterations</dd> | 
 | </dl> | 
 |  | 
 | @anchor test_feature | 
 | @par Testing Feature Generation | 
 |  | 
 | <pre class="syntax"> | 
 | crf_test_fgen(test_segment_tbl, | 
 |               dictionary_tbl, | 
 |               label_tbl, | 
 |               regex_tbl, | 
 |               crf_weights_tbl, | 
 |               viterbi_mtbl, | 
 |               viterbi_rtbl | 
 |              ) | 
 | </pre> | 
 | \b Arguments | 
 | <dl class="arglist"> | 
 |     <dt>test_segment_tbl</dt> | 
 |     <dd>TEXT. Name of the testing segment table. The table is expected to have the following columns: | 
 |         <table class="output"> | 
 |             <tr> | 
 |                 <th>doc_id</th> | 
 |                 <td>INTEGER. Document id column</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>start_pos</th> | 
 |                 <td>INTEGER. Index of a particular term in the respective document</td> | 
 |              </tr> | 
 |              <tr> | 
 |                 <th>seg_text</th> | 
 |                 <td>TEXT. Term at the respective \c start_pos in the document</td> | 
 |               </tr> | 
 |         </table> | 
 |     </dd> | 
 |  | 
 |     <dt>dictionary_tbl</dt> | 
 |     <dd>TEXT. Name of the dictionary table created during training feature generation (\c crf_train_fgen)</dd> | 
 |  | 
 |     <dt>label_tbl</dt> | 
 |     <dd>TEXT. Name of the label table</dd> | 
 |  | 
 |     <dt>regex_tbl</dt> | 
 |     <dd>TEXT. Name of the regular expression table</dd> | 
 |  | 
 |     <dt>crf_weights_tbl</dt> | 
 |     <dd>TEXT. Name of the weights table generated during CRF training (\c lincrf_train) | 
 |  | 
 |     <dt>viterbi_mtbl</dt> | 
 |     <dd>TEXT. Name of the Viterbi M table to be created</dd> | 
 |  | 
 |     <dt>viterbi_rtbl</dt> | 
 |     <dd>TEXT. Name of the Viterbi R table to be created</dd> | 
 | </dl> | 
 |  | 
 | @anchor inference | 
 | @par Inference using Viterbi | 
 | <pre class="syntax"> | 
 | vcrf_label(test_segment_tbl, | 
 |            viterbi_mtbl, | 
 |            viterbi_rtbl, | 
 |            label_tbl, | 
 |            result_tbl) | 
 | </pre> | 
 | \b Arguments | 
 | <dl class="arglist"> | 
 |     <dt>test_segment_tbl</dt> | 
 |     <dd>TEXT. Name of the testing segment table. For required table schema, please refer to arguments in previous section</dd> | 
 |     <dt>viterbi_mtbl</dt> | 
 |     <dd>TEXT. Name of the table \c viterbi_mtbl generated from testing feature generation \c crf_test_fgen. | 
 |     <dt>viterbi_rtbl</dt> | 
 |     <dd>TEXT. Name of the table \c viterbi_rtbl generated from testing feature generation \c crf_test_fgen. | 
 |     <dt>label_tbl</dt> | 
 |     <dd>TEXT. Name of the label table.</dd> | 
 |     <dt>result_tbl</dt> | 
 |     <dd>TEXT. Name of the result table to be created containing extracted best label sequences.</dd> | 
 | </dl> | 
 |  | 
 | @anchor usage | 
 | @par Using CRF | 
 |  | 
 | Generate text features, calculate their weights, and output the best label sequence for test data:\n | 
 |  | 
 |  -# Perform feature generation on training data i.e. \c train_segment_tbl generating \c train_feature_tbl and \c train_featureset_tbl. | 
 |     <pre>SELECT madlib.crf_train_fgen( | 
 |          '<em>train_segment_tbl</em>', | 
 |          '<em>regex_tbl</em>', | 
 |          '<em>label_tbl</em>', | 
 |          '<em>dictionary_tbl</em>', | 
 |          '<em>train_feature_tbl</em>', | 
 |          '<em>train_featureset_tbl</em>');</pre> | 
 |  -# Use linear-chain CRF for training providing \c train_feature_tbl and \c train_featureset_tbl generated from previous step as an input. | 
 |     <pre>SELECT madlib.lincrf_train( | 
 |          '<em>train_feature_tbl</em>', | 
 |          '<em>train_featureset_tbl</em>', | 
 |          '<em>label_tbl</em>', | 
 |          '<em>crf_stats_tbl</em>', | 
 |          '<em>crf_weights_tbl</em>', | 
 |          <em>max_iterations</em>);</pre> | 
 |  -# Perform feature generation on testing data \c test_segment_tbl generating \c viterbi_mtbl and \c viterbi_rtbl required for inferencing. | 
 |     <pre>SELECT madlib.crf_test_fgen( | 
 |          '<em>test_segment_tbl</em>', | 
 |          '<em>dictionary_tbl</em>', | 
 |          '<em>label_tbl</em>', | 
 |          '<em>regex_tbl</em>', | 
 |          '<em>crf_weights_tbl</em>', | 
 |          '<em>viterbi_mtbl</em>', | 
 |          '<em>viterbi_rtbl</em>');</pre> | 
 |  -# Run the Viterbi function to get the best label sequence and the conditional | 
 |     probability \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$. | 
 |     <pre>SELECT madlib.vcrf_label( | 
 |          '<em>test_segment_tbl</em>', | 
 |          '<em>viterbi_mtbl</em>', | 
 |          '<em>viterbi_rtbl</em>', | 
 |          '<em>label_tbl</em>', | 
 |          '<em>result_tbl</em>');</pre> | 
 |  | 
 | @anchor examples | 
 | @examp | 
 | This example uses a trivial training and test data set. | 
 |  | 
 | -# Load the label table, the regular expressions table, and the training segment table: | 
 | <pre class="example"> | 
 | CREATE TABLE crf_label (id integer,label character varying); | 
 | INSERT INTO crf_label VALUES | 
 | (0,'CC'),   (1,'CD'),  (2,'DT'),    (3,'EX'),   (4,'FW'),  (5,'IN'),   (6,'JJ'),  (7,'JJR'), (8,'JJS'), | 
 | (9,'LS'),   (10,'MD'), (11,'NN'),   (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'), | 
 | (18,'PRP$'),(19,'RB'), (20,'RBR'),  (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'), | 
 | (27,'VBD'), (28,'VBG'),(29,'VBN'),  (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'), | 
 | (36,'$'),   (37,'#'),  (38,''''''), (39,'``'),  (40,'('),  (41,')'),   (42,','),  (43,'.'),  (44,':'); | 
 | CREATE TABLE crf_regex (pattern text,name text); | 
 | INSERT INTO crf_regex VALUES | 
 | ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'), | 
 | ('^.+[.]$','endsWithDot'),       ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'), | 
 | ('^.+est$','endsWithEst'),       ('^.+ed$','endsWithED'),     ('^.+s$','endsWithS'), | 
 | ('^.+ing$','endsWithIng'),       ('^.+ly$','endsWithly'),     ('^.+-.+$','isDashSeparatedWords'), | 
 | ('^.*@.*$','isEmailId'); | 
 | CREATE TABLE train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer); | 
 | INSERT INTO train_segmenttbl VALUES | 
 | (0,1,'confidence',11,36),  (1,1,'in',5,36),         (2,1,'the',2,36),         (3,1,'pound',11,36), | 
 | (4,1,'is',31,36),          (5,1,'widely',19,36),    (6,1,'expected',29,36),   (7,1,'to',24,36), | 
 | (8,1,'take',26,36),        (9,1,'another',2,36),    (10,1,'sharp',6,36),      (11,1,'dive',11,36), | 
 | (12,1,'if',5,36),          (13,1,'trade',11,36),    (14,1,'figures',12,36),   (15,1,'for',5,36), | 
 | (16,1,'september',13,36),  (17,1,',',42,36),        (18,1,'due',6,36),        (19,1,'for',5,36), | 
 | (20,1,'release',11,36),    (21,1,'tomorrow',11,36), (22,1,',',42,36),         (23,1,'fail',26,36), | 
 | (24,1,'to',24,36),         (25,1,'show',26,36),     (26,1,'a',2,36),          (27,1,'substantial',6,36), | 
 | (28,1,'improvement',11,36),(29,1,'from',5,36),      (30,1,'july',13,36),      (31,1,'and',0,36), | 
 | (32,1,'august',13,36),     (33,1,'''s',16,36),      (34,1,'near-record',6,36),(35,1,'deficits',12,36), | 
 | (36,1,'.',43,36),          (0,2,'chancellor',13,26),(1,2,'of',5,26),          (2,2,'the',2,26), | 
 | (3,2,'exchequer',13,26),   (4,2,'nigel',13,26),     (5,2,'lawson',13,26),     (6,2,'''s',16,26), | 
 | (7,2,'restated',29,26),    (8,2,'commitment',11,26),(9,2,'to',24,26),         (10,2,'a',2,26), | 
 | (11,2,'firm',11,26),       (12,2,'monetary',6,26),  (13,2,'policy',11,26),    (14,2,'has',31,26), | 
 | (15,2,'helped',29,26),     (16,2,'to',24,26),       (17,2,'prevent',26,26),   (18,2,'a',2,26), | 
 | (19,2,'freefall',11,26),   (20,2,'in',5,26),        (21,2,'sterling',11,26),  (22,2,'over',5,26), | 
 | (23,2,'the',2,26),         (24,2,'past',6,26),      (25,2,'week',11,26),      (26,2,'.',43,26); | 
 | </pre> | 
 | -# Generate the training features: | 
 | <pre class="example"> | 
 | SELECT crf_train_fgen( 'train_segmenttbl', | 
 |                        'crf_regex', | 
 |                        'crf_label', | 
 |                        'crf_dictionary', | 
 |                        'train_featuretbl', | 
 |                        'train_featureset' | 
 |                      ); | 
 | SELECT * from crf_dictionary; | 
 | </pre> | 
 | Result: | 
 | <pre class="result"> | 
 |     token    | total | 
 |  ------------+------- | 
 |  a           |     3 | 
 |  and         |     1 | 
 |  august      |     1 | 
 |  chancellor  |     1 | 
 |  dive        |     1 | 
 |  exchequer   |     1 | 
 | ... | 
 | </pre> | 
 | <pre class="example"> | 
 | SELECT * from train_featuretbl; | 
 | </pre> | 
 | Result: | 
 | <pre class="result"> | 
 |  doc_id | f_size |            sparse_r          |              dense_m             |       sparse_m | 
 |  -------+--------+------------------------------+----------------------------------+----------------------- | 
 |       1 |    115 | {-1,11,82,0,1,-1,2,32,0,...} | {11,5,11,1,1,5,2,8,2,1,2,11,...} | {5,2,13,11,11,5,13,...} | 
 |       2 |    115 | {-1,19,35,0,0,-1,26,38,0,..} | {13,5,66,1,1,5,2,8,2,1,2,13,...} | {5,2,13,11,11,5,13,...} | 
 | </pre> | 
 | <pre class="example"> | 
 | SELECT * from train_featureset; | 
 | </pre> | 
 | <pre class="result"> | 
 |  f_index |    f_name     | feature | 
 |  --------+---------------+--------- | 
 |        6 | W_the         | {-1,2} | 
 |        9 | R_endsWithly  | {-1,19} | 
 |       14 | W_figures     | {-1,12} | 
 |       17 | W_helped      | {-1,29} | 
 |       25 | W_show        | {-1,26} | 
 |       28 | W_'s          | {-1,16} | 
 |       33 | W_chancellor  | {-1,13} | 
 |       43 | W_over        | {-1,5} | 
 |       52 | W_trade       | {-1,11} | 
 |       10 | W_july        | {-1,13} | 
 |       21 | W_substantial | {-1,6} | 
 |        5 | E.            | {2,13} | 
 | ... | 
 | </pre> | 
 |  | 
 | -# Train using linear CRF: | 
 | <pre class="example"> | 
 | SELECT lincrf_train( 'train_featuretbl', | 
 |                      'train_featureset', | 
 |                      'crf_label', | 
 |                      'crf_stats_tbl', | 
 |                      'crf_weights_tbl', | 
 |                      20 | 
 |              ); | 
 | </pre> | 
 | <pre class="result"> | 
 |                                 lincrf_train | 
 |  ----------------------------------------------------------------------------------- | 
 |  CRF Train successful. Results stored in the specified CRF stats and weights table | 
 |  lincrf | 
 | </pre> | 
 | View the feature weight table. | 
 | <pre class="example"> | 
 | SELECT * from crf_weights_tbl; | 
 | </pre> | 
 | Result: | 
 | <pre class="result"> | 
 |   id |     name      | prev_label_id | label_id |      weight | 
 |  ----+---------------+---------------+----------+------------------- | 
 |    4 | W_lawson      |            -1 |       13 |  1.73698153439171 | 
 |    3 | End.          |            -1 |       43 |   3.3198742329636 | 
 |    7 | W_has         |            -1 |       31 |  2.19831004450897 | 
 |   24 | W_tomorrow    |            -1 |       11 |  3.34106414300743 | 
 |   29 | W_.           |            -1 |       43 |   3.3198742329636 | 
 |   34 | W_from        |            -1 |        5 |  2.80284597986744 | 
 |   37 | W_august      |            -1 |       13 |  1.34455487966976 | 
 |   39 | W_due         |            -1 |        6 |  3.39258895715363 | 
 |   41 | W_exchequer   |            -1 |       13 |  1.82177698489335 | 
 | ... | 
 | </pre> | 
 |  | 
 | -# To find the best labels for a test set using the trained linear CRF model, repeat steps #1-2 and generate the test features, except instead of creating a new dictionary, use the dictionary generated from the training set. | 
 | <pre class="example"> | 
 | CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer); | 
 | INSERT INTO test_segmenttbl VALUES | 
 | (0,1,'chancellor',26),(1,1,'of',26),       (2,1,'the',26),      (3,1,'exchequer',26), (4,1,'nigel',26), | 
 | (5,1,'lawson',26),    (6,1,'''s',26),      (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26), | 
 | (10,1,'a',26),        (11,1,'firm',26),    (12,1,'monetary',26),(13,1,'policy',26),   (14,1,'has',26), | 
 | (15,1,'helped',26),   (16,1,'to',26),      (17,1,'prevent',26), (18,1,'a',26),        (19,1,'freefall',26), | 
 | (20,1,'in',26),       (21,1,'sterling',26),(22,1,'over',26),    (23,1,'the',26),      (24,1,'past',26), | 
 | (25,1,'week',26),     (26,1,'.',26),       (0,2,'but',28),      (1,2,'analysts',28),  (2,2,'reckon',28), | 
 | (3,2,'underlying',28),(4,2,'support',28),  (5,2,'for',28),      (6,2,'sterling',28),  (7,2,'has',28), | 
 | (8,2,'been',28),      (9,2,'eroded',28),   (10,2,'by',28),      (11,2,'the',28),      (12,2,'chancellor',28), | 
 | (13,2,'''s',28),      (14,2,'failure',28), (15,2,'to',28),      (16,2,'announce',28), (17,2,'any',28), | 
 | (18,2,'new',28),      (19,2,'policy',28),  (20,2,'measures',28),(21,2,'in',28),       (22,2,'his',28), | 
 | (23,2,'mansion',28),  (24,2,'house',28),   (25,2,'speech',28),  (26,2,'last',28),     (27,2,'thursday',28), | 
 | (28,2,'.',28),        (0,3,'his',4),       (1,3,'actions',4),   (2,3,'prevent',4),    (3,3,'disaster',4), | 
 | (4,3,'.',4); | 
 | </pre> | 
 | <pre class="example"> | 
 | SELECT crf_test_fgen( 'test_segmenttbl', | 
 |                       'crf_dictionary', | 
 |                       'crf_label', | 
 |                       'crf_regex', | 
 |                       'crf_weights_tbl', | 
 |                       'viterbi_mtbl', | 
 |                       'viterbi_rtbl' | 
 |                     ); | 
 | </pre> | 
 | -# Calculate the best label sequence and save in the table \c extracted_best_labels. | 
 | <pre class="example"> | 
 | SELECT vcrf_label( 'test_segmenttbl', | 
 |                    'viterbi_mtbl', | 
 |                    'viterbi_rtbl', | 
 |                    'crf_label', | 
 |                    'extracted_best_labels' | 
 |                  ); | 
 | </pre> | 
 | View the best labels. | 
 | <pre class="example"> | 
 | SELECT * FROM extracted_best_labels; | 
 | </pre> | 
 | Result: | 
 | <pre class="result"> | 
 |  doc_id | start_pos |   seg_text    | label | id | max_pos |   prob | 
 |  -------+-----------+---------------+-------+----+---------+---------- | 
 |       1 |         4 | nigel         | NNP   | 13 |      26 | 0.387118 | 
 |       1 |         5 | lawson        | NNP   | 13 |      26 | 0.387118 | 
 |       1 |         7 | restated      | VBN   | 29 |      26 | 0.387118 | 
 |       1 |         8 | commitment    | NN    | 11 |      26 | 0.387118 | 
 | ... | 
 |       3 |         0 | his           | NNP   | 13 |       4 | 0.047757 | 
 |       3 |         2 | prevent       | JJ    |  6 |       4 | 0.047757 | 
 |       3 |         4 | .             | .     | 43 |       4 | 0.047757 | 
 | ... | 
 | </pre> | 
 |  | 
 | @anchor background | 
 | @par Technical Background | 
 |  | 
 | Specifically, a linear-chain CRF is a distribution defined by | 
 | \f[ | 
 |     p_\lambda(\boldsymbol y | \boldsymbol x) = | 
 |         \frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)} | 
 |     \,. | 
 | \f] | 
 |  | 
 | where | 
 | - \f$ F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \f$ is a global feature function that is a sum along a sequence | 
 |   \f$ \boldsymbol x \f$ of length \f$ n \f$ | 
 | - \f$ f_m(y_i,y_{i-1},x_i) \f$ is a local feature function dependent on the current token label \f$ y_i \f$, the previous token label \f$ y_{i-1} \f$, | 
 |   and the observation \f$ x_i \f$ | 
 | - \f$ \lambda_m \f$ is the corresponding feature weight | 
 | - \f$ Z_\lambda(\boldsymbol x) \f$ is an instance-specific normalizer | 
 | \f[ | 
 | Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')} | 
 | \f] | 
 |  | 
 | A linear-chain CRF estimates the weights \f$ \lambda_m \f$ by maximizing the log-likelihood | 
 | of a given training set \f$ T=\{(x_k,y_k)\}_{k=1}^N \f$. | 
 |  | 
 | The log-likelihood is defined as | 
 | \f[ | 
 |     \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] | 
 | \f] | 
 |  | 
 | and the zero of its gradient | 
 | \f[ | 
 |     \nabla \ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] | 
 | \f] | 
 |  | 
 | is found since the maximum likelihood is reached when the empirical average of the global feature vector equals its model expectation.  The MADlib implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method for unconstrained optimization. | 
 |  | 
 | \f$E_{p_\lambda(Y|x)}[F(x,Y)]\f$ is found by using a variant of the forward-backward algorithm: | 
 | \f[ | 
 |     E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y) | 
 |                             = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)} | 
 | \f] | 
 | \f[ | 
 |     Z_\lambda(x) = \alpha_n.1^T | 
 | \f] | 
 |     where \f$\alpha_i\f$  and \f$ \beta_i\f$ are the forward and backward state cost vectors defined by | 
 | \f[ | 
 |     \alpha_i = | 
 |     \begin{cases} | 
 |     \alpha_{i-1}M_i, & 0<i<=n\\ | 
 |     1, & i=0 | 
 |     \end{cases}\\ | 
 | \f] | 
 | \f[ | 
 |     \beta_i^T = | 
 |     \begin{cases} | 
 |     M_{i+1}\beta_{i+1}^T, & 1<=i<n\\ | 
 |     1, & i=n | 
 |     \end{cases} | 
 | \f] | 
 |  | 
 | To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior: | 
 | \f[ | 
 |     \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2} | 
 | \f] | 
 |  | 
 | \f[ | 
 |     \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2} | 
 | \f] | 
 |  | 
 |  | 
 |  | 
 | @literature | 
 | [1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf | 
 |  | 
 | [2] Wikipedia, Conditional Random Field, http://en.wikipedia.org/wiki/Conditional_random_field | 
 |  | 
 | [3] A. Jaiswal, S.Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, http://crf.sourceforge.net/ | 
 |  | 
 | [4] D. Wang, ViterbiCRF, http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html | 
 |  | 
 | [5] Wikipedia, Viterbi Algorithm, http://en.wikipedia.org/wiki/Viterbi_algorithm | 
 |  | 
 | [6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782 | 
 |  | 
 | [7] J. Nocedal, Software for Large-scale Unconstrained Optimization, http://users.eecs.northwestern.edu/~nocedal/lbfgs.html | 
 |  | 
 | @anchor related | 
 | @par Related Topics | 
 |  | 
 | File crf.sql_in crf_feature_gen.sql_in viterbi.sql_in (documenting the SQL functions) | 
 |  | 
 | */ | 
 |  | 
 | DROP TYPE IF EXISTS MADLIB_SCHEMA.lincrf_result CASCADE; | 
 | CREATE TYPE MADLIB_SCHEMA.lincrf_result AS ( | 
 |     coef DOUBLE PRECISION[], | 
 |     log_likelihood DOUBLE PRECISION, | 
 |     num_iterations INTEGER | 
 | ); | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_transition( | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION, | 
 |     DOUBLE PRECISION, | 
 |     DOUBLE PRECISION[]) | 
 | RETURNS DOUBLE PRECISION[] | 
 | AS 'MODULE_PATHNAME' | 
 | LANGUAGE C IMMUTABLE; | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states( | 
 |     state1 DOUBLE PRECISION[], | 
 |     state2 DOUBLE PRECISION[]) | 
 | RETURNS DOUBLE PRECISION[] | 
 | AS 'MODULE_PATHNAME' | 
 | LANGUAGE C IMMUTABLE STRICT; | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_final( | 
 |     state DOUBLE PRECISION[]) | 
 | RETURNS DOUBLE PRECISION[] | 
 | AS 'MODULE_PATHNAME' | 
 | LANGUAGE C IMMUTABLE STRICT; | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_converge( | 
 |     /*+ state */ DOUBLE PRECISION[]) | 
 | RETURNS DOUBLE PRECISION AS | 
 | 'MODULE_PATHNAME' | 
 | LANGUAGE c IMMUTABLE STRICT; | 
 |  | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_result( | 
 |     /*+ state */ DOUBLE PRECISION[]) | 
 | RETURNS MADLIB_SCHEMA.lincrf_result AS | 
 | 'MODULE_PATHNAME' | 
 | LANGUAGE c IMMUTABLE STRICT; | 
 |  | 
 | /** | 
 |  * @internal | 
 |  * @brief Perform one iteration of the L-BFGS method for computing | 
 |  * conditional random field | 
 |  */ | 
 | DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.lincrf_lbfgs_step( | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION[], | 
 |     DOUBLE PRECISION, | 
 |     DOUBLE PRECISION, | 
 |     DOUBLE PRECISION[] | 
 | ) CASCADE; | 
 | CREATE AGGREGATE MADLIB_SCHEMA.lincrf_lbfgs_step( | 
 |     /* sparse_r columns */ DOUBLE PRECISION[], | 
 |     /* dense_m columns */ DOUBLE PRECISION[], | 
 |     /* sparse_m columns */ DOUBLE PRECISION[], | 
 |     /* feature size */ DOUBLE PRECISION, | 
 |     /* tag size */ DOUBLE PRECISION, | 
 |     /* previous_state */ DOUBLE PRECISION[]) ( | 
 |  | 
 |     STYPE=DOUBLE PRECISION[], | 
 |     SFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_transition, | 
 |     m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states,') | 
 |     FINALFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_final, | 
 |     INITCOND='{0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}' | 
 | ); | 
 |  | 
 | m4_ifdef(`USE_COMPATIBLE_ARRAY', | 
 | `DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.array_union(anycompatiblearray) CASCADE;', | 
 | `DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.array_union(anyarray) CASCADE;') | 
 |  | 
 | CREATE m4_ifdef(`__POSTGRESQL__', `', | 
 |     m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) AGGREGATE | 
 | m4_ifdef(`USE_COMPATIBLE_ARRAY', | 
 | `MADLIB_SCHEMA.array_union(anycompatiblearray) (SFUNC = array_cat, STYPE = anycompatiblearray);', | 
 | `MADLIB_SCHEMA.array_union(anyarray) (SFUNC = array_cat, STYPE = anyarray);' | 
 | ) | 
 | -- We only need to document the last one (unfortunately, in Greenplum we have to | 
 | -- use function overloading instead of default arguments). | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.compute_lincrf( | 
 |     "source" TEXT, | 
 |     "sparse_R" TEXT, | 
 |     "dense_M" TEXT, | 
 |     "sparse_M" TEXT, | 
 |     "featureSize" TEXT, | 
 |     "tagSize" INTEGER, | 
 |     "maxNumIterations" INTEGER) | 
 | RETURNS INTEGER | 
 | AS $$PythonFunction(crf, crf, compute_lincrf)$$ | 
 | LANGUAGE plpython3u VOLATILE | 
 | m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); | 
 |  | 
 | /** | 
 |  * @brief Compute linear-chain crf coefficients and diagnostic statistics | 
 |  * | 
 |  * @param source Name of the source relation containing the training data | 
 |  * @param sparse_R Name of the sparse single state feature column (of type DOUBLE PRECISION[]) | 
 |  * @param dense_M Name of the dense two state feature column (of type DOUBLE PRECISION[]) | 
 |  * @param sparse_M Name of the sparse two state feature column (of type DOUBLE PRECISION[]) | 
 |  * @param featureSize Name of feature size column (of type DOUBLE PRECISION) | 
 |  * @param tagSize The number of tags in the tag set | 
 |  * @param featureset The unique feature set | 
 |  * @param crf_feature The Name of output feature table | 
 |  * @param maxNumIterations The maximum number of iterations | 
 |  * | 
 |  * @return a composite value: | 
 |  * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$ | 
 |  * - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$ | 
 |  * - <tt>num_iterations INTEGER</tt> - The number of iterations before the | 
 |  *   algorithm terminated \n\n | 
 |  * A 'crf_feature' table is used to store all the features and corresponding weights | 
 |  * | 
 |  * @note This function starts an iterative algorithm. It is not an aggregate | 
 |  * function. Source and column names have to be passed as strings (due to | 
 |  * limitations of the SQL syntax). | 
 |  * | 
 |  * @internal | 
 |  * @sa This function is a wrapper for crf::compute_lincrf(), which | 
 |  * sets the default values. | 
 |  */ | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_train( | 
 |      train_feature_tbl      TEXT, | 
 |      train_featureset_tbl   TEXT, | 
 |      label_tbl              TEXT, | 
 |      crf_stats_tbl          TEXT, | 
 |      crf_weights_tbl        TEXT, | 
 |      max_iterations         INTEGER /* DEFAULT 20 */ | 
 | ) RETURNS TEXT AS $$ | 
 | PythonFunction(crf, crf, lincrf_train) | 
 | $$ LANGUAGE plpython3u | 
 | m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); | 
 |  | 
 | CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_train( | 
 |      train_feature_tbl      TEXT, | 
 |      train_featureset_tbl   TEXT, | 
 |      label_tbl              TEXT, | 
 |      crf_stats_tbl          TEXT, | 
 |      crf_weights_tbl        TEXT | 
 | ) RETURNS TEXT AS | 
 | $$ | 
 | SELECT MADLIB_SCHEMA.lincrf_train($1, $2, $3, $4, $5, 20); | 
 | $$ LANGUAGE sql VOLATILE | 
 | m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |