/* ----------------------------------------------------------------------- *//**
*
* @file crf.sql_in
*
* @brief SQL functions for conditional random field
* @date July 2012
*
* @sa For a brief introduction to conditional random field, see the
* module description \ref grp_crf.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_crf
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#train_feature">Training Feature Generation</a></li>
<li><a href="#train">CRF Training Function</a></li>
<li><a href="#test_feature">Testing Feature Generation</a></li>
<li><a href="#inference">Inference using Viterbi</a></li>
<li><a href="#usage">Using CRF</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#background">Technical Background</a></li>
<li><a href="#literature">Literature</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Constructs a Conditional Random Field (CRF) model for labeling sequential data.
A conditional random field (CRF) is a type of discriminative, undirected
probabilistic graphical model. A linear-chain CRF is a special type of CRF
that assumes the current state depends only on the previous state.
Feature extraction modules are provided for text-analysis tasks such as part-of-speech
(POS) tagging and named-entity recognition (NER). Currently, six
feature types are implemented:
- Edge Feature: transition feature that encodes the weight of a transition
from the current label to the next label.
- Start Feature: fired when the current token is the first token in a sequence.
- End Feature: fired when the current token is the last token in a sequence.
- Word Feature: fired when the current token is observed in the trained
dictionary.
- Unknown Feature: fired when the current token has been observed in the trained
dictionary fewer than a certain number of times (default 1).
- Regex Feature: fired when the current token can be matched by a regular
expression.
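In the generated feature tables, these feature types appear as named features. For illustration, the names below are taken from the example output later on this page (the exact naming scheme is internal to the implementation):
<pre class="example">
E.            -- Edge feature
End.          -- End feature
W_the         -- Word feature for the token "the"
R_endsWithly  -- Regex feature for the pattern named "endsWithly"
</pre>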
A Viterbi implementation is also provided
to get the best label sequence and the conditional probability
\f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.
The following steps are required for CRF learning and inference:
-# <a href="#train_feature">Training Feature Generation</a>
-# <a href="#train">CRF Training</a>
-# <a href="#test_feature">Testing Feature Generation</a>
-# <a href="#inference">Inference using Viterbi</a>
@anchor train_feature
@par Training Feature Generation
The function takes \c train_segment_tbl and \c regex_tbl as input and performs feature generation, producing the three tables
\c dictionary_tbl, \c train_feature_tbl, and \c train_featureset_tbl that are required as input for CRF training.
<pre class="syntax">
crf_train_fgen(train_segment_tbl,
regex_tbl,
label_tbl,
dictionary_tbl,
train_feature_tbl,
train_featureset_tbl)
</pre>
\b Arguments
<dl class="arglist">
<dt>train_segment_tbl</dt>
<dd>TEXT. Name of the training segment table. The table is expected to have the following columns:
<table class="output">
<tr>
<th>doc_id</th>
<td>INTEGER. Document id column</td>
</tr>
<tr>
<th>start_pos</th>
<td>INTEGER. Index of a particular term in the respective document</td>
</tr>
<tr>
<th>seg_text</th>
<td>TEXT. Term at the respective \c start_pos in the document</td>
</tr>
<tr>
<th>label</th>
<td>INTEGER. Label id for the term corresponding to the actual label from \c label_tbl</td>
</tr>
</table>
</dd>
<dt>regex_tbl</dt>
<dd>TEXT. Name of the regular expression table. The table is expected to have the following columns:
<table class="output">
<tr>
<th>pattern</th>
<td>TEXT. Regular Expression</td>
</tr>
<tr>
<th>name</th>
<td>TEXT. Regular Expression name</td>
</tr>
</table>
</dd>
<dt>label_tbl</dt>
<dd>TEXT. Name of the table containing unique labels and their id's. The table is expected to have the following columns:
<table class="output">
<tr>
<th>id</th>
<td>INTEGER. Unique label id. NOTE: Must range from 0 to total number of labels in the table - 1.
</td>
</tr>
<tr>
<th>label</th>
<td>TEXT. Label name</td>
</tr>
</table>
</dd>
<dt>dictionary_tbl</dt>
<dd>TEXT. Name of the dictionary table to be created containing unique terms along with their counts. The table will have the following columns:
<table class="output">
<tr>
<th>token</th>
<td>TEXT. Contains all the unique terms found in \c train_segment_tbl</td>
</tr>
<tr>
<th>total</th>
<td>INTEGER. Respective counts for the terms</td>
</tr>
</table>
</dd>
<dt>train_feature_tbl</dt>
<dd>TEXT. Name of the training feature table to be created. The table will have the following columns:
<table class="output">
<tr>
<th>doc_id</th>
<td>INTEGER. Document id</td>
</tr>
<tr>
<th>f_size</th>
<td>INTEGER. Feature set size. This value will be the same for all tuples in the table</td>
</tr>
<tr>
<th>sparse_r</th>
<td>DOUBLE PRECISION[]. Array union of individual single state features (previous label, label, feature index, start position, training existence indicator), ordered by their start position.</td>
</tr>
<tr>
<th>dense_m</th>
<td>DOUBLE PRECISION[]. Array union of (previous label, label, feature index, start position, training existence indicator) of edge features ordered by start position.</td>
</tr>
<tr>
<th>sparse_m</th>
<td>DOUBLE PRECISION[]. Array union of (feature index, previous label, label) of edge features ordered by feature index.</td>
</tr>
</table>
</dd>
<dt>train_featureset_tbl</dt>
<dd>TEXT. Name of the table to be created containing distinct featuresets generated from training feature extraction. The table will have the following columns:
<table class="output">
<tr>
<th>f_index</th>
<td>INTEGER. Column containing distinct featureset ids</td>
</tr>
<tr>
<th>f_name</th>
<td>TEXT. Feature name </td>
</tr>
<tr>
<th>feature</th>
<td>ARRAY. Feature value. The value is of the form [L1, L2]
\n - If L1 = -1: represents a single state feature with L2 being the current label id.
\n - If L1 != -1: represents a transition feature with L1 being the previous label and L2 the current label.
</td>
</tr>
</table>
</dd>
</dl>
@anchor train
@par Linear Chain CRF Training Function
The function takes the \c train_feature_tbl and \c train_featureset_tbl tables generated in the training feature generation step as input,
along with other required parameters, and produces two output tables, \c crf_stats_tbl and \c crf_weights_tbl.
<pre class="syntax">
lincrf_train(train_feature_tbl,
train_featureset_tbl,
label_tbl,
crf_stats_tbl,
crf_weights_tbl,
max_iterations
)
</pre>
\b Arguments
<dl class="arglist">
<dt>train_feature_tbl</dt>
<dd>TEXT. Name of the feature table generated during training feature generation</dd>
<dt>train_featureset_tbl</dt>
<dd>TEXT. Name of the featureset table generated during training feature generation</dd>
<dt>label_tbl</dt>
<dd>TEXT. Name of the label table used</dd>
<dt>crf_stats_tbl</dt>
<dd>TEXT. Name of the table to be created containing statistics for CRF training. The table has the following columns:
<table class="output">
<tr>
<th>coef</th>
<td>DOUBLE PRECISION[]. Array of coefficients</td>
</tr>
<tr>
<th>log_likelihood</th>
<td>DOUBLE PRECISION. Log-likelihood</td>
</tr>
<tr>
<th>num_iterations</th>
<td>INTEGER. The number of iterations at which the algorithm terminated</td>
</tr>
</table>
</dd>
<dt>crf_weights_tbl</dt>
<dd>TEXT. Name of the table to be created containing the learned feature weights. The table has the following columns:
<table class="output">
<tr>
<th>id</th>
<td>INTEGER. Feature set id</td>
</tr>
<tr>
<th>name</th>
<td>TEXT. Feature name</td>
</tr>
<tr>
<th>prev_label_id</th>
<td>INTEGER. Label for the previous token encountered</td>
</tr>
<tr>
<th>label_id</th>
<td>INTEGER. Label of the token with the respective feature</td>
</tr>
<tr>
<th>weight</th>
<td>DOUBLE PRECISION. Weight for the respective feature set</td>
</tr>
</table>
</dd>
<dt>max_iterations</dt>
<dd>INTEGER. The maximum number of iterations</dd>
</dl>
@anchor test_feature
@par Testing Feature Generation
<pre class="syntax">
crf_test_fgen(test_segment_tbl,
dictionary_tbl,
label_tbl,
regex_tbl,
crf_weights_tbl,
viterbi_mtbl,
viterbi_rtbl
)
</pre>
\b Arguments
<dl class="arglist">
<dt>test_segment_tbl</dt>
<dd>TEXT. Name of the testing segment table. The table is expected to have the following columns:
<table class="output">
<tr>
<th>doc_id</th>
<td>INTEGER. Document id column</td>
</tr>
<tr>
<th>start_pos</th>
<td>INTEGER. Index of a particular term in the respective document</td>
</tr>
<tr>
<th>seg_text</th>
<td>TEXT. Term at the respective \c start_pos in the document</td>
</tr>
</table>
</dd>
<dt>dictionary_tbl</dt>
<dd>TEXT. Name of the dictionary table created during training feature generation (\c crf_train_fgen)</dd>
<dt>label_tbl</dt>
<dd>TEXT. Name of the label table</dd>
<dt>regex_tbl</dt>
<dd>TEXT. Name of the regular expression table</dd>
<dt>crf_weights_tbl</dt>
<dd>TEXT. Name of the weights table generated during CRF training (\c lincrf_train)</dd>
<dt>viterbi_mtbl</dt>
<dd>TEXT. Name of the Viterbi M table to be created</dd>
<dt>viterbi_rtbl</dt>
<dd>TEXT. Name of the Viterbi R table to be created</dd>
</dl>
@anchor inference
@par Inference using Viterbi
<pre class="syntax">
vcrf_label(test_segment_tbl,
viterbi_mtbl,
viterbi_rtbl,
label_tbl,
result_tbl)
</pre>
\b Arguments
<dl class="arglist">
<dt>test_segment_tbl</dt>
<dd>TEXT. Name of the testing segment table. For the required table schema, see the arguments in the previous section.</dd>
<dt>viterbi_mtbl</dt>
<dd>TEXT. Name of the table \c viterbi_mtbl generated from testing feature generation \c crf_test_fgen.</dd>
<dt>viterbi_rtbl</dt>
<dd>TEXT. Name of the table \c viterbi_rtbl generated from testing feature generation \c crf_test_fgen.</dd>
<dt>label_tbl</dt>
<dd>TEXT. Name of the label table.</dd>
<dt>result_tbl</dt>
<dd>TEXT. Name of the result table to be created containing extracted best label sequences.</dd>
</dl>
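For reference, Viterbi decoding finds the best label sequence by the standard dynamic-programming recursion (a textbook formulation in the notation of the technical background below, not a description of the internal \c viterbi_mtbl layout):
\f[
\delta_i(y) = \max_{y'} \left[ \delta_{i-1}(y') + \sum_{m=1}^M \lambda_m f_m(y, y', x_i) \right]
\f]
where \f$ \delta_i(y) \f$ is the score of the best label sequence ending in label \f$ y \f$ at position \f$ i \f$; the best sequence is recovered by backtracking from \f$ \arg\max_y \delta_n(y) \f$.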
@anchor usage
@par Using CRF
Generate text features, calculate their weights, and output the best label sequence for test data:\n
-# Perform feature generation on the training data (\c train_segment_tbl), generating \c train_feature_tbl and \c train_featureset_tbl.
<pre>SELECT madlib.crf_train_fgen(
'<em>train_segment_tbl</em>',
'<em>regex_tbl</em>',
'<em>label_tbl</em>',
'<em>dictionary_tbl</em>',
'<em>train_feature_tbl</em>',
'<em>train_featureset_tbl</em>');</pre>
-# Use linear-chain CRF for training, providing the \c train_feature_tbl and \c train_featureset_tbl generated in the previous step as input.
<pre>SELECT madlib.lincrf_train(
'<em>train_feature_tbl</em>',
'<em>train_featureset_tbl</em>',
'<em>label_tbl</em>',
'<em>crf_stats_tbl</em>',
'<em>crf_weights_tbl</em>',
<em>max_iterations</em>);</pre>
-# Perform feature generation on the testing data \c test_segment_tbl, generating \c viterbi_mtbl and \c viterbi_rtbl required for inference.
<pre>SELECT madlib.crf_test_fgen(
'<em>test_segment_tbl</em>',
'<em>dictionary_tbl</em>',
'<em>label_tbl</em>',
'<em>regex_tbl</em>',
'<em>crf_weights_tbl</em>',
'<em>viterbi_mtbl</em>',
'<em>viterbi_rtbl</em>');</pre>
-# Run the Viterbi function to get the best label sequence and the conditional
probability \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.
<pre>SELECT madlib.vcrf_label(
'<em>test_segment_tbl</em>',
'<em>viterbi_mtbl</em>',
'<em>viterbi_rtbl</em>',
'<em>label_tbl</em>',
'<em>result_tbl</em>');</pre>
@anchor examples
@examp
This example uses a trivial training and test data set.
-# Load the label table, the regular expressions table, and the training segment table:
<pre class="example">
CREATE TABLE crf_label (id integer,label character varying);
INSERT INTO crf_label VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
CREATE TABLE crf_regex (pattern text,name text);
INSERT INTO crf_regex VALUES
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),
('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'),
('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
('^.*@.*$','isEmailId');
CREATE TABLE train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);
INSERT INTO train_segmenttbl VALUES
(0,1,'confidence',11,36), (1,1,'in',5,36), (2,1,'the',2,36), (3,1,'pound',11,36),
(4,1,'is',31,36), (5,1,'widely',19,36), (6,1,'expected',29,36), (7,1,'to',24,36),
(8,1,'take',26,36), (9,1,'another',2,36), (10,1,'sharp',6,36), (11,1,'dive',11,36),
(12,1,'if',5,36), (13,1,'trade',11,36), (14,1,'figures',12,36), (15,1,'for',5,36),
(16,1,'september',13,36), (17,1,',',42,36), (18,1,'due',6,36), (19,1,'for',5,36),
(20,1,'release',11,36), (21,1,'tomorrow',11,36), (22,1,',',42,36), (23,1,'fail',26,36),
(24,1,'to',24,36), (25,1,'show',26,36), (26,1,'a',2,36), (27,1,'substantial',6,36),
(28,1,'improvement',11,36),(29,1,'from',5,36), (30,1,'july',13,36), (31,1,'and',0,36),
(32,1,'august',13,36), (33,1,'''s',16,36), (34,1,'near-record',6,36),(35,1,'deficits',12,36),
(36,1,'.',43,36), (0,2,'chancellor',13,26),(1,2,'of',5,26), (2,2,'the',2,26),
(3,2,'exchequer',13,26), (4,2,'nigel',13,26), (5,2,'lawson',13,26), (6,2,'''s',16,26),
(7,2,'restated',29,26), (8,2,'commitment',11,26),(9,2,'to',24,26), (10,2,'a',2,26),
(11,2,'firm',11,26), (12,2,'monetary',6,26), (13,2,'policy',11,26), (14,2,'has',31,26),
(15,2,'helped',29,26), (16,2,'to',24,26), (17,2,'prevent',26,26), (18,2,'a',2,26),
(19,2,'freefall',11,26), (20,2,'in',5,26), (21,2,'sterling',11,26), (22,2,'over',5,26),
(23,2,'the',2,26), (24,2,'past',6,26), (25,2,'week',11,26), (26,2,'.',43,26);
</pre>
-# Generate the training features:
<pre class="example">
SELECT crf_train_fgen( 'train_segmenttbl',
'crf_regex',
'crf_label',
'crf_dictionary',
'train_featuretbl',
'train_featureset'
);
SELECT * from crf_dictionary;
</pre>
Result:
<pre class="result">
token | total
&nbsp;------------+-------
a | 3
and | 1
august | 1
chancellor | 1
dive | 1
exchequer | 1
...
</pre>
<pre class="example">
SELECT * from train_featuretbl;
</pre>
Result:
<pre class="result">
doc_id | f_size | sparse_r | dense_m | sparse_m
&nbsp;-------+--------+------------------------------+----------------------------------+-----------------------
1 | 115 | {-1,11,82,0,1,-1,2,32,0,...} | {11,5,11,1,1,5,2,8,2,1,2,11,...} | {5,2,13,11,11,5,13,...}
2 | 115 | {-1,19,35,0,0,-1,26,38,0,..} | {13,5,66,1,1,5,2,8,2,1,2,13,...} | {5,2,13,11,11,5,13,...}
</pre>
<pre class="example">
SELECT * from train_featureset;
</pre>
<pre class="result">
f_index | f_name | feature
&nbsp;--------+---------------+---------
6 | W_the | {-1,2}
9 | R_endsWithly | {-1,19}
14 | W_figures | {-1,12}
17 | W_helped | {-1,29}
25 | W_show | {-1,26}
28 | W_'s | {-1,16}
33 | W_chancellor | {-1,13}
43 | W_over | {-1,5}
52 | W_trade | {-1,11}
10 | W_july | {-1,13}
21 | W_substantial | {-1,6}
5 | E. | {2,13}
...
</pre>
-# Train using linear CRF:
<pre class="example">
SELECT lincrf_train( 'train_featuretbl',
'train_featureset',
'crf_label',
'crf_stats_tbl',
'crf_weights_tbl',
20
);
</pre>
<pre class="result">
lincrf_train
&nbsp;-----------------------------------------------------------------------------------
CRF Train successful. Results stored in the specified CRF stats and weights table
lincrf
</pre>
View the feature weight table.
<pre class="example">
SELECT * from crf_weights_tbl;
</pre>
Result:
<pre class="result">
id | name | prev_label_id | label_id | weight
&nbsp;----+---------------+---------------+----------+-------------------
4 | W_lawson | -1 | 13 | 1.73698153439171
3 | End. | -1 | 43 | 3.3198742329636
7 | W_has | -1 | 31 | 2.19831004450897
24 | W_tomorrow | -1 | 11 | 3.34106414300743
29 | W_. | -1 | 43 | 3.3198742329636
34 | W_from | -1 | 5 | 2.80284597986744
37 | W_august | -1 | 13 | 1.34455487966976
39 | W_due | -1 | 6 | 3.39258895715363
41 | W_exchequer | -1 | 13 | 1.82177698489335
...
</pre>
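The training statistics table can be viewed in the same way. The query below is illustrative; it returns the final log-likelihood and the number of iterations at which the algorithm terminated.
<pre class="example">
SELECT log_likelihood, num_iterations FROM crf_stats_tbl;
</pre>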
-# To find the best labels for a test set using the trained linear CRF model, repeat steps 1 and 2 to generate the test features, except that instead of creating a new dictionary, reuse the dictionary generated from the training set.
<pre class="example">
CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
INSERT INTO test_segmenttbl VALUES
(0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26),
(5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26),
(10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26),
(15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26),
(20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26),
(25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28),
(3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28),
(8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28),
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
</pre>
<pre class="example">
SELECT crf_test_fgen( 'test_segmenttbl',
'crf_dictionary',
'crf_label',
'crf_regex',
'crf_weights_tbl',
'viterbi_mtbl',
'viterbi_rtbl'
);
</pre>
-# Calculate the best label sequence and save in the table \c extracted_best_labels.
<pre class="example">
SELECT vcrf_label( 'test_segmenttbl',
'viterbi_mtbl',
'viterbi_rtbl',
'crf_label',
'extracted_best_labels'
);
</pre>
View the best labels.
<pre class="example">
SELECT * FROM extracted_best_labels;
</pre>
Result:
<pre class="result">
doc_id | start_pos | seg_text | label | id | max_pos | prob
&nbsp;-------+-----------+---------------+-------+----+---------+----------
1 | 4 | nigel | NNP | 13 | 26 | 0.387118
1 | 5 | lawson | NNP | 13 | 26 | 0.387118
1 | 7 | restated | VBN | 29 | 26 | 0.387118
1 | 8 | commitment | NN | 11 | 26 | 0.387118
...
3 | 0 | his | NNP | 13 | 4 | 0.047757
3 | 2 | prevent | JJ | 6 | 4 | 0.047757
3 | 4 | . | . | 43 | 4 | 0.047757
...
</pre>
@anchor background
@par Technical Background
Specifically, a linear-chain CRF is a distribution defined by
\f[
p_\lambda(\boldsymbol y | \boldsymbol x) =
\frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)}
\,.
\f]
where
- \f$ F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \f$ is a global feature function that is a sum along a sequence
\f$ \boldsymbol x \f$ of length \f$ n \f$
- \f$ f_m(y_i,y_{i-1},x_i) \f$ is a local feature function dependent on the current token label \f$ y_i \f$, the previous token label \f$ y_{i-1} \f$,
and the observation \f$ x_i \f$
- \f$ \lambda_m \f$ is the corresponding feature weight
- \f$ Z_\lambda(\boldsymbol x) \f$ is an instance-specific normalizer
\f[
Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')}
\f]
A linear-chain CRF estimates the weights \f$ \lambda_m \f$ by maximizing the log-likelihood
of a given training set \f$ T=\{(x_k,y_k)\}_{k=1}^N \f$.
The log-likelihood is defined as
\f[
\ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)]
\f]
and the zero of its gradient
\f[
\nabla \ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]]
\f]
is found, since the maximum likelihood is reached when the empirical average of the global feature vector equals its model expectation. The MADlib implementation uses L-BFGS, a limited-memory variation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) quasi-Newton method for unconstrained optimization.
\f$E_{p_\lambda(Y|x)}[F(x,Y)]\f$ is found by using a variant of the forward-backward algorithm:
\f[
E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y)
= \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)}
\f]
\f[
Z_\lambda(x) = \alpha_n \cdot \boldsymbol 1^T
\f]
where \f$\alpha_i\f$ and \f$ \beta_i\f$ are the forward and backward state cost vectors defined by
\f[
\alpha_i =
\begin{cases}
\alpha_{i-1}M_i, & 0 < i \leq n\\
1, & i = 0
\end{cases}
\f]
\f[
\beta_i^T =
\begin{cases}
M_{i+1}\beta_{i+1}^T, & 1 \leq i < n\\
1, & i = n
\end{cases}
\f]
To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior:
\f[
\ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2}
\f]
\f[
\nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2}
\f]
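In terms of implementation, each optimization pass corresponds to one call of the \c lincrf_lbfgs_step aggregate (defined later in this file) over the training feature table. The sketch below is conceptual only and assumes the example tables above with a 45-label tag set; \c previous_state stands for the L-BFGS state array carried over from the prior iteration, which is supplied by the \c compute_lincrf driver rather than read from a table column:
<pre class="example">
SELECT madlib.lincrf_lbfgs_step(sparse_r, dense_m, sparse_m,
                                f_size::DOUBLE PRECISION,  -- feature set size
                                45::DOUBLE PRECISION,      -- number of labels
                                previous_state)            -- L-BFGS state from last pass
FROM train_featuretbl;
</pre>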
@literature
[1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf
[2] Wikipedia, Conditional Random Field, http://en.wikipedia.org/wiki/Conditional_random_field
[3] A. Jaiswal, S.Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, http://crf.sourceforge.net/
[4] D. Wang, ViterbiCRF, http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html
[5] Wikipedia, Viterbi Algorithm, http://en.wikipedia.org/wiki/Viterbi_algorithm
[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782
[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, http://users.eecs.northwestern.edu/~nocedal/lbfgs.html
@anchor related
@par Related Topics
File crf.sql_in, crf_feature_gen.sql_in, and viterbi.sql_in documenting the SQL functions.
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.lincrf_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.lincrf_result AS (
coef DOUBLE PRECISION[],
log_likelihood DOUBLE PRECISION,
num_iterations INTEGER
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_transition(
DOUBLE PRECISION[],
DOUBLE PRECISION[],
DOUBLE PRECISION[],
DOUBLE PRECISION[],
DOUBLE PRECISION,
DOUBLE PRECISION,
DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_final(
state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_converge(
/*+ state */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_result(
/*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.lincrf_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
/**
* @internal
* @brief Perform one iteration of the L-BFGS method for computing
* conditional random field
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.lincrf_lbfgs_step(
DOUBLE PRECISION[],
DOUBLE PRECISION[],
DOUBLE PRECISION[],
DOUBLE PRECISION,
DOUBLE PRECISION,
DOUBLE PRECISION[]
) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA.lincrf_lbfgs_step(
/* sparse_r columns */ DOUBLE PRECISION[],
/* dense_m columns */ DOUBLE PRECISION[],
/* sparse_m columns */ DOUBLE PRECISION[],
/* feature size */ DOUBLE PRECISION,
/* tag size */ DOUBLE PRECISION,
/* previous_state */ DOUBLE PRECISION[]) (
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_final,
INITCOND='{0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.array_union(anyarray) CASCADE;
CREATE m4_ifdef(`__POSTGRESQL__', `',
m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) AGGREGATE
MADLIB_SCHEMA.array_union(anyarray) (
SFUNC = array_cat,
STYPE = anyarray
);
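-- Illustrative example (shown as a comment so it is not executed at install
-- time): given rows each holding an array, array_union concatenates them, e.g.
--   SELECT MADLIB_SCHEMA.array_union(sparse_r) FROM train_featuretbl;
-- is equivalent to folding array_cat over all input arrays into one array.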
-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.compute_lincrf(
"source" TEXT,
"sparse_R" TEXT,
"dense_M" TEXT,
"sparse_M" TEXT,
"featureSize" TEXT,
"tagSize" INTEGER,
"maxNumIterations" INTEGER)
RETURNS INTEGER
AS $$PythonFunction(crf, crf, compute_lincrf)$$
LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Compute linear-chain crf coefficients and diagnostic statistics
*
* @param source Name of the source relation containing the training data
* @param sparse_R Name of the sparse single state feature column (of type DOUBLE PRECISION[])
* @param dense_M Name of the dense two state feature column (of type DOUBLE PRECISION[])
* @param sparse_M Name of the sparse two state feature column (of type DOUBLE PRECISION[])
* @param featureSize Name of feature size column (of type DOUBLE PRECISION)
* @param tagSize The number of tags in the tag set
* @param maxNumIterations The maximum number of iterations
*
* @return a composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
* - <tt>num_iterations INTEGER</tt> - The number of iterations before the
* algorithm terminated \n\n
* A 'crf_feature' table is used to store all the features and corresponding weights
*
* @note This function starts an iterative algorithm. It is not an aggregate
* function. Source and column names have to be passed as strings (due to
* limitations of the SQL syntax).
*
* @internal
* @sa This function is a wrapper for crf::compute_lincrf(), which
* sets the default values.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_train(
train_feature_tbl TEXT,
train_featureset_tbl TEXT,
label_tbl TEXT,
crf_stats_tbl TEXT,
crf_weights_tbl TEXT,
max_iterations INTEGER /* DEFAULT 20 */
) RETURNS TEXT AS $$
PythonFunction(crf, crf, lincrf_train)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_train(
train_feature_tbl TEXT,
train_featureset_tbl TEXT,
label_tbl TEXT,
crf_stats_tbl TEXT,
crf_weights_tbl TEXT
) RETURNS TEXT AS
$$
SELECT MADLIB_SCHEMA.lincrf_train($1, $2, $3, $4, $5, 20);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
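-- Illustrative usage of the five-argument overload above (table names taken
-- from the module documentation); omitting the last argument trains with the
-- default of 20 iterations:
--   SELECT MADLIB_SCHEMA.lincrf_train('train_featuretbl', 'train_featureset',
--                                     'crf_label', 'crf_stats_tbl', 'crf_weights_tbl');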