blob: f55037f7d02a9c878d7f96c37b04cd98cbdb64ea [file] [log] [blame]
"""
@file crf_feature_gen.py_in
@brief Conditional Random Field: Feature Extraction for Training and Testing.
@namespace crf
Conditional Random Field: Feature Extraction for Training and Testing.
"""
import plpy
from utilities.validate_args import table_exists
from utilities.validate_args import columns_exist_in_table
from utilities.utilities import _assert
from utilities.utilities import add_postfix
def generate_train_features(schema_madlib, train_segment_tbl, regex_tbl, label_tbl,
        dictionary_tbl, train_feature_tbl, train_featureset_tbl, **kwargs):
    """
    @brief Feature extraction for CRF training.

    Reads the labeled training segments and materializes three output tables:
      - dictionary_tbl:       distinct tokens with their occurrence counts
      - train_featureset_tbl: distinct (f_name, feature) pairs, each assigned a
                              sequential integer id (f_index)
      - train_feature_tbl:    one row per document carrying the f_size,
                              sparse_r, dense_m and sparse_m integer arrays
                              consumed by CRF training

    All intermediate work is done in session-local pg_temp tables.

    @param schema_madlib         Schema in which MADlib objects live
    @param train_segment_tbl     Input: (start_pos, doc_id, seg_text, label)
    @param regex_tbl             Input: regex features (pattern, name)
    @param label_tbl             Input: label space (id, label)
    @param dictionary_tbl        Output table name (must not already exist)
    @param train_feature_tbl     Output table name (must not already exist)
    @param train_featureset_tbl  Output table name (must not already exist)
    """
    _validate_train_args(train_segment_tbl, regex_tbl, label_tbl,
                         dictionary_tbl, train_feature_tbl, train_featureset_tbl)
    # Remember the current verbosity so it can be restored at the end;
    # the many DDL statements below would otherwise spam NOTICE messages.
    origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
    plpy.execute("SET client_min_messages TO warning;")
    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp();".format(schema_madlib=schema_madlib))
    # Session-local scratch tables.
    tmp1_feature = "pg_temp._madlib_tmp1_feature"
    tmp_rtbl = "pg_temp._madlib_tmp_rtbl"
    tmp_dense_mtbl = "pg_temp._madlib_tmp_dense_mtbl"
    dense_mtbl = "pg_temp._madlib_dense_mtbl"
    sparse_rtbl = "pg_temp._madlib_sparse_rtbl"
    sparse_mtbl = "pg_temp._madlib_sparse_mtbl"
    tmp_featureset = "pg_temp._madlib_tmp_featureset"
    tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
    tmp_segcount_tbl = "pg_temp._madlib_tmp_segcount_tbl"
    # Drop leftovers from a previous call in the same session (pg_temp tables
    # live for the whole session).
    # BUGFIX: tmp_segcount_tbl was missing from this list, so a second call in
    # the same session failed on its unconditional CREATE TABLE below.
    plpy.execute("""DROP TABLE IF EXISTS {tmp1_feature},
                        {tmp_rtbl},
                        {tmp_dense_mtbl},
                        {dense_mtbl},
                        {sparse_rtbl},
                        {sparse_mtbl},
                        {tmp_featureset},
                        {tmp_segmenttbl},
                        {tmp_segcount_tbl}
                 """.format(tmp1_feature = tmp1_feature,
                            tmp_rtbl = tmp_rtbl,
                            tmp_dense_mtbl = tmp_dense_mtbl,
                            dense_mtbl = dense_mtbl,
                            sparse_rtbl = sparse_rtbl,
                            sparse_mtbl = sparse_mtbl,
                            tmp_featureset = tmp_featureset,
                            tmp_segmenttbl = tmp_segmenttbl,
                            tmp_segcount_tbl = tmp_segcount_tbl))
    # The m4_ifdef(...) fragments inside the SQL are resolved by the m4
    # preprocessor at install time: plain PostgreSQL gets nothing, other
    # platforms get a DISTRIBUTED BY clause.
    plpy.execute("""CREATE TABLE """ + tmp1_feature + """(start_pos integer,doc_id integer, f_name text, feature integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + tmp_rtbl + """(start_pos integer,doc_id integer, feature integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + tmp_dense_mtbl + """(start_pos integer,doc_id integer, feature integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + dense_mtbl + """(doc_id integer, dense_m integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + sparse_rtbl + """(doc_id integer,f_size integer, sparse_r integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + sparse_mtbl + """(sparse_m integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (sparse_m)')""")
    plpy.execute("""CREATE TABLE """ + tmp_featureset + """(f_name text, feature integer[])
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (f_name)')""")
    plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,label int)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """ + tmp_segcount_tbl + """(doc_id int, doc_len int)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    # Normalize numeric tokens: copy non-numeric segments verbatim, and
    # replace anything matching an integer (with optional thousands commas)
    # or decimal pattern with the literal token 'DIGIT'.
    plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,label FROM """ + train_segment_tbl + """ WHERE
                    NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
    plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',label FROM """ + train_segment_tbl + """ WHERE
                    seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
    # Create the dictionary_tbl table containing distinct tokens and counts.
    plpy.execute("""
        CREATE TABLE {dictionary_tbl} AS
        SELECT seg_text token, count(*) total FROM {tmp_segmenttbl}
        GROUP BY seg_text
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (token)')
        """.format(dictionary_tbl = dictionary_tbl,
                   tmp_segmenttbl = tmp_segmenttbl))
    # Per-document length (as the last zero-based position, hence count - 1).
    plpy.execute("""
        DROP TABLE IF EXISTS {tmp_segcount_tbl};
        CREATE TABLE {tmp_segcount_tbl} AS
        SELECT doc_id, count(*) - 1 doc_len
        FROM {train_segment_tbl}
        GROUP BY doc_id
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')
        """.format(tmp_segcount_tbl = tmp_segcount_tbl,
                   train_segment_tbl = train_segment_tbl))
    # Populate tmp1_feature with every raw feature occurrence.
    # Edge features: label transition between consecutive positions.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label]
                        FROM """ + tmp_segmenttbl + """ doc1, """ + tmp_segmenttbl + """ doc2
                        WHERE doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""")
    # Regex features: token matches a pattern from the regex table.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label]
                        FROM """ + regex_tbl + """, """ + tmp_segmenttbl + """
                        WHERE seg_text ~ pattern;""")
    # Start feature: first token of a document.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT start_pos, doc_id, 'S.', ARRAY[-1, label]
                        FROM """ + tmp_segmenttbl + """
                        WHERE start_pos = 0;""")
    # End feature: last token of a document.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT start_pos, t.doc_id, 'End.', ARRAY[-1, label]
                        FROM """ + tmp_segmenttbl + """ t, """ + tmp_segcount_tbl + """ q
                        WHERE t.doc_id = q.doc_id AND t.start_pos = q.doc_len""")
    # Word feature: the token text itself.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT start_pos, doc_id, 'W_' || seg_text, ARRAY[-1, label]
                        FROM """ + tmp_segmenttbl + """;""")
    # Unknown feature: tokens occurring at most once in the dictionary.
    plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
                        SELECT start_pos, doc_id, 'U', ARRAY[-1, label]
                        FROM """ + tmp_segmenttbl + """ seg, """ + dictionary_tbl + """ dic
                        WHERE seg.seg_text = dic.token AND dic.total <= 1;""")
    plpy.execute("""INSERT INTO """ + tmp_featureset + """(f_name, feature)
                        SELECT DISTINCT f_name, feature
                        FROM """ + tmp1_feature + """;""")
    # Enforce ANALYZE to gather proper table statistics required to generate optimized query plans
    plpy.execute("""ANALYZE {tmp1_feature} """.format(tmp1_feature = tmp1_feature))
    # Sequence used to hand out consecutive integer ids to features.
    plpy.execute("""DROP SEQUENCE IF EXISTS seq;
                    CREATE SEQUENCE seq START 1 INCREMENT 1;""")
    # Get all distinct features, each tagged with a zero-based f_index.
    plpy.execute("""
        CREATE table {train_featureset_tbl} AS
        SELECT CAST(nextval('seq')-1 AS INTEGER) f_index, f_name, feature FROM {tmp_featureset}
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (f_index)')
        """.format(train_featureset_tbl = train_featureset_tbl,
                   tmp_featureset = tmp_featureset))
    # Enforce ANALYZE to gather proper table statistics required to generate optimized query plans
    plpy.execute(""" ANALYZE {train_featureset_tbl} """.format(train_featureset_tbl = train_featureset_tbl))
    rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM """ + train_featureset_tbl + """;""")
    # R factor rows: every non-edge feature paired with every position,
    # flagged 1 when the feature fired at that position and 0 otherwise.
    plpy.execute("""INSERT INTO """ + tmp_rtbl + """(start_pos,doc_id,feature)
                        SELECT start_pos, doc_id, array_cat(fset.feature,
                                                            ARRAY[f_index,start_pos,
                                                            CASE WHEN """ + tmp1_feature + """.feature = fset.feature THEN 1
                                                                 ELSE 0
                                                            END] )
                        FROM """ + tmp1_feature + """, """ + train_featureset_tbl + """ fset
                        WHERE """ + tmp1_feature + """.f_name = fset.f_name AND fset.f_name <> 'E.';""")
    # Collapse R rows into one sparse_r array per document, ordered by position.
    plpy.execute("""INSERT INTO {sparse_rtbl} (doc_id, f_size, sparse_r)
                        SELECT doc_id, {f_size}, {schema_madlib}.array_union(feature::integer[] order by start_pos)
                        FROM {tmp_rtbl}
                        GROUP BY doc_id;""".format(schema_madlib = schema_madlib,
                                                   sparse_rtbl = sparse_rtbl,
                                                   f_size = rv[0]['total_feature'],
                                                   tmp_rtbl = tmp_rtbl))
    # Dense M rows: fired edge features (start_pos > 0 excludes the start token).
    plpy.execute("""INSERT INTO """ + tmp_dense_mtbl + """(start_pos,doc_id,feature)
                        SELECT start_pos, doc_id,
                               array_cat(fset.feature, ARRAY[f_index,start_pos,1])
                        FROM """ + tmp1_feature + """, """ + train_featureset_tbl + """ fset
                        WHERE start_pos > 0 AND """ + tmp1_feature + """.f_name = fset.f_name AND """ + tmp1_feature + """.feature = fset.feature AND fset.f_name = 'E.';""")
    plpy.execute("""INSERT INTO {dense_mtbl} (doc_id, dense_m)
                        SELECT doc_id, {schema_madlib}.array_union(feature::integer[] order by start_pos)
                        FROM {tmp_dense_mtbl}
                        GROUP BY doc_id;""".format(schema_madlib = schema_madlib,
                                                   dense_mtbl = dense_mtbl,
                                                   tmp_dense_mtbl = tmp_dense_mtbl))
    # Sparse M: single row listing all edge features in the feature set.
    plpy.execute("""INSERT INTO {sparse_mtbl} (sparse_m)
                        SELECT {schema_madlib}.array_union(array_cat(ARRAY[f_index],feature))
                        FROM {train_featureset_tbl} fset
                        WHERE f_name = 'E.';""".format(schema_madlib = schema_madlib,
                                                       sparse_mtbl = sparse_mtbl,
                                                       train_featureset_tbl = train_featureset_tbl))
    # Final per-document feature table; sparse_mtbl holds a single row, so the
    # unconstrained join with it simply attaches sparse_m to every document.
    plpy.execute("""
        CREATE TABLE {train_feature_tbl} AS
        SELECT {sparse_rtbl}.doc_id, f_size, sparse_r, dense_m, sparse_m
        FROM {sparse_rtbl}, {dense_mtbl}, {sparse_mtbl}
        WHERE {sparse_rtbl}.doc_id = {dense_mtbl}.doc_id
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')
        """.format(train_feature_tbl = train_feature_tbl,
                   sparse_rtbl = sparse_rtbl,
                   sparse_mtbl = sparse_mtbl,
                   dense_mtbl = dense_mtbl))
    # Enforce ANALYZE to gather proper table statistics required to generate optimized query plans
    plpy.execute(""" ANALYZE {train_feature_tbl} """.format(train_feature_tbl = train_feature_tbl))
    # Restore the caller's original message verbosity.
    plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
def generate_test_features(schema_madlib, test_segment_tbl,
        dictionary_tbl, label_tbl, regex_tbl, crf_weights_tbl,
        viterbi_mtbl, viterbi_rtbl, **kwargs):
    """
    @brief Feature extraction for CRF testing (Viterbi inference).

    Builds the two factor tables consumed by the Viterbi algorithm:
      - viterbi_mtbl: M factor — transition scores, one flattened array of
        (sum of weights) * 1000 values ordered by (prev_label, label)
      - viterbi_rtbl: R factor — per-token emission scores, one row per
        (seg_text, label) with score = (sum of weights) * 1000

    @param schema_madlib    Schema in which MADlib objects live
    @param test_segment_tbl Input: (start_pos, doc_id, seg_text)
    @param dictionary_tbl   Input: tokens with counts produced during training
    @param label_tbl        Input: label space (id, label)
    @param regex_tbl        Input: regex features (pattern, name)
    @param crf_weights_tbl  Input: trained feature weights
                            (id, name, prev_label_id, label_id, weight)
    @param viterbi_mtbl     Output table name (must not already exist)
    @param viterbi_rtbl     Output table name (must not already exist)
    """
    _validate_test_args(test_segment_tbl, dictionary_tbl, label_tbl,
                        regex_tbl, crf_weights_tbl, viterbi_mtbl, viterbi_rtbl)
    # Create the M (transition) factor output table.
    # The m4_ifdef(...) fragments are resolved by the m4 preprocessor at
    # install time: plain PostgreSQL gets nothing, other platforms get a
    # DISTRIBUTED BY clause.
    plpy.execute("""
        CREATE TABLE {viterbi_mtbl} (score DOUBLE PRECISION[])
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (score)')
        """.format(viterbi_mtbl = viterbi_mtbl));
    # Create the R (emission) factor output table.
    plpy.execute("""
        CREATE TABLE {viterbi_rtbl}
        (seg_text text, label integer, score DOUBLE PRECISION)
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (label)')
        """.format(viterbi_rtbl = viterbi_rtbl))
    # Create an index on seg_text for lookup performance during inference.
    # Strip any schema qualifier: index names cannot be schema-qualified.
    _tablename = viterbi_rtbl.split('.')
    if len(_tablename) == 1:
        rtbl_name = _tablename[0]
    else:
        rtbl_name = _tablename[-1]
    rtbl_name_idx = add_postfix(rtbl_name, "_idx")
    plpy.execute("""
        CREATE INDEX {rtbl_name_idx} ON {viterbi_rtbl} (seg_text)
        """.format(rtbl_name_idx = rtbl_name_idx,
                   viterbi_rtbl = viterbi_rtbl))
    # Remember the current verbosity so it can be restored after the DDL below.
    origClientMinMessages = plpy.execute("""SELECT setting AS setting
                                            FROM pg_settings WHERE name = \'client_min_messages\';""")
    plpy.execute("SET client_min_messages TO warning;")
    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib=schema_madlib))
    # Session-local scratch tables.
    prev_labeltbl = "pg_temp._madlib_prev_labeltbl"
    segment_hashtbl = "pg_temp._madlib_segment_hashtbl"
    unknown_segment_hashtbl = "pg_temp._madlib_unknown_segment_hashtbl"
    rtbl = "pg_temp._madlib_rtbl"
    mtbl = "pg_temp._madlib_mtbl"
    tmp_segment_tbl = "pg_temp._madlib_tmp_segment_tbl"
    tmp_dict = "pg_temp._madlib_tmp_dict"
    # Drop leftovers from a previous call in the same session.
    plpy.execute("""DROP TABLE IF EXISTS {prev_labeltbl},
                        {segment_hashtbl},
                        {unknown_segment_hashtbl},
                        {rtbl},
                        {mtbl},
                        {tmp_segment_tbl},
                        {tmp_dict}
                 """.format(prev_labeltbl = prev_labeltbl,
                            segment_hashtbl = segment_hashtbl,
                            unknown_segment_hashtbl = unknown_segment_hashtbl,
                            rtbl = rtbl,
                            mtbl = mtbl,
                            tmp_segment_tbl = tmp_segment_tbl,
                            tmp_dict = tmp_dict))
    # Possible "previous label" values (label ids plus two sentinels added below).
    plpy.execute("""CREATE TABLE """ + prev_labeltbl + """(id int)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')""")
    # Hash table of the distinct tokens appearing in the test segments.
    plpy.execute("CREATE TABLE """ + segment_hashtbl + """(seg_text text)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (seg_text)')""")
    # Tokens from the test set that are unseen (or rare) in the training
    # dictionary; these receive the 'U' (unknown) feature weight.
    plpy.execute("""CREATE TABLE """ + unknown_segment_hashtbl + """(seg_text text)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (seg_text)')""")
    # Sparse accumulator for the R (emission) factor contributions.
    plpy.execute("""CREATE TABLE """ + rtbl + """ (seg_text text NOT NULL, label integer, value double precision)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (label)')""")
    # Sparse accumulator for the M (transition) factor contributions.
    plpy.execute("""CREATE TABLE """ + mtbl + """(prev_label integer, label integer, value double precision)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (label)')""")
    # Temp tables holding segments and dictionary with all numeric tokens
    # replaced by the word 'DIGIT' (mirrors the training-time normalization).
    plpy.execute("""CREATE TABLE """ + tmp_segment_tbl + """(start_pos int,doc_id int,seg_text text)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (doc_id)')""")
    plpy.execute("""CREATE TABLE """+ tmp_dict + """(token text, total int)
                    m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (token)')""")
    # DDL is done; restore the caller's original message verbosity.
    plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
    # Calculate the number of labels in the label space.
    rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM """ + label_tbl + """;""")
    nlabel = rv[0]['total_label']
    # Normalize numeric tokens: copy non-numeric segments verbatim, replace
    # integer/decimal-looking tokens with the literal 'DIGIT'.
    plpy.execute("""INSERT INTO """ + tmp_segment_tbl + """ SELECT start_pos,doc_id,seg_text FROM """ + test_segment_tbl + """ WHERE
                    NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$')""")
    plpy.execute("""INSERT INTO """ + tmp_segment_tbl + """ SELECT start_pos,doc_id,'DIGIT' FROM """ + test_segment_tbl + """ WHERE
                    seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
    # Same normalization for the dictionary tokens.
    # NOTE(review): the first INSERT filters with NOT LIKE against regex-style
    # patterns; LIKE does not interpret regex syntax, so the HAVING clause is
    # effectively a literal comparison — presumably intended to be ~ / !~.
    # Left as-is to preserve behavior; confirm against upstream.
    plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT token,sum(total) FROM """ + dictionary_tbl + """ GROUP BY token
                    HAVING (token NOT LIKE E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' AND token NOT LIKE E'^[-+]?[0-9]*[.][0-9]+$')""")
    plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT 'DIGIT',sum(total) FROM """ + dictionary_tbl + """ WHERE
                    (token ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR token ~ E'^[-+]?[0-9]*[.][0-9]+$') GROUP BY token;""")
    # Distinct tokens of the test set.
    plpy.execute("""INSERT INTO """ + segment_hashtbl + """(seg_text)
                        SELECT DISTINCT seg_text
                        FROM """ + tmp_segment_tbl + """;""")
    # Unknown tokens = test tokens minus dictionary tokens seen more than once.
    plpy.execute("""INSERT INTO """ + unknown_segment_hashtbl + """(seg_text)
                        ((SELECT DISTINCT seg_text
                          FROM """ + segment_hashtbl + """)
                         EXCEPT
                         (SELECT DISTINCT token
                          FROM """+ tmp_dict + """
                          WHERE total>1));""")
    # Previous-label domain: all label ids, plus -1 (start sentinel) and
    # nlabel (end sentinel).
    plpy.execute("""INSERT INTO """ + prev_labeltbl + """
                        SELECT id
                        FROM """ + label_tbl + """;
                    INSERT INTO """ + prev_labeltbl + """ VALUES(-1);
                    INSERT INTO """ + prev_labeltbl + """ VALUES( """ + str(nlabel) + """);""")
    # Generate sparse M factor table: seed every (prev_label, label) cell
    # with 0 so the later SUM covers the full matrix.
    plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
                        SELECT prev_label.id, label.id, 0
                        FROM """ + label_tbl + """ AS label,
                             """ + prev_labeltbl + """ as prev_label;""")
    # EdgeFeature and startFeature; startFeature can be considered a special
    # edgeFeature (prev_label_id = -1, name 'S.').
    plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
                        SELECT prev_label_id,label_id,weight
                        FROM """ + crf_weights_tbl + """ AS features
                        WHERE features.prev_label_id<>-1 OR features.name = 'S.';""")
    # EndFeature; endFeature can be considered a special edgeFeature whose
    # prev_label is the nlabel sentinel.
    plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
                        SELECT """ + str(nlabel) + """, label_id, weight
                        FROM """ + crf_weights_tbl + """ AS features
                        WHERE features.name = 'End.';""")
    # The m4_ifdef below is resolved at install time and selects exactly one
    # of the two plpy.execute branches: ordered array_agg when the platform
    # supports ordered aggregates, otherwise an ARRAY(subquery) fallback.
    # Either way the M factor is flattened into a single array of
    # (SUM(value) * 1000) scores ordered by (prev_label, label).
    m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
    plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
                        SELECT array_agg(weight ORDER BY prev_label,label)
                        FROM (SELECT prev_label, label, (SUM(value)*1000)::FLOAT8 AS weight
                              FROM """ + mtbl + """
                              GROUP BY prev_label,label
                              ORDER BY prev_label,label) as TEMP_MTBL;""".format(
                     viterbi_mtbl = viterbi_mtbl
                 ))
    ', `
    plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
                        SELECT ARRAY(
                            SELECT
                                (SUM(value) * 1000)::FLOAT8
                            FROM
                                """ + mtbl + """
                            GROUP BY
                                prev_label, label
                            ORDER BY
                                prev_label, label
                        );""".format(
                     viterbi_mtbl = viterbi_mtbl
                 ))
    ')
    # Seed the R factor with 0 for every (token, label) pair.
    plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
                        SELECT segment_hashtbl.seg_text, labels.id, 0
                        FROM """ + segment_hashtbl + """ segment_hashtbl,
                             """ + label_tbl + """ AS labels;""")
    # RegExFeature: weights of regex features whose pattern matches the token.
    plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight
                        FROM """ + segment_hashtbl + """ AS segment_hashtbl,
                             """ + crf_weights_tbl + """ AS features,
                             """ + regex_tbl + """ AS regex
                        WHERE segment_hashtbl.seg_text ~ regex.pattern
                              AND features.name ='R_' || regex.name;""")
    # UnknownFeature: 'U' weight applied to every unknown token.
    plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
                        SELECT segment_hashtbl.seg_text, features.label_id, features.weight
                        FROM """ + unknown_segment_hashtbl + """ AS segment_hashtbl,
                             """ + crf_weights_tbl + """ AS features
                        WHERE features.name = 'U';""")
    # Wordfeature: weight of the 'W_<token>' feature for known tokens.
    plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
                        SELECT seg_text, label_id, weight
                        FROM """ + segment_hashtbl + """,
                             """ + crf_weights_tbl + """
                        WHERE name = 'W_' || seg_text;""")
    # Final R factor: sum all contributions per (token, label), scaled by 1000.
    plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
                        SELECT seg_text,label,(SUM(value)*1000)::FLOAT8 AS score
                        FROM """ + rtbl + """
                        GROUP BY seg_text,label;""")
    # Enforce ANALYZE to gather proper table statistics required to generate optimized query plans
    plpy.execute(""" ANALYZE {viterbi_mtbl} """.format(viterbi_mtbl = viterbi_mtbl))
    plpy.execute(""" ANALYZE {viterbi_rtbl} """.format(viterbi_rtbl = viterbi_rtbl))
def _validate_label_tbl(label_tbl):
    """
    @brief Check that the label ids lie within [0, row_count - 1].
    """
    result = plpy.execute("""SELECT count(*), max(id), min(id) FROM {label_tbl}
                          """.format(label_tbl = label_tbl))
    row = result[0]
    in_bounds = row['min'] >= 0 and row['max'] <= row['count'] - 1
    _assert(in_bounds,
            "CRF error: Bound check failed for label table."
            " Expected id values between 0 to total number of elements in the table - 1")
def _validate_columns(cols, table_name, err_msg_tbl):
    """
    @brief Assert that every column in cols exists in table_name.
    """
    message = "CRF error: Missing required columns from %s table: %s" % (
        err_msg_tbl, ', '.join(cols))
    _assert(columns_exist_in_table(table_name, cols), message)
def _validate_train_args(train_segment_tbl, regex_tbl, label_tbl,
        dictionary_tbl, train_feature_tbl, train_featureset_tbl):
    """
    @brief Validate the arguments: Feature extraction for training.

    Checks that all input tables exist with the required columns, that the
    label ids are in range, and that the output table names are non-empty
    and not already taken.
    """
    # Validate existence of input tables.
    _assert(table_exists(train_segment_tbl),
            "CRF error: Train segment table does not exist!")
    _assert(table_exists(regex_tbl),
            "CRF error: Regex table does not exist!")
    _assert(table_exists(label_tbl),
            "CRF error: Label table does not exist!")
    # Validate required column names existence in respective tables.
    _validate_columns(['doc_id', 'start_pos', 'seg_text', 'label'],
                      train_segment_tbl, "segment")
    _validate_columns(['pattern', 'name'], regex_tbl, "regex")
    _validate_columns(['id', 'label'], label_tbl, "label")
    _validate_label_tbl(label_tbl)
    # Validate output tables for valid (non-empty, non-'null') names.
    _assert(dictionary_tbl is not None and
            dictionary_tbl.lower().strip() not in ('null', ''),
            "CRF error: Invalid dictionary table name")
    _assert(train_feature_tbl is not None and
            train_feature_tbl.lower().strip() not in ('null', ''),
            "CRF error: Invalid train feature table name")
    # BUGFIX: error message previously read "fatureset".
    _assert(train_featureset_tbl is not None and
            train_featureset_tbl.lower().strip() not in ('null', ''),
            "CRF error: Invalid train featureset table name")
    # Output tables must not exist yet; messages also fixed ("exist" -> "exists").
    _assert(not table_exists(dictionary_tbl),
            "CRF error: Dictionary table name already exists!"
            " Please provide a different table name.")
    _assert(not table_exists(train_feature_tbl),
            "CRF error: Train feature table name already exists!"
            " Please provide a different table name.")
    _assert(not table_exists(train_featureset_tbl),
            "CRF error: Train featureset table name already exists!"
            " Please provide a different table name.")
def _validate_test_args(test_segment_tbl, dictionary_tbl, label_tbl,
        regex_tbl, crf_weights_tbl, viterbi_mtbl, viterbi_rtbl):
    """
    @brief Validate the arguments: Feature extraction for testing.
    """
    # Input tables must already exist.
    existence_checks = (
        (test_segment_tbl, "CRF error: Test segment table does not exist!"),
        (dictionary_tbl, "CRF error: Dictionary table does not exist!"),
        (label_tbl, "CRF error: Label table does not exist!"),
        (regex_tbl, "CRF error: Regex table does not exist!"),
        (crf_weights_tbl, "CRF error: CRF weights table does not exist!"),
    )
    for tbl, err in existence_checks:
        _assert(table_exists(tbl), err)
    # Each input table must carry its required columns.
    column_checks = (
        (['doc_id', 'start_pos', 'seg_text'], test_segment_tbl, "segment"),
        (['token', 'total'], dictionary_tbl, "dictionary"),
        (['id', 'label'], label_tbl, "label"),
        (['pattern', 'name'], regex_tbl, "regex"),
        (['id', 'name', 'prev_label_id', 'label_id', 'weight'],
         crf_weights_tbl, "crf weights"),
    )
    for cols, tbl, desc in column_checks:
        _validate_columns(cols, tbl, desc)
    _validate_label_tbl(label_tbl)
    # Output table names must be non-empty and not 'null'.
    for name, err in ((viterbi_mtbl, "CRF error: Invalid viterbi mtable name"),
                      (viterbi_rtbl, "CRF error: Invalid viterbi rtable name")):
        _assert(name is not None and name.lower().strip() not in ('null', ''),
                err)
    # Output tables must not exist yet.
    _assert(not table_exists(viterbi_mtbl),
            "CRF error: Viterbi M table name already exist!"
            " Please provide a different table name.")
    _assert(not table_exists(viterbi_rtbl),
            "CRF error: Viterbi R table name already exist!"
            " Please provide a different table name.")